"""Output writers, LRC parsing, and metadata extraction for txtlyric-to-lrc. Writers: - :func:`result_to_lrc` -- emits classic LRC, inserting a clear-display marker whenever the gap between consecutive segments exceeds a threshold so that lines do not visually persist through instrumental pauses. - :func:`result_to_lyricsfile` -- emits the YAML-based Lyricsfile format used by lrcget/lrclib, with word-level timings. LRC input: - :func:`parse_lrc_file` -- parses an LRC file into metadata tags and timed segments (with ``start``/``end``/``text``) suitable for ``stable_whisper.align_words``. """ from __future__ import annotations import os import re import sys from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple import mutagen import stable_whisper import yaml song_file_extensions = { "mp3", "wav", "mp4", "mpeg", "mpga", "m4a", "webm", "flac", "opus", } def get_file_extension(filename: str) -> str: return os.path.splitext(filename)[1][1:] def file_is_audio(filename: str) -> bool: return get_file_extension(filename) in song_file_extensions # --------------------------------------------------------------------------- # Metadata # --------------------------------------------------------------------------- @dataclass class TrackMetadata: title: str artist: str album: Optional[str] = None duration_ms: Optional[int] = None language: Optional[str] = None offset_ms: Optional[int] = None def extract_metadata(audio_path: str, language: Optional[str] = None) -> TrackMetadata: """Read tags + duration from the audio file via mutagen. Falls back to the placeholder strings ``"Unknown"`` / ``"Unknown Artist"`` when the corresponding tag is missing, since Lyricsfile requires ``metadata.title`` and ``metadata.artist``. """ f = mutagen.File(audio_path, easy=True) return TrackMetadata( title=_first_tag(f, "title") or "Unknown", artist=_first_tag(f, "artist") or "Unknown Artist", album=_first_tag(f, "album"), duration_ms=_duration_ms(f), language=language, ) def _first_tag(f, key: str) -> Optional[str]: if f is None: return None val = f.get(key) if not val: return None if isinstance(val, list): val = val[0] if val else None if val is None: return None val = str(val).strip() return val or None def _duration_ms(f) -> Optional[int]: info = getattr(f, "info", None) if f is not None else None length = getattr(info, "length", None) if info is not None else None if length is None: return None return int(round(length * 1000)) # --------------------------------------------------------------------------- # LRC parsing # --------------------------------------------------------------------------- _LRC_METADATA_KEYS_TO_LYRICSFILE = { "ti": "title", "ar": "artist", "al": "album", "la": "language", "lang": "language", "language": "language", } _TIMESTAMP_RE = re.compile(r"^(\d{1,3}):(\d{2})(?:[.:](\d{2,3}))?$") _LINE_RE = re.compile(r"^\[([^\]]+)\](.*)$") @dataclass class ParsedLrc: """Result of parsing an LRC file.""" segments: List[dict] = field(default_factory=list) metadata_tags: Dict[str, str] = field(default_factory=dict) dropped_tags: Dict[str, str] = field(default_factory=dict) def _parse_lrc_timestamp(token: str) -> Optional[float]: """Parse an LRC timestamp token to seconds, or None if not a timestamp.""" m = _TIMESTAMP_RE.match(token.strip()) if not m: return None minutes = int(m.group(1)) seconds = int(m.group(2)) frac_raw = m.group(3) or "0" if len(frac_raw) == 2: frac_ms = int(frac_raw) * 10 else: frac_ms = int(frac_raw) return minutes * 60 + seconds + frac_ms / 1000.0 def _parse_lrc_length(value: str) -> Optional[int]: """Parse an LRC [length:...] value to milliseconds.""" value = value.strip() m = re.match(r"^(\d+):(\d{2})(?:[.:](\d{2,3}))?$", value) if not m: return None minutes = int(m.group(1)) seconds = int(m.group(2)) frac_raw = m.group(3) or "0" if len(frac_raw) == 2: frac_ms = int(frac_raw) * 10 else: frac_ms = int(frac_raw) return (minutes * 60 + seconds) * 1000 + frac_ms def _parse_lrc_offset(value: str) -> Optional[int]: """Parse an LRC [offset:...] value to milliseconds (integer, may be negative).""" value = value.strip() m = re.match(r"^([+-]?\d+)$", value) if not m: return None return int(m.group(1)) def parse_lrc_file(content: str) -> ParsedLrc: """Parse LRC content into timed segments and metadata. Returns a :class:`ParsedLrc` containing: - ``segments``: list of ``{"start": float, "end": float, "text": str}`` suitable for ``model.align_words()``. Gap markers (empty-text timestamps) contribute an ``end`` to the preceding segment but do not appear as segments themselves. - ``metadata_tags``: dict of recognised LRC metadata mapped to Lyricsfile-compatible keys. - ``dropped_tags``: dict of LRC metadata tags that have no Lyricsfile equivalent (warned about by the caller). If the LRC contains an ``[offset:...]`` tag, the offset is applied to all parsed timestamps so that the returned times are absolute. """ result = ParsedLrc() raw_timed: List[Tuple[float, str]] = [] for raw_line in content.splitlines(): raw_line = raw_line.strip() if not raw_line: continue m = _LINE_RE.match(raw_line) if not m: continue bracket_content = m.group(1) after_bracket = m.group(2) ts = _parse_lrc_timestamp(bracket_content) if ts is not None: raw_timed.append((ts, after_bracket.strip())) continue # Metadata tag: [key:value] if ":" in bracket_content: key, _, value = bracket_content.partition(":") key = key.strip().lower() value = value.strip() if not value: continue lyricsfile_key = _LRC_METADATA_KEYS_TO_LYRICSFILE.get(key) if lyricsfile_key is not None: result.metadata_tags[lyricsfile_key] = value elif key == "length": length_ms = _parse_lrc_length(value) if length_ms is not None: result.metadata_tags["duration_ms"] = str(length_ms) else: result.dropped_tags[key] = value elif key == "offset": offset_ms = _parse_lrc_offset(value) if offset_ms is not None: result.metadata_tags["offset_ms"] = str(offset_ms) else: result.dropped_tags[key] = value else: result.dropped_tags[key] = value if not raw_timed: return result raw_timed.sort(key=lambda t: t[0]) # Apply offset: shift all timestamps so output is absolute offset_s = 0.0 if "offset_ms" in result.metadata_tags: offset_s = int(result.metadata_tags["offset_ms"]) / 1000.0 adjusted: List[Tuple[float, str]] = [ (max(0.0, ts + offset_s), text) for ts, text in raw_timed ] # Build segments: non-empty text lines become segments; empty-text # lines (gap markers) contribute an end time to the preceding segment. pending_segments: List[dict] = [] for ts, text in adjusted: if text: pending_segments.append({"start": ts, "end": None, "text": text}) elif pending_segments: # Gap marker: set the preceding segment's end pending_segments[-1]["end"] = ts # Fill in missing end times: end of seg N = start of seg N+1 for i in range(len(pending_segments) - 1): if pending_segments[i]["end"] is None: pending_segments[i]["end"] = pending_segments[i + 1]["start"] # Last segment: if no explicit end (no trailing gap marker), use # start + 5s as a reasonable upper bound; align_words will confine # within whatever audio is available. if pending_segments and pending_segments[-1]["end"] is None: pending_segments[-1]["end"] = pending_segments[-1]["start"] + 5.0 result.segments = pending_segments return result def merge_metadata( audio_path: str, lrc_parsed: ParsedLrc, cli_language: Optional[str] = None, ) -> TrackMetadata: """Build a :class:`TrackMetadata` by merging LRC tags over mutagen tags. Precedence (highest first): LRC tag → mutagen tag → placeholder. The ``--language`` CLI flag overrides both LRC and mutagen for language. Warnings are printed to stderr for LRC metadata tags that have no Lyricsfile equivalent and are therefore dropped. """ audio_meta = extract_metadata(audio_path, language=cli_language) tags = lrc_parsed.metadata_tags title = tags.get("title") or audio_meta.title artist = tags.get("artist") or audio_meta.artist album = tags.get("album") or audio_meta.album duration_ms = audio_meta.duration_ms if "duration_ms" in tags: try: duration_ms = int(tags["duration_ms"]) except ValueError: pass language = cli_language or tags.get("language") or audio_meta.language for key, value in lrc_parsed.dropped_tags.items(): print( f" Warning: LRC tag [{key}:{value}] has no Lyricsfile equivalent; skipped", file=sys.stderr, ) return TrackMetadata( title=title, artist=artist, album=album, duration_ms=duration_ms, language=language, ) def _format_lrc_timestamp(seconds: float) -> str: if seconds < 0: seconds = 0.0 minutes = int(seconds // 60) remainder = seconds - minutes * 60 return f"{minutes:02d}:{remainder:05.2f}" def result_to_lrc( result: "stable_whisper.WhisperResult", gap_threshold: float = 1.5, ) -> str: """Render ``result`` as LRC text. Between consecutive segments, if ``next_segment.start - this_segment.end`` exceeds ``gap_threshold`` seconds, an empty timestamp is emitted at ``this_segment.end`` so consumers stop displaying the line during the pause. A trailing empty timestamp at the end of the last segment is always emitted. """ segments = list(result.segments) if not segments: return "" lines: List[str] = [] for i, seg in enumerate(segments): text = (seg.text or "").strip() lines.append(f"[{_format_lrc_timestamp(seg.start)}]{text}") next_seg = segments[i + 1] if i + 1 < len(segments) else None if next_seg is None: lines.append(f"[{_format_lrc_timestamp(seg.end)}]") elif next_seg.start - seg.end > gap_threshold: lines.append(f"[{_format_lrc_timestamp(seg.end)}]") return "\n".join(lines) + "\n" def result_to_lyricsfile( result: "stable_whisper.WhisperResult", metadata: TrackMetadata, ) -> str: """Render ``result`` as a Lyricsfile YAML string. The ``plain`` block is intentionally omitted; consumers receive only the synced ``lines`` array (with word-level timings when available). """ metadata_obj = { "title": metadata.title, "artist": metadata.artist, } if metadata.album: metadata_obj["album"] = metadata.album if metadata.duration_ms is not None: metadata_obj["duration_ms"] = metadata.duration_ms if metadata.language: metadata_obj["language"] = metadata.language if metadata.offset_ms is not None: metadata_obj["offset_ms"] = metadata.offset_ms metadata_obj["instrumental"] = False lines_out = [] for seg in result.segments: line_obj = { "text": (seg.text or "").strip(), "start_ms": _to_ms(seg.start), "end_ms": _to_ms(seg.end), } word_objs = _words_to_lyricsfile_words(seg.words) if seg.words else None if word_objs: line_obj["words"] = word_objs lines_out.append(line_obj) document = { "version": "1.0", "metadata": metadata_obj, "lines": lines_out, } return yaml.safe_dump( document, sort_keys=False, allow_unicode=True, default_flow_style=False, ) def _to_ms(seconds: float) -> int: return int(round(seconds * 1000)) def _words_to_lyricsfile_words(word_timings) -> List[dict]: """Convert stable-ts word objects to Lyricsfile word objects. Whisper's tokenization produces words with leading whitespace (e.g. ``" club"``). Lyricsfile expects trailing whitespace except on the final word. This re-attaches the leading space of word ``i+1`` onto the trailing edge of word ``i``. For CJK languages stable-ts splits without spaces, in which case no spacing is injected. """ bodies = [(w.word or "").lstrip() for w in word_timings] leading_spaces = [(w.word or "")[: len(w.word or "") - len((w.word or "").lstrip())] for w in word_timings] out: List[dict] = [] for i, w in enumerate(word_timings): text = bodies[i] if i + 1 < len(word_timings) and leading_spaces[i + 1]: text = text + " " out.append( { "text": text, "start_ms": _to_ms(w.start), "end_ms": _to_ms(w.end), } ) return out