add support for word-level timings in lyricfile format and lrc->lyricfile

2026-05-16 01:54:54 -07:00
parent 9161ab0b24
commit cc7928a532
6 changed files with 1121 additions and 72 deletions
@@ -2,79 +2,145 @@ import os

 import click
 import stable_whisper
-import srt
-import datetime

-song_file_extensions = set([
-    "mp3", # explicitly supported by whisper
-    "wav",
-    "mp4",
-    "mpeg",
-    "mpga",
-    "m4a",
-    "webm",
-    "flac", # stable-ts likely uses ffmpeg to convert this for whisper
-    "opus",
-])
+from output import (
+    extract_metadata,
+    file_is_audio,
+    merge_metadata,
+    parse_lrc_file,
+    result_to_lrc,
+    result_to_lyricsfile,
+)
+
+
+# Maps each accepted --format value to the extension (without the dot) that
+# main() appends to the audio file's base name to build the output path.
+OUTPUT_EXTENSIONS = {
+    "yaml": "yaml",
+    "lrc": "lrc",
+}
+

@click.command()
@click.help_option("--help", "-h")
-@click.option('--model', '-m', default='small', help='Which whisper model to use (choices are those of whisper.available_models)')
-@click.option('--language', '-l', required=True, help='What language the lyrics are in')
-@click.argument('directory')
-def main(model, language, directory):
+@click.option(
+    "--model",
+    "-m",
+    default="small",
+    help="Which whisper model to use (choices are those of whisper.available_models)",
+)
+@click.option(
+    "--language",
+    "-l",
+    required=True,
+    help="What language the lyrics are in",
+)
+@click.option(
+    "--format",
+    "-f",
+    "output_format",
+    type=click.Choice(["yaml", "lrc"], case_sensitive=False),
+    default="yaml",
+    show_default=True,
+    help="Output format. 'yaml' writes a Lyricsfile (.yaml) with word-level timings; 'lrc' writes classic LRC (.lrc).",
+)
+@click.argument("directory")
+def main(model, language, output_format, directory):
+    # Walks ``directory`` recursively and, for every audio file that has no
+    # output file yet, builds one from a same-named .lrc or .txt sidecar.
+    # (Documented with comments rather than a docstring because click would
+    # display a docstring as the command's --help text.)
+    output_format = output_format.lower()
+    output_ext = OUTPUT_EXTENSIONS[output_format]
+

    print(f"Loading {model} model for use with language {language}...")
    model = stable_whisper.load_model(model)

    for (dirpath, _, filenames) in os.walk(directory):
-        if any([ file_is_audio(f) for f in filenames ]):
-            # audio files exist here, so let's process this folder
-            for f in filenames:
-                if file_is_audio(f):
-                    filename = os.path.join(dirpath, f)
-                    # get corresponding txt file name 
-                    # (we expect unaligned files to have the same filename as the song, except for the extension)
-                    unaligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".txt")
-                    # (we also expect existing .lrc files to contain aligned lyrics)
-                    aligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".lrc")
+        if not any(file_is_audio(f) for f in filenames):
+            continue
+        for f in filenames:
+            if not file_is_audio(f):
+                continue
+            audio_path = os.path.join(dirpath, f)
+            base, _ = os.path.splitext(audio_path)
+            output_path = base + "." + output_ext

-                    # run model to get aligned srt
-                    if not os.path.exists(unaligned_lyrics_filename):
-                        print(f"No corresponding unaligned lyric txt exists for {filename}")
-                        continue
-                    if os.path.exists(aligned_lyrics_filename):
-                        print(f"Corresponding aligned lyric txt already exists for {filename}")
-                        continue
-                    print(f"Aligning lyrics for {filename}")
-                    
-                    result: stable_whisper.WhisperResult = model.align(filename, open(unaligned_lyrics_filename).read(), language=language, original_split=True, regroup=False)
+            if os.path.exists(output_path):
+                print(f"Aligned lyric file already exists for {audio_path}: {output_path}")
+                continue

-                    # turn srt to lrc
-                    srt = result.to_srt_vtt(filepath=None, word_level=False)
-                    lrc = srt_to_lrc(srt)
-                    print(f"Writing aligned lyrics at: {aligned_lyrics_filename}")
-                    open(aligned_lyrics_filename, "x").write(lrc)
-               
-def get_file_extension(filename: str) -> str:
-    # "asdf.omg.lol" -> [asdf.omg, .lol] -> lol
-    return os.path.splitext(filename)[1][1:]
+            lrc_path = base + ".lrc"
+            txt_path = base + ".txt"

-def file_is_audio(filename: str) -> bool:
-    return get_file_extension(filename) in song_file_extensions
+            # An existing .lrc sidecar is preferred over a .txt one. With
+            # --format lrc the output path equals lrc_path, so an existing
+            # .lrc was already skipped above and only the .txt branch (or the
+            # "no sidecar" message) can be reached.
+            if os.path.exists(lrc_path):
+                _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language)
+            elif os.path.exists(txt_path):
+                _align_from_txt(model, audio_path, txt_path, output_path, output_format, language)
+            else:
+                print(f"No .lrc or .txt sidecar found for {audio_path}")

-def timedelta_to_hhmmssss(td: datetime.timedelta) -> str:
-    dt = datetime.datetime(1969, 1, 1) + td
-    return dt.strftime('%M:%S.%f')[:-4]

-def srt_to_lrc(srt_text: str) -> str:
-    subs = list(srt.parse(srt_text))
-    lines = [f"[{timedelta_to_hhmmssss(s.start)}]{s.content}" for s in subs]
+def _align_from_txt(model, audio_path, txt_path, output_path, output_format, language):
+    """Align plain-text lyrics against the audio and write the output file.
+
+    The text of ``txt_path`` is passed to ``model.align`` with
+    ``original_split=True`` and ``regroup=False``. The result is rendered as
+    a Lyricsfile when ``output_format`` is ``"yaml"`` and as LRC otherwise,
+    then written to ``output_path``.
+
+    Nothing is written when alignment yields no segments. ``output_path`` is
+    opened in exclusive-create mode, so ``FileExistsError`` is raised if it
+    already exists.
+    """
+    print(f"Aligning lyrics for {audio_path} (from .txt)")
+    # Lyrics are frequently non-ASCII, so do not depend on the platform's
+    # locale encoding. "utf-8-sig" also drops a leading byte-order mark.
+    with open(txt_path, encoding="utf-8-sig") as fh:
+        unaligned_text = fh.read()
+
+    result = model.align(
+        audio_path,
+        unaligned_text,
+        language=language,
+        original_split=True,
+        regroup=False,
+        vad=True,
+    )
+
+    if not result or not list(result.segments):
+        print("  Alignment produced no segments; skipping")
+        return
+
+    result.adjust_gaps(one_section=True)
+
+    if output_format == "yaml":
+        metadata = extract_metadata(audio_path, language=language)
+        content = result_to_lyricsfile(result, metadata)
+    else:
+        content = result_to_lrc(result)
+
+    print(f"  Writing aligned lyrics at: {output_path}")
+    # Explicit UTF-8: the rendered text may contain any Unicode character and
+    # would otherwise fail to encode under a narrow locale code page.
+    with open(output_path, "x", encoding="utf-8") as out_fh:
+        out_fh.write(content)
+
+
+def _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language):
+    """Add word-level timings to an existing LRC and write the output file.
+
+    ``lrc_path`` is parsed with ``parse_lrc_file`` and its timed segments are
+    passed to ``model.align_words``. The result is rendered as a Lyricsfile
+    when ``output_format`` is ``"yaml"`` (metadata merged from the LRC tags
+    and the audio file) and as LRC otherwise, then written to
+    ``output_path``.
+
+    Nothing is written when the LRC has no timed lines or when word alignment
+    yields no segments. ``output_path`` is opened in exclusive-create mode,
+    so ``FileExistsError`` is raised if it already exists.
+    """
+    print(f"Refining lyrics for {audio_path} (from .lrc)")
+    # Do not depend on the platform's locale encoding. "utf-8-sig" also drops
+    # a leading byte-order mark, which would otherwise sit in front of the
+    # first "[" and hide the first tag from the parser.
+    with open(lrc_path, encoding="utf-8-sig") as fh:
+        lrc_content = fh.read()
+
+    parsed = parse_lrc_file(lrc_content)
+
+    if not parsed.segments:
+        print(f"  No timed lines found in {lrc_path}; skipping")
+        return
+
+    result = model.align_words(
+        audio_path,
+        parsed.segments,
+        language=language,
+        vad=True,
+        regroup=False,
+    )
+
+    if not result or not list(result.segments):
+        print("  Word alignment produced no segments; skipping")
+        return
+
+    if output_format == "yaml":
+        metadata = merge_metadata(audio_path, parsed, cli_language=language)
+        content = result_to_lyricsfile(result, metadata)
+    else:
+        content = result_to_lrc(result)
+
+    print(f"  Writing refined lyrics at: {output_path}")
+    # Explicit UTF-8: the rendered text may contain any Unicode character and
+    # would otherwise fail to encode under a narrow locale code page.
+    with open(output_path, "x", encoding="utf-8") as out_fh:
+        out_fh.write(content)

-    # add the end of lyrics marker
-    end_time = subs[-1].end
-    lines.append(f"[{timedelta_to_hhmmssss(end_time)}]")
-    return "\n".join(lines)    

 if __name__ == "__main__":
    main()
-
@@ -0,0 +1,431 @@
+"""Output writers, LRC parsing, and metadata extraction for txtlyric-to-lrc.
+
+Writers:
+
+- :func:`result_to_lrc` -- emits classic LRC, inserting a clear-display
+  marker whenever the gap between consecutive segments exceeds a threshold
+  so that lines do not visually persist through instrumental pauses.
+- :func:`result_to_lyricsfile` -- emits the YAML-based Lyricsfile format
+  used by lrcget/lrclib, with word-level timings.
+
+LRC input:
+
+- :func:`parse_lrc_file` -- parses an LRC file into metadata tags and
+  timed segments (with ``start``/``end``/``text``) suitable for
+  ``stable_whisper.align_words``.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import sys
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+
+import mutagen
+import stable_whisper
+import yaml
+
+
+# Extensions (lower-case, without the dot) of files treated as songs.
+song_file_extensions = {
+    "mp3",
+    "wav",
+    "mp4",
+    "mpeg",
+    "mpga",
+    "m4a",
+    "webm",
+    "flac",
+    "opus",
+}
+
+
+def get_file_extension(filename: str) -> str:
+    """Return the last extension of ``filename`` without its leading dot.
+
+    ``"asdf.omg.lol"`` gives ``"lol"``; a name without an extension gives
+    ``""``. The original letter case is preserved.
+    """
+    return os.path.splitext(filename)[1][1:]
+
+
+def file_is_audio(filename: str) -> bool:
+    """Return True when ``filename`` ends in a known audio extension.
+
+    The comparison ignores case, so ``Song.MP3`` and ``Track.Flac`` are
+    recognised just like their lower-case spellings.
+    """
+    return get_file_extension(filename).lower() in song_file_extensions
+
+
+# ---------------------------------------------------------------------------
+# Metadata
+# ---------------------------------------------------------------------------
+
+@dataclass
+class TrackMetadata:
+    """Track-level values written into the Lyricsfile ``metadata`` mapping."""
+    # Always written to the output; extract_metadata() substitutes
+    # placeholder strings when the audio file carries no such tag.
+    title: str
+    artist: str
+    # Written only when truthy.
+    album: Optional[str] = None
+    # Track length in milliseconds; written only when not None.
+    duration_ms: Optional[int] = None
+    # Written only when truthy.
+    language: Optional[str] = None
+    # Written only when not None. Neither extract_metadata() nor
+    # merge_metadata() populates it.
+    offset_ms: Optional[int] = None
+
+
+def extract_metadata(audio_path: str, language: Optional[str] = None) -> TrackMetadata:
+    """Read tags + duration from the audio file via mutagen.
+
+    Falls back to the placeholder strings ``"Unknown"`` / ``"Unknown Artist"``
+    when the corresponding tag is missing, since Lyricsfile requires
+    ``metadata.title`` and ``metadata.artist``.
+
+    A file that mutagen cannot read at all (``mutagen.MutagenError``) is
+    treated like a file without tags: a warning goes to stderr and the
+    placeholders are used, with ``album`` and ``duration_ms`` left as
+    ``None``. ``language`` is passed through unchanged.
+    """
+    try:
+        f = mutagen.File(audio_path, easy=True)
+    except mutagen.MutagenError as exc:
+        # Tags are optional decoration on the output. Failing here would
+        # discard whatever work the caller has already done for this track
+        # and stop the remaining files from being processed.
+        print(
+            f"  Warning: could not read tags from {audio_path}: {exc}",
+            file=sys.stderr,
+        )
+        f = None
+    return TrackMetadata(
+        title=_first_tag(f, "title") or "Unknown",
+        artist=_first_tag(f, "artist") or "Unknown Artist",
+        album=_first_tag(f, "album"),
+        duration_ms=_duration_ms(f),
+        language=language,
+    )
+
+
+def _first_tag(f, key: str) -> Optional[str]:
+    """Return the first value stored under ``key`` as a stripped string.
+
+    ``f`` is any object with a ``get`` method, or ``None``. List values
+    contribute their first element; other values are used as they are.
+    ``None`` is returned when ``f`` is ``None``, when the value is missing or
+    falsy, when the first list element is ``None``, or when the text is empty
+    after stripping.
+    """
+    raw = f.get(key) if f is not None else None
+    if not raw:
+        return None
+    # ``raw`` is truthy here, so a list is guaranteed to have an element 0.
+    candidate = raw[0] if isinstance(raw, list) else raw
+    if candidate is None:
+        return None
+    cleaned = str(candidate).strip()
+    return cleaned if cleaned else None
+
+
+def _duration_ms(f) -> Optional[int]:
+    """Return ``f.info.length`` (seconds) converted to whole milliseconds.
+
+    ``None`` is returned when ``f`` is ``None`` or when either the ``info``
+    attribute or its ``length`` attribute is missing or ``None``.
+    """
+    if f is None:
+        return None
+    stream_info = getattr(f, "info", None)
+    if stream_info is None:
+        return None
+    seconds = getattr(stream_info, "length", None)
+    return None if seconds is None else int(round(seconds * 1000))
+
+
+# ---------------------------------------------------------------------------
+# LRC parsing
+# ---------------------------------------------------------------------------
+
+# LRC metadata tag keys (compared in lower case) mapped to the Lyricsfile
+# metadata field they populate. Several spellings map to "language".
+_LRC_METADATA_KEYS_TO_LYRICSFILE = {
+    "ti": "title",
+    "ar": "artist",
+    "al": "album",
+    "la": "language",
+    "lang": "language",
+    "language": "language",
+}
+
+# Body of a time tag: 1-3 digit minutes, ":" and 2-digit seconds, then an
+# optional 2- or 3-digit fraction introduced by "." or ":".
+_TIMESTAMP_RE = re.compile(r"^(\d{1,3}):(\d{2})(?:[.:](\d{2,3}))?$")
+# A line that starts with a "[...]" tag: group 1 is the text inside the
+# brackets, group 2 is everything after the closing bracket.
+_LINE_RE = re.compile(r"^\[([^\]]+)\](.*)$")
+
+
+@dataclass
+class ParsedLrc:
+    """Result of parsing an LRC file."""
+    # Timed lyric lines as dicts with "start", "end" and "text" keys.
+    segments: List[dict] = field(default_factory=list)
+    # Recognised metadata, keyed by Lyricsfile field name; values are kept
+    # as strings (including "duration_ms" and "offset_ms").
+    metadata_tags: Dict[str, str] = field(default_factory=dict)
+    # Metadata tags that could not be mapped or parsed, keyed by the
+    # lower-cased LRC tag name.
+    dropped_tags: Dict[str, str] = field(default_factory=dict)
+
+
+def _parse_lrc_timestamp(token: str) -> Optional[float]:
+    """Convert the body of an LRC time tag to seconds.
+
+    ``token`` is the text between the brackets, e.g. ``"01:23.45"``.
+    Surrounding whitespace is ignored. Returns ``None`` when the token does
+    not match ``_TIMESTAMP_RE``.
+    """
+    hit = _TIMESTAMP_RE.match(token.strip())
+    if hit is None:
+        return None
+    minutes_txt, seconds_txt, fraction_txt = hit.groups()
+    millis = 0
+    if fraction_txt is not None:
+        # A two-digit fraction is centiseconds; three digits are milliseconds.
+        millis = int(fraction_txt) * 10 if len(fraction_txt) == 2 else int(fraction_txt)
+    return int(minutes_txt) * 60 + int(seconds_txt) + millis / 1000.0
+
+
+def _parse_lrc_length(value: str) -> Optional[int]:
+    """Convert the value of an LRC ``[length:...]`` tag to milliseconds.
+
+    Accepts ``m:ss`` with an optional 2- or 3-digit fraction introduced by
+    ``.`` or ``:``; surrounding whitespace is ignored. Returns ``None`` for
+    anything else.
+    """
+    matched = re.match(r"^(\d+):(\d{2})(?:[.:](\d{2,3}))?$", value.strip())
+    if matched is None:
+        return None
+    minutes_txt, seconds_txt, fraction_txt = matched.groups()
+    whole_ms = (int(minutes_txt) * 60 + int(seconds_txt)) * 1000
+    if fraction_txt is None:
+        return whole_ms
+    # A two-digit fraction is centiseconds; three digits are milliseconds.
+    scale = 10 if len(fraction_txt) == 2 else 1
+    return whole_ms + int(fraction_txt) * scale
+
+
+def _parse_lrc_offset(value: str) -> Optional[int]:
+    """Convert the value of an LRC ``[offset:...]`` tag to integer milliseconds.
+
+    Accepts an optionally signed run of digits, ignoring surrounding
+    whitespace. Returns ``None`` for anything else.
+    """
+    matched = re.match(r"^([+-]?\d+)$", value.strip())
+    return int(matched.group(1)) if matched else None
+
+
+def parse_lrc_file(content: str) -> ParsedLrc:
+    """Parse LRC content into timed segments and metadata.
+
+    Returns a :class:`ParsedLrc` containing:
+
+    - ``segments``: list of ``{"start": float, "end": float, "text": str}``
+      suitable for ``model.align_words()``.  Gap markers (empty-text
+      timestamps) contribute an ``end`` to the preceding segment but do not
+      appear as segments themselves.
+    - ``metadata_tags``: dict of recognised LRC metadata mapped to
+      Lyricsfile-compatible keys.
+    - ``dropped_tags``: dict of LRC metadata tags that have no Lyricsfile
+      equivalent (warned about by the caller).
+
+    A line may start with several time tags (``[00:12.00][00:45.00]text``),
+    the usual LRC shorthand for a repeated line; it yields one timed entry
+    per tag, all sharing the same text.
+
+    If the LRC contains an ``[offset:...]`` tag, the offset is applied to
+    all parsed timestamps so that the returned times are absolute. A
+    positive offset makes lyrics appear sooner, so it is subtracted from
+    each timestamp; results are clamped at zero.
+
+    A leading byte-order mark is ignored. Lines that do not start with a
+    ``[...]`` tag are skipped.
+    """
+    result = ParsedLrc()
+
+    # A BOM left in by the reader would sit in front of the first "[" and
+    # make the first line look like it carries no tag.
+    if content.startswith("\ufeff"):
+        content = content[1:]
+
+    raw_timed: List[Tuple[float, str]] = []
+
+    for raw_line in content.splitlines():
+        raw_line = raw_line.strip()
+        if not raw_line:
+            continue
+        m = _LINE_RE.match(raw_line)
+        if not m:
+            continue
+
+        bracket_content = m.group(1)
+        after_bracket = m.group(2)
+
+        ts = _parse_lrc_timestamp(bracket_content)
+        if ts is not None:
+            # Peel off any further leading time tags so they become extra
+            # timed entries instead of ending up inside the lyric text.
+            stamps = [ts]
+            rest = after_bracket.lstrip()
+            while True:
+                extra = _LINE_RE.match(rest)
+                if not extra:
+                    break
+                extra_ts = _parse_lrc_timestamp(extra.group(1))
+                if extra_ts is None:
+                    # A bracket that is not a time tag is part of the text.
+                    break
+                stamps.append(extra_ts)
+                rest = extra.group(2).lstrip()
+            text = rest.strip()
+            raw_timed.extend((stamp, text) for stamp in stamps)
+            continue
+
+        # Metadata tag: [key:value]
+        if ":" in bracket_content:
+            key, _, value = bracket_content.partition(":")
+            key = key.strip().lower()
+            value = value.strip()
+            if not value:
+                continue
+
+            lyricsfile_key = _LRC_METADATA_KEYS_TO_LYRICSFILE.get(key)
+            if lyricsfile_key is not None:
+                result.metadata_tags[lyricsfile_key] = value
+            elif key == "length":
+                length_ms = _parse_lrc_length(value)
+                if length_ms is not None:
+                    result.metadata_tags["duration_ms"] = str(length_ms)
+                else:
+                    result.dropped_tags[key] = value
+            elif key == "offset":
+                offset_ms = _parse_lrc_offset(value)
+                if offset_ms is not None:
+                    result.metadata_tags["offset_ms"] = str(offset_ms)
+                else:
+                    result.dropped_tags[key] = value
+            else:
+                result.dropped_tags[key] = value
+
+    if not raw_timed:
+        return result
+
+    # Stable sort: entries sharing a timestamp keep their file order.
+    raw_timed.sort(key=lambda t: t[0])
+
+    # Apply offset: shift all timestamps so output is absolute. In LRC a
+    # positive offset means "show lyrics sooner", i.e. an earlier time.
+    offset_s = 0.0
+    if "offset_ms" in result.metadata_tags:
+        offset_s = int(result.metadata_tags["offset_ms"]) / 1000.0
+
+    adjusted: List[Tuple[float, str]] = [
+        (max(0.0, ts - offset_s), text) for ts, text in raw_timed
+    ]
+
+    # Build segments: non-empty text lines become segments; empty-text
+    # lines (gap markers) contribute an end time to the preceding segment.
+    pending_segments: List[dict] = []
+    for ts, text in adjusted:
+        if text:
+            pending_segments.append({"start": ts, "end": None, "text": text})
+        elif pending_segments:
+            # Gap marker: set the preceding segment's end
+            pending_segments[-1]["end"] = ts
+
+    # Fill in missing end times: end of seg N = start of seg N+1
+    for current, following in zip(pending_segments, pending_segments[1:]):
+        if current["end"] is None:
+            current["end"] = following["start"]
+
+    # Last segment: if no explicit end (no trailing gap marker), use
+    # start + 5s as a reasonable upper bound; align_words will confine
+    # within whatever audio is available.
+    if pending_segments and pending_segments[-1]["end"] is None:
+        pending_segments[-1]["end"] = pending_segments[-1]["start"] + 5.0
+
+    result.segments = pending_segments
+    return result
+
+
+def merge_metadata(
+    audio_path: str,
+    lrc_parsed: ParsedLrc,
+    cli_language: Optional[str] = None,
+) -> TrackMetadata:
+    """Combine LRC metadata tags with the audio file's own tags.
+
+    For title, artist, album and duration a non-empty LRC value wins over
+    the value read from the audio file (which already carries placeholder
+    title/artist when the file has none). For language the ``--language``
+    CLI value wins over the LRC tag.
+
+    Every entry of ``lrc_parsed.dropped_tags`` is reported on stderr as a
+    skipped tag. ``offset_ms`` is not carried into the returned metadata.
+    """
+    from_audio = extract_metadata(audio_path, language=cli_language)
+    lrc_tags = lrc_parsed.metadata_tags
+
+    resolved_duration = from_audio.duration_ms
+    if "duration_ms" in lrc_tags:
+        try:
+            resolved_duration = int(lrc_tags["duration_ms"])
+        except ValueError:
+            # Keep the duration read from the audio file.
+            pass
+
+    merged = TrackMetadata(
+        title=lrc_tags.get("title") or from_audio.title,
+        artist=lrc_tags.get("artist") or from_audio.artist,
+        album=lrc_tags.get("album") or from_audio.album,
+        duration_ms=resolved_duration,
+        language=cli_language or lrc_tags.get("language") or from_audio.language,
+    )
+
+    for dropped_key, dropped_value in lrc_parsed.dropped_tags.items():
+        print(
+            f"  Warning: LRC tag [{dropped_key}:{dropped_value}] has no Lyricsfile equivalent; skipped",
+            file=sys.stderr,
+        )
+
+    return merged
+
+
+def _format_lrc_timestamp(seconds: float) -> str:
+    """Format ``seconds`` as an LRC ``mm:ss.xx`` timestamp body.
+
+    Negative input is clamped to zero. The value is rounded to whole
+    centiseconds *before* it is split into minutes and seconds, so a value
+    such as 59.996 carries into the minute (``01:00.00``) instead of
+    producing the invalid ``00:60.00``. Minutes are zero-padded to at least
+    two digits and grow beyond that for long tracks.
+    """
+    if seconds < 0:
+        seconds = 0.0
+    total_centis = int(round(seconds * 100))
+    minutes, centis = divmod(total_centis, 6000)
+    whole_seconds, hundredths = divmod(centis, 100)
+    return f"{minutes:02d}:{whole_seconds:02d}.{hundredths:02d}"
+
+
+def result_to_lrc(
+    result: "stable_whisper.WhisperResult",
+    gap_threshold: float = 1.5,
+) -> str:
+    """Render the segments of ``result`` as LRC text.
+
+    Each segment becomes one ``[mm:ss.xx]text`` line. An empty timestamp
+    line at the segment's end is added after the last segment, and after any
+    segment followed by a pause longer than ``gap_threshold`` seconds, so
+    that players clear the line during the pause. Returns ``""`` when there
+    are no segments; otherwise the text ends with a newline.
+    """
+    segs = list(result.segments)
+    if not segs:
+        return ""
+
+    rendered: List[str] = []
+    last_index = len(segs) - 1
+    for idx, current in enumerate(segs):
+        body = (current.text or "").strip()
+        rendered.append(f"[{_format_lrc_timestamp(current.start)}]{body}")
+        # The final segment always gets a clear marker; earlier ones only
+        # when the following segment starts late enough.
+        if idx == last_index or segs[idx + 1].start - current.end > gap_threshold:
+            rendered.append(f"[{_format_lrc_timestamp(current.end)}]")
+
+    return "\n".join(rendered) + "\n"
+
+
+def result_to_lyricsfile(
+    result: "stable_whisper.WhisperResult",
+    metadata: TrackMetadata,
+) -> str:
+    """Serialise ``result`` and ``metadata`` as a Lyricsfile YAML document.
+
+    The document has the keys ``version``, ``metadata`` and ``lines`` in that
+    order. Optional metadata fields are left out when unset, and
+    ``instrumental`` is always ``False``. Each line carries its text and
+    start/end in milliseconds, plus a ``words`` list when the segment has
+    word timings. No ``plain`` block is emitted.
+    """
+    header = {
+        "title": metadata.title,
+        "artist": metadata.artist,
+    }
+    # (key, value, include?) in output order. album/language are skipped
+    # when falsy, the numeric fields only when None.
+    optional_fields = (
+        ("album", metadata.album, bool(metadata.album)),
+        ("duration_ms", metadata.duration_ms, metadata.duration_ms is not None),
+        ("language", metadata.language, bool(metadata.language)),
+        ("offset_ms", metadata.offset_ms, metadata.offset_ms is not None),
+    )
+    for field_name, field_value, include in optional_fields:
+        if include:
+            header[field_name] = field_value
+    header["instrumental"] = False
+
+    synced_lines = []
+    for segment in result.segments:
+        entry = {
+            "text": (segment.text or "").strip(),
+            "start_ms": _to_ms(segment.start),
+            "end_ms": _to_ms(segment.end),
+        }
+        if segment.words:
+            converted = _words_to_lyricsfile_words(segment.words)
+            if converted:
+                entry["words"] = converted
+        synced_lines.append(entry)
+
+    return yaml.safe_dump(
+        {"version": "1.0", "metadata": header, "lines": synced_lines},
+        sort_keys=False,
+        allow_unicode=True,
+        default_flow_style=False,
+    )
+
+
+def _to_ms(seconds: float) -> int:
+    """Convert ``seconds`` to integer milliseconds using built-in ``round``."""
+    milliseconds = seconds * 1000
+    return int(round(milliseconds))
+
+
+def _words_to_lyricsfile_words(word_timings) -> List[dict]:
+    """Convert word timing objects into Lyricsfile word dicts.
+
+    Each input object provides ``word``, ``start`` and ``end``. Leading
+    whitespace is removed from every word; a word receives a single trailing
+    space when the *next* word originally began with whitespace. The last
+    word never gets one, and words that arrive without leading whitespace
+    (as when text is split without spaces) add none to their predecessor.
+    """
+    raw_texts = [w.word or "" for w in word_timings]
+    stripped = [raw.lstrip() for raw in raw_texts]
+    # True where the original word text began with whitespace.
+    starts_with_space = [
+        len(body) < len(raw) for raw, body in zip(raw_texts, stripped)
+    ]
+
+    converted: List[dict] = []
+    final_index = len(word_timings) - 1
+    for idx, timing in enumerate(word_timings):
+        word_text = stripped[idx]
+        if idx < final_index and starts_with_space[idx + 1]:
+            word_text += " "
+        converted.append(
+            {
+                "text": word_text,
+                "start_ms": _to_ms(timing.start),
+                "end_ms": _to_ms(timing.end),
+            }
+        )
+    return converted