add support for word-level timings in lyricsfile format and lrc->lyricsfile

2026-05-16 01:54:54 -07:00
parent 9161ab0b24
commit cc7928a532
6 changed files with 1121 additions and 72 deletions
@@ -2,79 +2,145 @@ import os

 import click
 import stable_whisper
-import srt
-import datetime

-song_file_extensions = set([
-    "mp3", # explicitly supported by whisper
-    "wav",
-    "mp4",
-    "mpeg",
-    "mpga",
-    "m4a",
-    "webm",
-    "flac", # stable-ts likely uses ffmpeg to convert this for whisper
-    "opus",
-])
+from output import (
+    extract_metadata,
+    file_is_audio,
+    merge_metadata,
+    parse_lrc_file,
+    result_to_lrc,
+    result_to_lyricsfile,
+)
+
+
+OUTPUT_EXTENSIONS = {  # --format choice -> extension of the output file written beside each audio file
+    "yaml": "yaml",
+    "lrc": "lrc",
+}
+

@click.command()
@click.help_option("--help", "-h")
-@click.option('--model', '-m', default='small', help='Which whisper model to use (choices are those of whisper.available_models)')
-@click.option('--language', '-l', required=True, help='What language the lyrics are in')
-@click.argument('directory')
-def main(model, language, directory):
+@click.option(
+    "--model",
+    "-m",
+    default="small",
+    help="Which whisper model to use (choices are those of whisper.available_models)",
+)
+@click.option(
+    "--language",
+    "-l",
+    required=True,
+    help="What language the lyrics are in",
+)
+@click.option(
+    "--format",
+    "-f",
+    "output_format",
+    type=click.Choice(["yaml", "lrc"], case_sensitive=False),
+    default="yaml",
+    show_default=True,
+    help="Output format. 'yaml' writes a Lyricsfile (.yaml) with word-level timings; 'lrc' writes classic LRC (.lrc).",
+)
+@click.argument("directory")
+def main(model, language, output_format, directory):
+    output_format = output_format.lower()  # the Choice above is case-insensitive; normalise before the lookup
+    output_ext = OUTPUT_EXTENSIONS[output_format]
+
    print(f"Loading {model} model for use with language {language}...")
    model = stable_whisper.load_model(model)

    for (dirpath, _, filenames) in os.walk(directory):
-        if any([ file_is_audio(f) for f in filenames ]):
-            # audio files exist here, so let's process this folder
-            for f in filenames:
-                if file_is_audio(f):
-                    filename = os.path.join(dirpath, f)
-                    # get corresponding txt file name 
-                    # (we expect unaligned files to have the same filename as the song, except for the extension)
-                    unaligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".txt")
-                    # (we also expect existing .lrc files to contain aligned lyrics)
-                    aligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".lrc")
+        if not any(file_is_audio(f) for f in filenames):  # no audio in this directory
+            continue
+        for f in filenames:
+            if not file_is_audio(f):
+                continue
+            audio_path = os.path.join(dirpath, f)
+            base, _ = os.path.splitext(audio_path)
+            output_path = base + "." + output_ext  # output sits next to the audio file, same base name

-                    # run model to get aligned srt
-                    if not os.path.exists(unaligned_lyrics_filename):
-                        print(f"No corresponding unaligned lyric txt exists for {filename}")
-                        continue
-                    if os.path.exists(aligned_lyrics_filename):
-                        print(f"Corresponding aligned lyric txt already exists for {filename}")
-                        continue
-                    print(f"Aligning lyrics for {filename}")
-                    
-                    result: stable_whisper.WhisperResult = model.align(filename, open(unaligned_lyrics_filename).read(), language=language, original_split=True, regroup=False)
+            if os.path.exists(output_path):  # never overwrite an existing result
+                print(f"Aligned lyric file already exists for {audio_path}: {output_path}")
+                continue

-                    # turn srt to lrc
-                    srt = result.to_srt_vtt(filepath=None, word_level=False)
-                    lrc = srt_to_lrc(srt)
-                    print(f"Writing aligned lyrics at: {aligned_lyrics_filename}")
-                    open(aligned_lyrics_filename, "x").write(lrc)
-               
-def get_file_extension(filename: str) -> str:
-    # "asdf.omg.lol" -> [asdf.omg, .lol] -> lol
-    return os.path.splitext(filename)[1][1:]
+            lrc_path = base + ".lrc"  # with --format lrc this equals output_path, so an existing .lrc was skipped above
+            txt_path = base + ".txt"

-def file_is_audio(filename: str) -> bool:
-    return get_file_extension(filename) in song_file_extensions
+            if os.path.exists(lrc_path):  # an existing .lrc takes precedence over a .txt
+                _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language)
+            elif os.path.exists(txt_path):
+                _align_from_txt(model, audio_path, txt_path, output_path, output_format, language)
+            else:
+                print(f"No .lrc or .txt sidecar found for {audio_path}")

-def timedelta_to_hhmmssss(td: datetime.timedelta) -> str:
-    dt = datetime.datetime(1969, 1, 1) + td
-    return dt.strftime('%M:%S.%f')[:-4]

-def srt_to_lrc(srt_text: str) -> str:
-    subs = list(srt.parse(srt_text))
-    lines = [f"[{timedelta_to_hhmmssss(s.start)}]{s.content}" for s in subs]
+def _align_from_txt(model, audio_path, txt_path, output_path, output_format, language):
+    """Full alignment from plain text: determines line boundaries and word timings."""
+    print(f"Aligning lyrics for {audio_path} (from .txt)")
+    with open(txt_path, encoding="utf-8-sig") as fh:  # explicit UTF-8 (BOM tolerated), not the locale default
+        unaligned_text = fh.read()
+
+    result = model.align(
+        audio_path,
+        unaligned_text,
+        language=language,
+        original_split=True,  # presumably keeps the .txt line breaks as segments -- confirm in stable-ts docs
+        regroup=False,
+        vad=True,
+    )
+
+    if not result or not list(result.segments):  # nothing usable came back: write no file
+        print("  Alignment produced no segments; skipping")
+        return
+
+    result.adjust_gaps(one_section=True)
+
+    if output_format == "yaml":
+        metadata = extract_metadata(audio_path, language=language)
+        content = result_to_lyricsfile(result, metadata)
+    else:
+        content = result_to_lrc(result)
+
+    print(f"  Writing aligned lyrics at: {output_path}")
+    with open(output_path, "x", encoding="utf-8") as out_fh:  # "x": raise rather than overwrite an existing file
+        out_fh.write(content)
+
+
+def _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language):
+    """Word-level refinement of existing LRC: keeps line boundaries, adds word timings."""
+    print(f"Refining lyrics for {audio_path} (from .lrc)")
+    with open(lrc_path, encoding="utf-8-sig") as fh:  # explicit UTF-8; a leading BOM is dropped before parsing
+        lrc_content = fh.read()
+
+    parsed = parse_lrc_file(lrc_content)
+
+    if not parsed.segments:  # nothing to refine: write no file
+        print(f"  No timed lines found in {lrc_path}; skipping")
+        return
+
+    result = model.align_words(
+        audio_path,
+        parsed.segments,
+        language=language,
+        vad=True,
+        regroup=False,
+    )
+
+    if not result or not list(result.segments):  # nothing usable came back: write no file
+        print("  Word alignment produced no segments; skipping")
+        return
+
+    if output_format == "yaml":
+        metadata = merge_metadata(audio_path, parsed, cli_language=language)
+        content = result_to_lyricsfile(result, metadata)
+    else:  # NOTE(review): main() skips files whose .lrc already exists when --format is lrc, so it never gets here
+        content = result_to_lrc(result)
+
+    print(f"  Writing refined lyrics at: {output_path}")
+    with open(output_path, "x", encoding="utf-8") as out_fh:  # "x": raise rather than overwrite an existing file
+        out_fh.write(content)

-    # add the end of lyrics marker
-    end_time = subs[-1].end
-    lines.append(f"[{timedelta_to_hhmmssss(end_time)}]")
-    return "\n".join(lines)    

 if __name__ == "__main__":
    main()
-