Add support for word-level timings in the Lyricsfile format, and for converting existing LRC files to Lyricsfile

This commit is contained in:
2026-05-16 01:54:54 -07:00
parent 9161ab0b24
commit cc7928a532
6 changed files with 1121 additions and 72 deletions
+125 -59
View File
@@ -2,79 +2,145 @@ import os
import click
import stable_whisper
import srt
import datetime
# Lower-case extensions (without the leading dot) of files treated as songs.
song_file_extensions = {
    # explicitly supported by whisper
    "mp3",
    "wav",
    "mp4",
    "mpeg",
    "mpga",
    "m4a",
    "webm",
    # stable-ts likely uses ffmpeg to convert this for whisper
    "flac",
    "opus",
}
from output import (
extract_metadata,
file_is_audio,
merge_metadata,
parse_lrc_file,
result_to_lrc,
result_to_lyricsfile,
)
# Maps each --format choice to the extension (without the dot) that is
# appended to the audio file's base name to build the output path.
OUTPUT_EXTENSIONS = dict(
    yaml="yaml",
    lrc="lrc",
)
@click.command()
@click.help_option("--help", "-h")
@click.option('--model', '-m', default='small', help='Which whisper model to use (choices are those of whisper.available_models)')
@click.option('--language', '-l', required=True, help='What language the lyrics are in')
@click.argument('directory')
def main(model, language, directory):
@click.option(
"--model",
"-m",
default="small",
help="Which whisper model to use (choices are those of whisper.available_models)",
)
@click.option(
"--language",
"-l",
required=True,
help="What language the lyrics are in",
)
@click.option(
"--format",
"-f",
"output_format",
type=click.Choice(["yaml", "lrc"], case_sensitive=False),
default="yaml",
show_default=True,
help="Output format. 'yaml' writes a Lyricsfile (.yaml) with word-level timings; 'lrc' writes classic LRC (.lrc).",
)
@click.argument("directory")
def main(model, language, output_format, directory):
output_format = output_format.lower()
output_ext = OUTPUT_EXTENSIONS[output_format]
print(f"Loading {model} model for use with language {language}...")
model = stable_whisper.load_model(model)
for (dirpath, _, filenames) in os.walk(directory):
if any([ file_is_audio(f) for f in filenames ]):
# audio files exist here, so let's process this folder
for f in filenames:
if file_is_audio(f):
filename = os.path.join(dirpath, f)
# get corresponding txt file name
# (we expect unaligned files to have the same filename as the song, except for the extension)
unaligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".txt")
# (we also expect existing .lrc files to contain aligned lyrics)
aligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".lrc")
if not any(file_is_audio(f) for f in filenames):
continue
for f in filenames:
if not file_is_audio(f):
continue
audio_path = os.path.join(dirpath, f)
base, _ = os.path.splitext(audio_path)
output_path = base + "." + output_ext
# run model to get aligned srt
if not os.path.exists(unaligned_lyrics_filename):
print(f"No corresponding unaligned lyric txt exists for {filename}")
continue
if os.path.exists(aligned_lyrics_filename):
print(f"Corresponding aligned lyric txt already exists for {filename}")
continue
print(f"Aligning lyrics for {filename}")
result: stable_whisper.WhisperResult = model.align(filename, open(unaligned_lyrics_filename).read(), language=language, original_split=True, regroup=False)
if os.path.exists(output_path):
print(f"Aligned lyric file already exists for {audio_path}: {output_path}")
continue
# turn srt to lrc
srt = result.to_srt_vtt(filepath=None, word_level=False)
lrc = srt_to_lrc(srt)
print(f"Writing aligned lyrics at: {aligned_lyrics_filename}")
open(aligned_lyrics_filename, "x").write(lrc)
def get_file_extension(filename: str) -> str:
    """Return the final extension of *filename* without its leading dot.

    For example ``"asdf.omg.lol"`` gives ``"lol"``; a name without an
    extension gives the empty string.
    """
    _, dotted_ext = os.path.splitext(filename)
    return dotted_ext[1:]
lrc_path = base + ".lrc"
txt_path = base + ".txt"
def file_is_audio(filename: str) -> bool:
    """Return True when *filename* has one of the known song extensions.

    The extension is lower-cased before the lookup because the entries of
    ``song_file_extensions`` are all lower-case; without this, a file such
    as ``SONG.MP3`` would be silently ignored.
    """
    return get_file_extension(filename).lower() in song_file_extensions
if os.path.exists(lrc_path):
_refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language)
elif os.path.exists(txt_path):
_align_from_txt(model, audio_path, txt_path, output_path, output_format, language)
else:
print(f"No .lrc or .txt sidecar found for {audio_path}")
def timedelta_to_hhmmssss(td: datetime.timedelta) -> str:
    """Format *td* as an LRC timestamp body, ``mm:ss.xx``.

    Minutes are not wrapped at 60, so an offset of one hour or more is
    rendered as e.g. ``62:03.45`` instead of losing the hour. Fractions of
    a second are truncated (not rounded) to centiseconds.

    Args:
        td: Offset from the start of the song; expected to be non-negative.

    Returns:
        The timestamp with at least two digits per field.
    """
    # Integer arithmetic throughout avoids float rounding surprises.
    total_centis = (td // datetime.timedelta(microseconds=1)) // 10_000
    minutes, remainder = divmod(total_centis, 60 * 100)
    seconds, centis = divmod(remainder, 100)
    return f"{minutes:02d}:{seconds:02d}.{centis:02d}"
def srt_to_lrc(srt_text: str) -> str:
subs = list(srt.parse(srt_text))
lines = [f"[{timedelta_to_hhmmssss(s.start)}]{s.content}" for s in subs]
def _align_from_txt(model, audio_path, txt_path, output_path, output_format, language):
"""Full alignment from plain text: determines line boundaries and word timings."""
print(f"Aligning lyrics for {audio_path} (from .txt)")
with open(txt_path) as fh:
unaligned_text = fh.read()
result = model.align(
audio_path,
unaligned_text,
language=language,
original_split=True,
regroup=False,
vad=True,
)
if not result or not list(result.segments):
print(f" Alignment produced no segments; skipping")
return
result.adjust_gaps(one_section=True)
if output_format == "yaml":
metadata = extract_metadata(audio_path, language=language)
content = result_to_lyricsfile(result, metadata)
else:
content = result_to_lrc(result)
print(f" Writing aligned lyrics at: {output_path}")
with open(output_path, "x") as out_fh:
out_fh.write(content)
def _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language):
    """Add word-level timings to an existing LRC file and write the result.

    Parses ``lrc_path`` with ``parse_lrc_file``, passes the parsed segments
    to ``model.align_words`` together with ``audio_path`` and writes the
    rendered lyrics to ``output_path``. Nothing is written when the LRC has
    no segments or the word alignment yields none.

    Args:
        model: Object exposing ``align_words(audio, segments, **options)``.
        audio_path: Path of the audio file to align against.
        lrc_path: Path of the existing LRC sidecar.
        output_path: Destination file; it must not exist yet.
        output_format: ``"yaml"`` renders a Lyricsfile using metadata from
            ``merge_metadata``; any other value renders LRC.
        language: Language of the lyrics, forwarded to the model and to
            ``merge_metadata`` as ``cli_language``.

    Raises:
        FileExistsError: If ``output_path`` already exists.
    """
    print(f"Refining lyrics for {audio_path} (from .lrc)")

    # Do not depend on the platform's default encoding. "utf-8-sig" also
    # drops a leading BOM, which would otherwise precede the first tag.
    with open(lrc_path, encoding="utf-8-sig") as fh:
        lrc_content = fh.read()

    parsed = parse_lrc_file(lrc_content)
    if not parsed.segments:
        print(f" No timed lines found in {lrc_path}; skipping")
        return

    result = model.align_words(
        audio_path,
        parsed.segments,
        language=language,
        vad=True,
        regroup=False,
    )
    if not result or not list(result.segments):
        print(" Word alignment produced no segments; skipping")
        return

    # Render the whole document before touching the filesystem so that a
    # rendering failure cannot leave an empty output file behind.
    if output_format == "yaml":
        metadata = merge_metadata(audio_path, parsed, cli_language=language)
        content = result_to_lyricsfile(result, metadata)
    else:
        content = result_to_lrc(result)

    print(f" Writing refined lyrics at: {output_path}")
    # Mode "x" refuses to overwrite an existing file.
    with open(output_path, "x", encoding="utf-8") as out_fh:
        out_fh.write(content)
# add the end of lyrics marker
end_time = subs[-1].end
lines.append(f"[{timedelta_to_hhmmssss(end_time)}]")
return "\n".join(lines)
if __name__ == "__main__":
main()