Add support for word-level timings in the Lyricsfile format and for LRC-to-Lyricsfile conversion
This commit is contained in:
+125
-59
@@ -2,79 +2,145 @@ import os
|
||||
|
||||
import click
|
||||
import stable_whisper
|
||||
import srt
|
||||
import datetime
|
||||
|
||||
# Lower-case file extensions (without the leading dot) treated as audio.
song_file_extensions = {
    "mp3",  # explicitly supported by whisper
    "wav",
    "mp4",
    "mpeg",
    "mpga",
    "m4a",
    "webm",
    "flac",  # stable-ts likely uses ffmpeg to convert this for whisper
    "opus",
}
|
||||
from output import (
|
||||
extract_metadata,
|
||||
file_is_audio,
|
||||
merge_metadata,
|
||||
parse_lrc_file,
|
||||
result_to_lrc,
|
||||
result_to_lyricsfile,
|
||||
)
|
||||
|
||||
|
||||
# Maps each supported --format choice to the file extension (without the
# dot) used for the file written next to the audio.
OUTPUT_EXTENSIONS = {"yaml": "yaml", "lrc": "lrc"}
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.help_option("--help", "-h")
|
||||
@click.option('--model', '-m', default='small', help='Which whisper model to use (choices are those of whisper.available_models)')
|
||||
@click.option('--language', '-l', required=True, help='What language the lyrics are in')
|
||||
@click.argument('directory')
|
||||
def main(model, language, directory):
|
||||
@click.option(
|
||||
"--model",
|
||||
"-m",
|
||||
default="small",
|
||||
help="Which whisper model to use (choices are those of whisper.available_models)",
|
||||
)
|
||||
@click.option(
|
||||
"--language",
|
||||
"-l",
|
||||
required=True,
|
||||
help="What language the lyrics are in",
|
||||
)
|
||||
@click.option(
|
||||
"--format",
|
||||
"-f",
|
||||
"output_format",
|
||||
type=click.Choice(["yaml", "lrc"], case_sensitive=False),
|
||||
default="yaml",
|
||||
show_default=True,
|
||||
help="Output format. 'yaml' writes a Lyricsfile (.yaml) with word-level timings; 'lrc' writes classic LRC (.lrc).",
|
||||
)
|
||||
@click.argument("directory")
|
||||
def main(model, language, output_format, directory):
|
||||
output_format = output_format.lower()
|
||||
output_ext = OUTPUT_EXTENSIONS[output_format]
|
||||
|
||||
print(f"Loading {model} model for use with language {language}...")
|
||||
model = stable_whisper.load_model(model)
|
||||
|
||||
for (dirpath, _, filenames) in os.walk(directory):
|
||||
if any([ file_is_audio(f) for f in filenames ]):
|
||||
# audio files exist here, so let's process this folder
|
||||
for f in filenames:
|
||||
if file_is_audio(f):
|
||||
filename = os.path.join(dirpath, f)
|
||||
# get corresponding txt file name
|
||||
# (we expect unaligned files to have the same filename as the song, except for the extension)
|
||||
unaligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".txt")
|
||||
# (we also expect existing .lrc files to contain aligned lyrics)
|
||||
aligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".lrc")
|
||||
if not any(file_is_audio(f) for f in filenames):
|
||||
continue
|
||||
for f in filenames:
|
||||
if not file_is_audio(f):
|
||||
continue
|
||||
audio_path = os.path.join(dirpath, f)
|
||||
base, _ = os.path.splitext(audio_path)
|
||||
output_path = base + "." + output_ext
|
||||
|
||||
# run model to get aligned srt
|
||||
if not os.path.exists(unaligned_lyrics_filename):
|
||||
print(f"No corresponding unaligned lyric txt exists for {filename}")
|
||||
continue
|
||||
if os.path.exists(aligned_lyrics_filename):
|
||||
print(f"Corresponding aligned lyric txt already exists for {filename}")
|
||||
continue
|
||||
print(f"Aligning lyrics for {filename}")
|
||||
|
||||
result: stable_whisper.WhisperResult = model.align(filename, open(unaligned_lyrics_filename).read(), language=language, original_split=True, regroup=False)
|
||||
if os.path.exists(output_path):
|
||||
print(f"Aligned lyric file already exists for {audio_path}: {output_path}")
|
||||
continue
|
||||
|
||||
# turn srt to lrc
|
||||
srt = result.to_srt_vtt(filepath=None, word_level=False)
|
||||
lrc = srt_to_lrc(srt)
|
||||
print(f"Writing aligned lyrics at: {aligned_lyrics_filename}")
|
||||
open(aligned_lyrics_filename, "x").write(lrc)
|
||||
|
||||
def get_file_extension(filename: str) -> str:
    """Return the final extension of *filename* without its leading dot.

    Only the last suffix is considered ("asdf.omg.lol" -> "lol"); a name
    with no extension yields an empty string.
    """
    _stem, dotted_suffix = os.path.splitext(filename)
    # dotted_suffix is either "" or starts with ".", so dropping the first
    # character removes just the dot.
    return dotted_suffix[1:]
|
||||
lrc_path = base + ".lrc"
|
||||
txt_path = base + ".txt"
|
||||
|
||||
def file_is_audio(filename: str) -> bool:
    """Return True when *filename* ends in one of the known audio extensions.

    The extension is compared case-insensitively, so "SONG.MP3" and
    "track.Flac" are recognised just like their lower-case spellings.
    """
    # song_file_extensions holds lower-case entries only, so normalise the
    # extension before the membership test.
    return get_file_extension(filename).lower() in song_file_extensions
|
||||
if os.path.exists(lrc_path):
|
||||
_refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language)
|
||||
elif os.path.exists(txt_path):
|
||||
_align_from_txt(model, audio_path, txt_path, output_path, output_format, language)
|
||||
else:
|
||||
print(f"No .lrc or .txt sidecar found for {audio_path}")
|
||||
|
||||
def timedelta_to_hhmmssss(td: datetime.timedelta) -> str:
    """Format *td* as the body of an LRC time tag, ``mm:ss.xx``.

    Minutes are not wrapped at 60: an offset of one hour or more keeps
    counting in minutes (1:02:03.45 -> "62:03.45") instead of silently
    dropping the hour component. Fractions are truncated (not rounded) to
    hundredths of a second. Negative offsets are clamped to "00:00.00".
    """
    # Whole hundredths of a second; timedelta floor-division yields an int.
    total_centis = max(td // datetime.timedelta(milliseconds=10), 0)
    minutes, centis = divmod(total_centis, 60 * 100)
    seconds, centis = divmod(centis, 100)
    return f"{minutes:02d}:{seconds:02d}.{centis:02d}"
|
||||
|
||||
def srt_to_lrc(srt_text: str) -> str:
|
||||
subs = list(srt.parse(srt_text))
|
||||
lines = [f"[{timedelta_to_hhmmssss(s.start)}]{s.content}" for s in subs]
|
||||
def _align_from_txt(model, audio_path, txt_path, output_path, output_format, language):
    """Full alignment from plain text: determines line boundaries and word timings.

    Reads the lyrics text at ``txt_path``, aligns it against ``audio_path``
    with ``model.align`` and writes the rendered result to ``output_path``.
    When ``output_format`` is "yaml" the result is rendered with
    ``result_to_lyricsfile`` (with metadata from ``extract_metadata``);
    otherwise it is rendered with ``result_to_lrc``.

    Nothing is written when the alignment yields no segments.

    Raises:
        FileExistsError: if ``output_path`` already exists (it is opened in
            exclusive-creation mode so existing files are never overwritten).
    """
    print(f"Aligning lyrics for {audio_path} (from .txt)")

    # Lyrics routinely contain non-ASCII text, so do not rely on the locale's
    # default encoding. "utf-8-sig" reads plain UTF-8 and also drops a leading
    # BOM so it cannot end up glued to the first word.
    with open(txt_path, encoding="utf-8-sig") as fh:
        unaligned_text = fh.read()

    result = model.align(
        audio_path,
        unaligned_text,
        language=language,
        original_split=True,
        regroup=False,
        vad=True,
    )

    if not result or not list(result.segments):
        print(" Alignment produced no segments; skipping")
        return

    result.adjust_gaps(one_section=True)

    if output_format == "yaml":
        metadata = extract_metadata(audio_path, language=language)
        content = result_to_lyricsfile(result, metadata)
    else:
        content = result_to_lrc(result)

    print(f" Writing aligned lyrics at: {output_path}")
    # "x" refuses to clobber an existing file; UTF-8 keeps the output
    # independent of the platform's default encoding.
    with open(output_path, "x", encoding="utf-8") as out_fh:
        out_fh.write(content)
|
||||
|
||||
|
||||
def _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language):
    """Word-level refinement of existing LRC: keeps line boundaries, adds word timings.

    Reads the LRC file at ``lrc_path``, parses it with ``parse_lrc_file`` and
    passes the parsed segments with ``audio_path`` to ``model.align_words``.
    When ``output_format`` is "yaml" the result is rendered with
    ``result_to_lyricsfile`` (with metadata from ``merge_metadata``);
    otherwise it is rendered with ``result_to_lrc``. The rendered text is
    written to ``output_path``.

    Nothing is written when the LRC has no timed lines or when the word
    alignment yields no segments.

    Raises:
        FileExistsError: if ``output_path`` already exists (it is opened in
            exclusive-creation mode so existing files are never overwritten).
    """
    print(f"Refining lyrics for {audio_path} (from .lrc)")

    # Do not rely on the locale's default encoding for lyrics. "utf-8-sig"
    # reads plain UTF-8 and also drops a leading BOM, which would otherwise
    # sit in front of the first time tag.
    with open(lrc_path, encoding="utf-8-sig") as fh:
        lrc_content = fh.read()

    parsed = parse_lrc_file(lrc_content)

    if not parsed.segments:
        print(f" No timed lines found in {lrc_path}; skipping")
        return

    result = model.align_words(
        audio_path,
        parsed.segments,
        language=language,
        vad=True,
        regroup=False,
    )

    if not result or not list(result.segments):
        print(" Word alignment produced no segments; skipping")
        return

    if output_format == "yaml":
        metadata = merge_metadata(audio_path, parsed, cli_language=language)
        content = result_to_lyricsfile(result, metadata)
    else:
        content = result_to_lrc(result)

    print(f" Writing refined lyrics at: {output_path}")
    # "x" refuses to clobber an existing file; UTF-8 keeps the output
    # independent of the platform's default encoding.
    with open(output_path, "x", encoding="utf-8") as out_fh:
        out_fh.write(content)
|
||||
|
||||
# add the end of lyrics marker
|
||||
end_time = subs[-1].end
|
||||
lines.append(f"[{timedelta_to_hhmmssss(end_time)}]")
|
||||
return "\n".join(lines)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user