import os

import click
import stable_whisper

from output import (
    extract_metadata,
    file_is_audio,
    merge_metadata,
    parse_lrc_file,
    result_to_lrc,
    result_to_lyricsfile,
)

# Maps each --format choice to the extension (without the dot) of the file
# written next to the audio file.
OUTPUT_EXTENSIONS = {
    "yaml": "yaml",
    "lrc": "lrc",
}


@click.command()
@click.help_option("--help", "-h")
@click.option(
    "--model",
    "-m",
    default="small",
    help="Which whisper model to use (choices are those of whisper.available_models)",
)
@click.option(
    "--language",
    "-l",
    required=True,
    help="What language the lyrics are in",
)
@click.option(
    "--format",
    "-f",
    "output_format",
    type=click.Choice(["yaml", "lrc"], case_sensitive=False),
    default="yaml",
    show_default=True,
    help="Output format. 'yaml' writes a Lyricsfile (.yaml) with word-level timings; 'lrc' writes classic LRC (.lrc).",
)
@click.argument("directory")
def main(model, language, output_format, directory):
    """Align lyrics for every audio file found under DIRECTORY.

    For each audio file, a sibling .lrc file is refined to word-level
    timings when present; otherwise a sibling .txt file is aligned from
    scratch. Audio files whose output file already exists are skipped.
    """
    # NOTE: click displays the docstring above as the command's --help text,
    # so it is deliberately written for end users.
    output_format = output_format.lower()
    output_ext = OUTPUT_EXTENSIONS[output_format]

    print(f"Loading {model} model for use with language {language}...")
    # Keep the loaded model under its own name rather than rebinding the
    # `model` option, which holds the model's name.
    whisper_model = stable_whisper.load_model(model)

    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            if not file_is_audio(filename):
                continue
            _process_audio_file(
                whisper_model,
                os.path.join(dirpath, filename),
                output_ext,
                output_format,
                language,
            )


def _process_audio_file(model, audio_path, output_ext, output_format, language):
    """Pick the sidecar for one audio file and run the matching alignment.

    Args:
        model: Loaded model object, passed through to the alignment helpers.
        audio_path: Path of the audio file.
        output_ext: Extension (without the dot) of the output file.
        output_format: Either "yaml" or "lrc".
        language: Language of the lyrics, passed through to the helpers.
    """
    base, _ = os.path.splitext(audio_path)
    output_path = base + "." + output_ext
    if os.path.exists(output_path):
        print(f"Aligned lyric file already exists for {audio_path}: {output_path}")
        return

    lrc_path = base + ".lrc"
    txt_path = base + ".txt"
    # An existing .lrc takes precedence over a .txt sidecar.
    if os.path.exists(lrc_path):
        _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language)
    elif os.path.exists(txt_path):
        _align_from_txt(model, audio_path, txt_path, output_path, output_format, language)
    else:
        print(f"No .lrc or .txt sidecar found for {audio_path}")


def _read_text(path):
    """Return the full contents of the text file at *path*."""
    # Decode explicitly instead of relying on the platform default encoding.
    # "utf-8-sig" reads plain UTF-8 and drops a leading byte-order mark if
    # one is present.
    with open(path, encoding="utf-8-sig") as fh:
        return fh.read()


def _write_output(output_path, content):
    """Write *content* to a newly created file at *output_path*.

    Raises:
        FileExistsError: If *output_path* already exists (mode "x" never
            overwrites an existing file).
    """
    # Encode as UTF-8 explicitly so non-ASCII lyrics can be written
    # regardless of the platform default encoding.
    with open(output_path, "x", encoding="utf-8") as out_fh:
        out_fh.write(content)


def _render(result, output_format, build_metadata):
    """Serialize an alignment result in the requested output format.

    Args:
        result: Alignment result to serialize.
        output_format: Either "yaml" or "lrc".
        build_metadata: Zero-argument callable returning the metadata for the
            "yaml" format; it is not called for "lrc".

    Returns:
        The serialized content produced by result_to_lyricsfile ("yaml") or
        result_to_lrc (any other value).
    """
    if output_format == "yaml":
        return result_to_lyricsfile(result, build_metadata())
    return result_to_lrc(result)


def _align_from_txt(model, audio_path, txt_path, output_path, output_format, language):
    """Full alignment from plain text: determines line boundaries and word timings.

    Reads the lyrics from *txt_path*, aligns them against *audio_path* with
    ``model.align`` and writes the result to *output_path*. Nothing is written
    when the alignment yields no segments.
    """
    print(f"Aligning lyrics for {audio_path} (from .txt)")
    unaligned_text = _read_text(txt_path)

    result = model.align(
        audio_path,
        unaligned_text,
        language=language,
        original_split=True,
        regroup=False,
        vad=True,
    )
    if not result or not list(result.segments):
        print(" Alignment produced no segments; skipping")
        return

    result.adjust_gaps(one_section=True)

    content = _render(
        result,
        output_format,
        lambda: extract_metadata(audio_path, language=language),
    )

    print(f" Writing aligned lyrics at: {output_path}")
    _write_output(output_path, content)


def _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language):
    """Word-level refinement of existing LRC: keeps line boundaries, adds word timings.

    Parses *lrc_path*, aligns its segments against *audio_path* with
    ``model.align_words`` and writes the result to *output_path*. Nothing is
    written when the LRC has no segments or the alignment yields none.
    """
    print(f"Refining lyrics for {audio_path} (from .lrc)")
    parsed = parse_lrc_file(_read_text(lrc_path))
    if not parsed.segments:
        print(f" No timed lines found in {lrc_path}; skipping")
        return

    result = model.align_words(
        audio_path,
        parsed.segments,
        language=language,
        vad=True,
        regroup=False,
    )
    if not result or not list(result.segments):
        print(" Word alignment produced no segments; skipping")
        return

    content = _render(
        result,
        output_format,
        lambda: merge_metadata(audio_path, parsed, cli_language=language),
    )

    print(f" Writing refined lyrics at: {output_path}")
    _write_output(output_path, content)


if __name__ == "__main__":
    main()