"""Align plain-text lyrics to audio files and write the result as .lrc files."""

import datetime
import os

import click
import srt
import stable_whisper

# File extensions (lower case, without the leading dot) treated as songs.
song_file_extensions = {
    # explicitly supported by whisper
    "mp3",
    "wav",
    "mp4",
    "mpeg",
    "mpga",
    "m4a",
    "webm",
    # stable-ts likely uses ffmpeg to convert this for whisper
    "flac",
}


# Parameters (supplied by click from the command line):
#   model:     name of the whisper model passed to stable_whisper.load_model.
#   language:  language passed to the aligner for every song.
#   directory: root folder that is walked recursively.
@click.command()
@click.help_option("--help", "-h")
@click.option('--model', '-m', default='small', help='Which whisper model to use (choices are those of whisper.available_models)')
@click.option('--language', '-l', required=True)
@click.argument('directory')
def main(model, language, directory):
    """Align lyrics for every song found under DIRECTORY.

    For each audio file, a .txt file with the same base name is read as the
    unaligned lyrics and a .lrc file with the same base name is written.
    Songs without a .txt file, or that already have a .lrc file, are skipped.
    """
    print(f"Loading {model} model for use with language {language}...")
    whisper_model = stable_whisper.load_model(model)

    for (dirpath, _, filenames) in os.walk(directory):
        for f in filenames:
            if not file_is_audio(f):
                continue

            filename = os.path.join(dirpath, f)
            stem = os.path.splitext(f)[0]

            # get corresponding txt file name
            # (we expect unaligned files to have the same filename as the song, except for the extension)
            unaligned_lyrics_filename = os.path.join(dirpath, stem + ".txt")
            # (we also expect existing .lrc files to contain aligned lyrics)
            aligned_lyrics_filename = os.path.join(dirpath, stem + ".lrc")

            if not os.path.exists(unaligned_lyrics_filename):
                print(f"No corresponding unaligned lyric txt exists for {filename}")
                continue

            if os.path.exists(aligned_lyrics_filename):
                print(f"Corresponding aligned lyric txt already exists for {filename}")
                continue

            print(f"Aligning lyrics for {filename}")
            with open(unaligned_lyrics_filename) as lyrics_file:
                unaligned_lyrics = lyrics_file.read()

            # run model to get aligned srt
            result: stable_whisper.WhisperResult = whisper_model.align(
                filename,
                unaligned_lyrics,
                language=language,
                original_split=True,
                regroup=False,
            )

            # turn srt to lrc
            srt_text = result.to_srt_vtt(filepath=None, word_level=False)
            lrc = srt_to_lrc(srt_text)
            if not lrc:
                # Do not leave an empty .lrc behind: it would make later runs
                # skip this song as "already aligned".
                print(f"No aligned lyrics were produced for {filename}")
                continue

            print(f"Writing aligned lyrics at: {aligned_lyrics_filename}")
            # "x" refuses to overwrite a file that appeared since the check above.
            with open(aligned_lyrics_filename, "x") as lrc_file:
                lrc_file.write(lrc)


def get_file_extension(filename: str) -> str:
    """Return the extension of *filename* without the leading dot.

    Only the last suffix is returned ("asdf.omg.lol" -> "lol"); a name with
    no extension yields an empty string. Case is preserved.
    """
    # "asdf.omg.lol" -> [asdf.omg, .lol] -> lol
    return os.path.splitext(filename)[1][1:]


def file_is_audio(filename: str) -> bool:
    """Return True if *filename* has one of the known song extensions.

    The comparison ignores case, so "track.MP3" counts as audio.
    """
    return get_file_extension(filename).lower() in song_file_extensions


def timedelta_to_hhmmssss(td: datetime.timedelta) -> str:
    """Format *td* as an LRC timestamp body, ``mm:ss.xx``.

    LRC timestamps have no hour field, so the minutes are not wrapped at 60
    (1h15m03s becomes "75:03.00"). Fractions are truncated to hundredths of
    a second. Negative durations are clamped to zero.
    """
    non_negative = max(td, datetime.timedelta(0))
    total_centiseconds = non_negative // datetime.timedelta(milliseconds=10)
    minutes, remainder = divmod(total_centiseconds, 60 * 100)
    seconds, centiseconds = divmod(remainder, 100)
    return f"{minutes:02d}:{seconds:02d}.{centiseconds:02d}"


def srt_to_lrc(srt_text: str) -> str:
    """Convert SRT subtitle text into LRC text.

    Each subtitle becomes one ``[mm:ss.xx]content`` line stamped with its
    start time, followed by a final empty-content line stamped with the end
    time of the last subtitle. Returns an empty string when *srt_text*
    contains no subtitles.
    """
    subs = list(srt.parse(srt_text))
    if not subs:
        return ""

    lines = [f"[{timedelta_to_hhmmssss(s.start)}]{s.content}" for s in subs]

    # add the end of lyrics marker
    end_time = subs[-1].end
    lines.append(f"[{timedelta_to_hhmmssss(end_time)}]")

    return "\n".join(lines)


if __name__ == "__main__":
    main()