add support for word-level timings in lyricfile format and lrc->lyricfile
This commit is contained in:
+125
-59
@@ -2,79 +2,145 @@ import os
|
||||
|
||||
import click
|
||||
import stable_whisper
|
||||
import srt
|
||||
import datetime
|
||||
|
||||
song_file_extensions = set([
|
||||
"mp3", # explicitly supported by whisper
|
||||
"wav",
|
||||
"mp4",
|
||||
"mpeg",
|
||||
"mpga",
|
||||
"m4a",
|
||||
"webm",
|
||||
"flac", # stable-ts likely uses ffmpeg to convert this for whisper
|
||||
"opus",
|
||||
])
|
||||
from output import (
|
||||
extract_metadata,
|
||||
file_is_audio,
|
||||
merge_metadata,
|
||||
parse_lrc_file,
|
||||
result_to_lrc,
|
||||
result_to_lyricsfile,
|
||||
)
|
||||
|
||||
|
||||
OUTPUT_EXTENSIONS = {
|
||||
"yaml": "yaml",
|
||||
"lrc": "lrc",
|
||||
}
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.help_option("--help", "-h")
|
||||
@click.option('--model', '-m', default='small', help='Which whisper model to use (choices are those of whisper.available_models)')
|
||||
@click.option('--language', '-l', required=True, help='What language the lyrics are in')
|
||||
@click.argument('directory')
|
||||
def main(model, language, directory):
|
||||
@click.option(
|
||||
"--model",
|
||||
"-m",
|
||||
default="small",
|
||||
help="Which whisper model to use (choices are those of whisper.available_models)",
|
||||
)
|
||||
@click.option(
|
||||
"--language",
|
||||
"-l",
|
||||
required=True,
|
||||
help="What language the lyrics are in",
|
||||
)
|
||||
@click.option(
|
||||
"--format",
|
||||
"-f",
|
||||
"output_format",
|
||||
type=click.Choice(["yaml", "lrc"], case_sensitive=False),
|
||||
default="yaml",
|
||||
show_default=True,
|
||||
help="Output format. 'yaml' writes a Lyricsfile (.yaml) with word-level timings; 'lrc' writes classic LRC (.lrc).",
|
||||
)
|
||||
@click.argument("directory")
|
||||
def main(model, language, output_format, directory):
|
||||
output_format = output_format.lower()
|
||||
output_ext = OUTPUT_EXTENSIONS[output_format]
|
||||
|
||||
print(f"Loading {model} model for use with language {language}...")
|
||||
model = stable_whisper.load_model(model)
|
||||
|
||||
for (dirpath, _, filenames) in os.walk(directory):
|
||||
if any([ file_is_audio(f) for f in filenames ]):
|
||||
# audio files exist here, so let's process this folder
|
||||
for f in filenames:
|
||||
if file_is_audio(f):
|
||||
filename = os.path.join(dirpath, f)
|
||||
# get corresponding txt file name
|
||||
# (we expect unaligned files to have the same filename as the song, except for the extension)
|
||||
unaligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".txt")
|
||||
# (we also expect existing .lrc files to contain aligned lyrics)
|
||||
aligned_lyrics_filename = os.path.join(dirpath, os.path.splitext(f)[0] + ".lrc")
|
||||
if not any(file_is_audio(f) for f in filenames):
|
||||
continue
|
||||
for f in filenames:
|
||||
if not file_is_audio(f):
|
||||
continue
|
||||
audio_path = os.path.join(dirpath, f)
|
||||
base, _ = os.path.splitext(audio_path)
|
||||
output_path = base + "." + output_ext
|
||||
|
||||
# run model to get aligned srt
|
||||
if not os.path.exists(unaligned_lyrics_filename):
|
||||
print(f"No corresponding unaligned lyric txt exists for {filename}")
|
||||
continue
|
||||
if os.path.exists(aligned_lyrics_filename):
|
||||
print(f"Corresponding aligned lyric txt already exists for {filename}")
|
||||
continue
|
||||
print(f"Aligning lyrics for {filename}")
|
||||
|
||||
result: stable_whisper.WhisperResult = model.align(filename, open(unaligned_lyrics_filename).read(), language=language, original_split=True, regroup=False)
|
||||
if os.path.exists(output_path):
|
||||
print(f"Aligned lyric file already exists for {audio_path}: {output_path}")
|
||||
continue
|
||||
|
||||
# turn srt to lrc
|
||||
srt = result.to_srt_vtt(filepath=None, word_level=False)
|
||||
lrc = srt_to_lrc(srt)
|
||||
print(f"Writing aligned lyrics at: {aligned_lyrics_filename}")
|
||||
open(aligned_lyrics_filename, "x").write(lrc)
|
||||
|
||||
def get_file_extension(filename: str) -> str:
|
||||
# "asdf.omg.lol" -> [asdf.omg, .lol] -> lol
|
||||
return os.path.splitext(filename)[1][1:]
|
||||
lrc_path = base + ".lrc"
|
||||
txt_path = base + ".txt"
|
||||
|
||||
def file_is_audio(filename: str) -> bool:
|
||||
return get_file_extension(filename) in song_file_extensions
|
||||
if os.path.exists(lrc_path):
|
||||
_refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language)
|
||||
elif os.path.exists(txt_path):
|
||||
_align_from_txt(model, audio_path, txt_path, output_path, output_format, language)
|
||||
else:
|
||||
print(f"No .lrc or .txt sidecar found for {audio_path}")
|
||||
|
||||
def timedelta_to_hhmmssss(td: datetime.timedelta) -> str:
|
||||
dt = datetime.datetime(1969, 1, 1) + td
|
||||
return dt.strftime('%M:%S.%f')[:-4]
|
||||
|
||||
def srt_to_lrc(srt_text: str) -> str:
|
||||
subs = list(srt.parse(srt_text))
|
||||
lines = [f"[{timedelta_to_hhmmssss(s.start)}]{s.content}" for s in subs]
|
||||
def _align_from_txt(model, audio_path, txt_path, output_path, output_format, language):
    """Full alignment from plain text: determines line boundaries and word timings.

    Reads the unaligned lyrics from ``txt_path``, aligns them against
    ``audio_path`` with ``model.align`` and writes the result to
    ``output_path`` as a Lyricsfile (``output_format == "yaml"``) or as LRC
    (any other value).  Nothing is written when alignment yields no segments.

    Raises:
        FileExistsError: if ``output_path`` already exists (opened with "x").
    """
    print(f"Aligning lyrics for {audio_path} (from .txt)")
    # Lyrics are frequently non-ASCII; decode explicitly instead of relying on
    # the platform default.  "utf-8-sig" also swallows a leading BOM.
    with open(txt_path, encoding="utf-8-sig") as fh:
        unaligned_text = fh.read()

    result = model.align(
        audio_path,
        unaligned_text,
        language=language,
        original_split=True,
        regroup=False,
        vad=True,
    )

    if not result or not list(result.segments):
        print(" Alignment produced no segments; skipping")
        return

    result.adjust_gaps(one_section=True)

    if output_format == "yaml":
        metadata = extract_metadata(audio_path, language=language)
        content = result_to_lyricsfile(result, metadata)
    else:
        content = result_to_lrc(result)

    print(f" Writing aligned lyrics at: {output_path}")
    # "x" refuses to overwrite; UTF-8 so unicode lyrics can always be encoded.
    with open(output_path, "x", encoding="utf-8") as out_fh:
        out_fh.write(content)
|
||||
|
||||
|
||||
def _refine_from_lrc(model, audio_path, lrc_path, output_path, output_format, language):
    """Word-level refinement of existing LRC: keeps line boundaries, adds word timings.

    Parses ``lrc_path`` with ``parse_lrc_file``, passes the resulting segments
    to ``model.align_words`` and writes the result to ``output_path`` as a
    Lyricsfile (``output_format == "yaml"``) or as LRC (any other value).
    Nothing is written when the LRC has no timed lines or when word alignment
    yields no segments.

    Raises:
        FileExistsError: if ``output_path`` already exists (opened with "x").
    """
    print(f"Refining lyrics for {audio_path} (from .lrc)")
    # Decode explicitly; "utf-8-sig" drops a leading BOM, which would otherwise
    # prevent the first "[...]" line from being recognised.
    with open(lrc_path, encoding="utf-8-sig") as fh:
        lrc_content = fh.read()

    parsed = parse_lrc_file(lrc_content)

    if not parsed.segments:
        print(f" No timed lines found in {lrc_path}; skipping")
        return

    result = model.align_words(
        audio_path,
        parsed.segments,
        language=language,
        vad=True,
        regroup=False,
    )

    if not result or not list(result.segments):
        print(" Word alignment produced no segments; skipping")
        return

    if output_format == "yaml":
        metadata = merge_metadata(audio_path, parsed, cli_language=language)
        content = result_to_lyricsfile(result, metadata)
    else:
        content = result_to_lrc(result)

    print(f" Writing refined lyrics at: {output_path}")
    # "x" refuses to overwrite; UTF-8 so unicode lyrics can always be encoded.
    with open(output_path, "x", encoding="utf-8") as out_fh:
        out_fh.write(content)
|
||||
|
||||
# add the end of lyrics marker
|
||||
end_time = subs[-1].end
|
||||
lines.append(f"[{timedelta_to_hhmmssss(end_time)}]")
|
||||
return "\n".join(lines)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
@@ -0,0 +1,431 @@
|
||||
"""Output writers, LRC parsing, and metadata extraction for txtlyric-to-lrc.
|
||||
|
||||
Writers:
|
||||
|
||||
- :func:`result_to_lrc` -- emits classic LRC, inserting a clear-display
|
||||
marker whenever the gap between consecutive segments exceeds a threshold
|
||||
so that lines do not visually persist through instrumental pauses.
|
||||
- :func:`result_to_lyricsfile` -- emits the YAML-based Lyricsfile format
|
||||
used by lrcget/lrclib, with word-level timings.
|
||||
|
||||
LRC input:
|
||||
|
||||
- :func:`parse_lrc_file` -- parses an LRC file into metadata tags and
|
||||
timed segments (with ``start``/``end``/``text``) suitable for
|
||||
``stable_whisper.align_words``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import mutagen
|
||||
import stable_whisper
|
||||
import yaml
|
||||
|
||||
|
||||
# Lower-case extensions (without the leading dot) that count as audio input.
song_file_extensions = {
    "mp3",
    "wav",
    "mp4",
    "mpeg",
    "mpga",
    "m4a",
    "webm",
    "flac",
    "opus",
}


def get_file_extension(filename: str) -> str:
    """Return the extension of *filename* without the leading dot.

    Only the final suffix is considered ("asdf.omg.lol" -> "lol"); a name
    without a suffix yields the empty string.  Case is preserved.
    """
    return os.path.splitext(filename)[1][1:]


def file_is_audio(filename: str) -> bool:
    """Return True when *filename* has one of the known audio extensions.

    The comparison is case-insensitive, so "SONG.MP3" is accepted just like
    "song.mp3".
    """
    return get_file_extension(filename).lower() in song_file_extensions
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Metadata
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class TrackMetadata:
    """Track-level metadata carried into the Lyricsfile ``metadata`` block."""

    # Always present.
    title: str
    artist: str

    # Optional; None means "not known".
    album: Optional[str] = None
    duration_ms: Optional[int] = None
    language: Optional[str] = None
    offset_ms: Optional[int] = None
|
||||
|
||||
|
||||
def extract_metadata(audio_path: str, language: Optional[str] = None) -> TrackMetadata:
    """Read tags + duration from the audio file via mutagen.

    Falls back to the placeholder strings ``"Unknown"`` / ``"Unknown Artist"``
    when the corresponding tag is missing, since Lyricsfile requires
    ``metadata.title`` and ``metadata.artist``.

    A file that mutagen cannot load is treated like a file without tags
    (placeholders, no album, no duration) and a warning is printed to stderr,
    so one unreadable file does not abort the whole run.

    Args:
        audio_path: path of the audio file to inspect.
        language: value stored unchanged in the returned ``language`` field.
    """
    try:
        f = mutagen.File(audio_path, easy=True)
    except mutagen.MutagenError as err:
        print(
            f" Warning: could not read tags from {audio_path}: {err}",
            file=sys.stderr,
        )
        # The helpers below already cope with a missing file object.
        f = None
    return TrackMetadata(
        title=_first_tag(f, "title") or "Unknown",
        artist=_first_tag(f, "artist") or "Unknown Artist",
        album=_first_tag(f, "album"),
        duration_ms=_duration_ms(f),
        language=language,
    )
|
||||
|
||||
|
||||
def _first_tag(f, key: str) -> Optional[str]:
|
||||
if f is None:
|
||||
return None
|
||||
val = f.get(key)
|
||||
if not val:
|
||||
return None
|
||||
if isinstance(val, list):
|
||||
val = val[0] if val else None
|
||||
if val is None:
|
||||
return None
|
||||
val = str(val).strip()
|
||||
return val or None
|
||||
|
||||
|
||||
def _duration_ms(f) -> Optional[int]:
|
||||
info = getattr(f, "info", None) if f is not None else None
|
||||
length = getattr(info, "length", None) if info is not None else None
|
||||
if length is None:
|
||||
return None
|
||||
return int(round(length * 1000))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LRC parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_LRC_METADATA_KEYS_TO_LYRICSFILE = {
|
||||
"ti": "title",
|
||||
"ar": "artist",
|
||||
"al": "album",
|
||||
"la": "language",
|
||||
"lang": "language",
|
||||
"language": "language",
|
||||
}
|
||||
|
||||
_TIMESTAMP_RE = re.compile(r"^(\d{1,3}):(\d{2})(?:[.:](\d{2,3}))?$")
|
||||
_LINE_RE = re.compile(r"^\[([^\]]+)\](.*)$")
|
||||
|
||||
|
||||
@dataclass
class ParsedLrc:
    """Result of parsing an LRC file."""

    # Timed lines as dicts with "start", "end" and "text" keys.
    segments: List[dict] = field(default_factory=list)
    # Recognised LRC tags, keyed by their Lyricsfile name.
    metadata_tags: Dict[str, str] = field(default_factory=dict)
    # LRC tags without a Lyricsfile counterpart, keyed by their LRC name.
    dropped_tags: Dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _parse_lrc_timestamp(token: str) -> Optional[float]:
    """Parse an LRC timestamp token to seconds, or None if not a timestamp."""
    match = _TIMESTAMP_RE.match(token.strip())
    if match is None:
        return None

    minute_part, second_part, fraction = match.groups()
    fraction = fraction or "0"
    # Two digits are hundredths of a second; anything else is taken as ms.
    millis = int(fraction) * 10 if len(fraction) == 2 else int(fraction)
    return int(minute_part) * 60 + int(second_part) + millis / 1000.0
|
||||
|
||||
|
||||
def _parse_lrc_length(value: str) -> Optional[int]:
|
||||
"""Parse an LRC [length:...] value to milliseconds."""
|
||||
value = value.strip()
|
||||
m = re.match(r"^(\d+):(\d{2})(?:[.:](\d{2,3}))?$", value)
|
||||
if not m:
|
||||
return None
|
||||
minutes = int(m.group(1))
|
||||
seconds = int(m.group(2))
|
||||
frac_raw = m.group(3) or "0"
|
||||
if len(frac_raw) == 2:
|
||||
frac_ms = int(frac_raw) * 10
|
||||
else:
|
||||
frac_ms = int(frac_raw)
|
||||
return (minutes * 60 + seconds) * 1000 + frac_ms
|
||||
|
||||
|
||||
def _parse_lrc_offset(value: str) -> Optional[int]:
|
||||
"""Parse an LRC [offset:...] value to milliseconds (integer, may be negative)."""
|
||||
value = value.strip()
|
||||
m = re.match(r"^([+-]?\d+)$", value)
|
||||
if not m:
|
||||
return None
|
||||
return int(m.group(1))
|
||||
|
||||
|
||||
def parse_lrc_file(content: str) -> ParsedLrc:
    """Parse LRC content into timed segments and metadata.

    Returns a :class:`ParsedLrc` containing:

    - ``segments``: list of ``{"start": float, "end": float, "text": str}``
      suitable for ``model.align_words()``.  Gap markers (empty-text
      timestamps) contribute an ``end`` to the preceding segment but do not
      appear as segments themselves.
    - ``metadata_tags``: dict of recognised LRC metadata mapped to
      Lyricsfile-compatible keys.
    - ``dropped_tags``: dict of LRC metadata tags that have no Lyricsfile
      equivalent (warned about by the caller).

    A line may start with several timestamps (``[00:12.00][00:45.00]text``);
    it then yields one timed entry per timestamp, all sharing the same text.

    If the LRC contains an ``[offset:...]`` tag, the offset is applied to
    all parsed timestamps so that the returned times are absolute.
    """
    result = ParsedLrc()

    raw_timed: List[Tuple[float, str]] = []

    # A byte-order mark left in the decoded text would stop the first line
    # from matching "^\[".
    content = content.lstrip("\ufeff")

    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        m = _LINE_RE.match(raw_line)
        if not m:
            continue

        bracket_content = m.group(1)
        after_bracket = m.group(2)

        ts = _parse_lrc_timestamp(bracket_content)
        if ts is not None:
            # Peel off any further leading timestamps (repeated lines).  A
            # bracket that is not a timestamp, e.g. "[Chorus]", stays text.
            stamps = [ts]
            text = after_bracket.strip()
            while True:
                extra = _LINE_RE.match(text)
                if not extra:
                    break
                extra_ts = _parse_lrc_timestamp(extra.group(1))
                if extra_ts is None:
                    break
                stamps.append(extra_ts)
                text = extra.group(2).strip()
            raw_timed.extend((stamp, text) for stamp in stamps)
            continue

        # Metadata tag: [key:value]
        if ":" in bracket_content:
            key, _, value = bracket_content.partition(":")
            key = key.strip().lower()
            value = value.strip()
            if not value:
                continue

            lyricsfile_key = _LRC_METADATA_KEYS_TO_LYRICSFILE.get(key)
            if lyricsfile_key is not None:
                result.metadata_tags[lyricsfile_key] = value
            elif key == "length":
                length_ms = _parse_lrc_length(value)
                if length_ms is not None:
                    result.metadata_tags["duration_ms"] = str(length_ms)
                else:
                    result.dropped_tags[key] = value
            elif key == "offset":
                offset_ms = _parse_lrc_offset(value)
                if offset_ms is not None:
                    result.metadata_tags["offset_ms"] = str(offset_ms)
                else:
                    result.dropped_tags[key] = value
            else:
                result.dropped_tags[key] = value

    if not raw_timed:
        return result

    # Stable sort: entries sharing a timestamp keep their file order.
    raw_timed.sort(key=lambda t: t[0])

    # Apply offset: shift all timestamps so output is absolute.
    # NOTE(review): the offset is added as-is.  Some LRC descriptions define a
    # positive offset as "lyrics appear sooner" (i.e. subtract) -- confirm
    # which convention the input files follow.
    offset_s = 0.0
    if "offset_ms" in result.metadata_tags:
        offset_s = int(result.metadata_tags["offset_ms"]) / 1000.0

    adjusted: List[Tuple[float, str]] = [
        (max(0.0, ts + offset_s), text) for ts, text in raw_timed
    ]

    # Build segments: non-empty text lines become segments; empty-text
    # lines (gap markers) contribute an end time to the preceding segment.
    pending_segments: List[dict] = []
    for ts, text in adjusted:
        if text:
            pending_segments.append({"start": ts, "end": None, "text": text})
        elif pending_segments:
            # Gap marker: set the preceding segment's end
            pending_segments[-1]["end"] = ts

    # Fill in missing end times: end of seg N = start of seg N+1
    for current, following in zip(pending_segments, pending_segments[1:]):
        if current["end"] is None:
            current["end"] = following["start"]

    # Last segment: if no explicit end (no trailing gap marker), use
    # start + 5s as an upper bound for the final line.
    if pending_segments and pending_segments[-1]["end"] is None:
        pending_segments[-1]["end"] = pending_segments[-1]["start"] + 5.0

    result.segments = pending_segments
    return result
|
||||
|
||||
|
||||
def merge_metadata(
    audio_path: str,
    lrc_parsed: ParsedLrc,
    cli_language: Optional[str] = None,
) -> TrackMetadata:
    """Combine LRC metadata tags with the tags read from the audio file.

    For title, artist, album and duration the LRC value wins when present,
    otherwise the value from :func:`extract_metadata` is used.  For language
    the order is ``cli_language``, then the LRC tag, then the audio metadata.

    Every entry of ``lrc_parsed.dropped_tags`` is reported on stderr as a
    skipped tag.
    """
    from_audio = extract_metadata(audio_path, language=cli_language)
    lrc_tags = lrc_parsed.metadata_tags

    duration_ms = from_audio.duration_ms
    if "duration_ms" in lrc_tags:
        try:
            duration_ms = int(lrc_tags["duration_ms"])
        except ValueError:
            # Keep the duration read from the audio file.
            pass

    merged = TrackMetadata(
        title=lrc_tags.get("title") or from_audio.title,
        artist=lrc_tags.get("artist") or from_audio.artist,
        album=lrc_tags.get("album") or from_audio.album,
        duration_ms=duration_ms,
        language=cli_language or lrc_tags.get("language") or from_audio.language,
    )

    for tag_name, tag_value in lrc_parsed.dropped_tags.items():
        print(
            f" Warning: LRC tag [{tag_name}:{tag_value}] has no Lyricsfile equivalent; skipped",
            file=sys.stderr,
        )

    return merged
|
||||
|
||||
|
||||
def _format_lrc_timestamp(seconds: float) -> str:
|
||||
if seconds < 0:
|
||||
seconds = 0.0
|
||||
minutes = int(seconds // 60)
|
||||
remainder = seconds - minutes * 60
|
||||
return f"{minutes:02d}:{remainder:05.2f}"
|
||||
|
||||
|
||||
def result_to_lrc(
    result: "stable_whisper.WhisperResult",
    gap_threshold: float = 1.5,
) -> str:
    """Render ``result`` as LRC text.

    Each segment becomes one ``[timestamp]text`` line.  An empty timestamp
    line at the segment's end is added after the last segment, and after any
    segment whose successor starts more than ``gap_threshold`` seconds later,
    so the line is cleared during the pause.  A result without segments
    renders as the empty string.
    """
    segments = list(result.segments)
    if not segments:
        return ""

    final_index = len(segments) - 1
    rendered: List[str] = []
    for index, segment in enumerate(segments):
        line_text = (segment.text or "").strip()
        rendered.append(f"[{_format_lrc_timestamp(segment.start)}]{line_text}")

        is_last = index == final_index
        if is_last or segments[index + 1].start - segment.end > gap_threshold:
            rendered.append(f"[{_format_lrc_timestamp(segment.end)}]")

    return "\n".join(rendered) + "\n"
|
||||
|
||||
|
||||
def result_to_lyricsfile(
    result: "stable_whisper.WhisperResult",
    metadata: TrackMetadata,
) -> str:
    """Render ``result`` as a Lyricsfile YAML string.

    The document has ``version``, ``metadata`` and ``lines`` keys, in that
    order.  No ``plain`` block is written.  A line carries a ``words`` list
    only when the segment has words.
    """
    header = {"title": metadata.title, "artist": metadata.artist}
    # (key, value, include?) in output order; falsy strings are left out,
    # numeric fields are left out only when None.
    optional_fields = (
        ("album", metadata.album, bool(metadata.album)),
        ("duration_ms", metadata.duration_ms, metadata.duration_ms is not None),
        ("language", metadata.language, bool(metadata.language)),
        ("offset_ms", metadata.offset_ms, metadata.offset_ms is not None),
    )
    for key, value, include in optional_fields:
        if include:
            header[key] = value
    header["instrumental"] = False

    synced_lines = []
    for segment in result.segments:
        entry = {
            "text": (segment.text or "").strip(),
            "start_ms": _to_ms(segment.start),
            "end_ms": _to_ms(segment.end),
        }
        if segment.words:
            words = _words_to_lyricsfile_words(segment.words)
            if words:
                entry["words"] = words
        synced_lines.append(entry)

    return yaml.safe_dump(
        {"version": "1.0", "metadata": header, "lines": synced_lines},
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    )
|
||||
|
||||
|
||||
def _to_ms(seconds: float) -> int:
|
||||
return int(round(seconds * 1000))
|
||||
|
||||
|
||||
def _words_to_lyricsfile_words(word_timings) -> List[dict]:
|
||||
"""Convert stable-ts word objects to Lyricsfile word objects.
|
||||
|
||||
Whisper's tokenization produces words with leading whitespace
|
||||
(e.g. ``" club"``). Lyricsfile expects trailing whitespace except on
|
||||
the final word. This re-attaches the leading space of word ``i+1``
|
||||
onto the trailing edge of word ``i``. For CJK languages stable-ts
|
||||
splits without spaces, in which case no spacing is injected.
|
||||
"""
|
||||
bodies = [(w.word or "").lstrip() for w in word_timings]
|
||||
leading_spaces = [(w.word or "")[: len(w.word or "") - len((w.word or "").lstrip())] for w in word_timings]
|
||||
|
||||
out: List[dict] = []
|
||||
for i, w in enumerate(word_timings):
|
||||
text = bodies[i]
|
||||
if i + 1 < len(word_timings) and leading_spaces[i + 1]:
|
||||
text = text + " "
|
||||
out.append(
|
||||
{
|
||||
"text": text,
|
||||
"start_ms": _to_ms(w.start),
|
||||
"end_ms": _to_ms(w.end),
|
||||
}
|
||||
)
|
||||
return out
|
||||
Reference in New Issue
Block a user