432 lines
13 KiB
Python
432 lines
13 KiB
Python
|
|
"""Output writers, LRC parsing, and metadata extraction for txtlyric-to-lrc.
|
||
|
|
|
||
|
|
Writers:
|
||
|
|
|
||
|
|
- :func:`result_to_lrc` -- emits classic LRC, inserting a clear-display
|
||
|
|
marker whenever the gap between consecutive segments exceeds a threshold
|
||
|
|
so that lines do not visually persist through instrumental pauses.
|
||
|
|
- :func:`result_to_lyricsfile` -- emits the YAML-based Lyricsfile format
|
||
|
|
used by lrcget/lrclib, with word-level timings.
|
||
|
|
|
||
|
|
LRC input:
|
||
|
|
|
||
|
|
- :func:`parse_lrc_file` -- parses an LRC file into metadata tags and
|
||
|
|
timed segments (with ``start``/``end``/``text``) suitable for
|
||
|
|
``stable_whisper.align_words``.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from typing import Dict, List, Optional, Tuple
|
||
|
|
|
||
|
|
import mutagen
|
||
|
|
import stable_whisper
|
||
|
|
import yaml
|
||
|
|
|
||
|
|
|
||
|
|
# Extensions (lower-case, without the leading dot) that are treated as
# audio/song files by :func:`file_is_audio`.
song_file_extensions = {
    "mp3",
    "wav",
    "mp4",
    "mpeg",
    "mpga",
    "m4a",
    "webm",
    "flac",
    "opus",
}


def get_file_extension(filename: str) -> str:
    """Return the extension of ``filename`` without the leading dot.

    The original letter case is preserved. An empty string is returned
    when ``os.path.splitext`` finds no extension.
    """
    return os.path.splitext(filename)[1][1:]


def file_is_audio(filename: str) -> bool:
    """Return ``True`` when ``filename`` has a known audio extension.

    The comparison is case-insensitive so that files such as ``Song.MP3``
    or ``track.FLAC`` are recognised; ``song_file_extensions`` stores its
    entries in lower case.
    """
    return get_file_extension(filename).lower() in song_file_extensions
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Metadata
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
@dataclass
class TrackMetadata:
    """Track-level metadata used when rendering a Lyricsfile document.

    ``title`` and ``artist`` are mandatory; every other field defaults to
    ``None`` and is treated as "not available".
    """

    # Required fields.
    title: str
    artist: str

    # Optional fields.
    album: Optional[str] = None
    duration_ms: Optional[int] = None  # track length in milliseconds
    language: Optional[str] = None
    offset_ms: Optional[int] = None  # timing offset in milliseconds
|
||
|
|
|
||
|
|
|
||
|
|
def extract_metadata(audio_path: str, language: Optional[str] = None) -> TrackMetadata:
    """Build a :class:`TrackMetadata` from the tags of an audio file.

    The file is opened with ``mutagen.File(..., easy=True)``. Title and
    artist fall back to the placeholders ``"Unknown"`` and
    ``"Unknown Artist"`` when the tag is missing or blank, because
    Lyricsfile requires ``metadata.title`` and ``metadata.artist``.
    ``language`` is passed through unchanged.
    """
    audio = mutagen.File(audio_path, easy=True)

    # The helpers tolerate ``audio`` being None and return None in that case.
    title = _first_tag(audio, "title") or "Unknown"
    artist = _first_tag(audio, "artist") or "Unknown Artist"
    album = _first_tag(audio, "album")
    duration = _duration_ms(audio)

    return TrackMetadata(
        title=title,
        artist=artist,
        album=album,
        duration_ms=duration,
        language=language,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def _first_tag(f, key: str) -> Optional[str]:
|
||
|
|
if f is None:
|
||
|
|
return None
|
||
|
|
val = f.get(key)
|
||
|
|
if not val:
|
||
|
|
return None
|
||
|
|
if isinstance(val, list):
|
||
|
|
val = val[0] if val else None
|
||
|
|
if val is None:
|
||
|
|
return None
|
||
|
|
val = str(val).strip()
|
||
|
|
return val or None
|
||
|
|
|
||
|
|
|
||
|
|
def _duration_ms(f) -> Optional[int]:
|
||
|
|
info = getattr(f, "info", None) if f is not None else None
|
||
|
|
length = getattr(info, "length", None) if info is not None else None
|
||
|
|
if length is None:
|
||
|
|
return None
|
||
|
|
return int(round(length * 1000))
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# LRC parsing
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
_LRC_METADATA_KEYS_TO_LYRICSFILE = {
|
||
|
|
"ti": "title",
|
||
|
|
"ar": "artist",
|
||
|
|
"al": "album",
|
||
|
|
"la": "language",
|
||
|
|
"lang": "language",
|
||
|
|
"language": "language",
|
||
|
|
}
|
||
|
|
|
||
|
|
_TIMESTAMP_RE = re.compile(r"^(\d{1,3}):(\d{2})(?:[.:](\d{2,3}))?$")
|
||
|
|
_LINE_RE = re.compile(r"^\[([^\]]+)\](.*)$")
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class ParsedLrc:
    """Container for everything extracted from one LRC document."""

    # Timed lyric lines as dicts with "start", "end" and "text" keys.
    segments: List[dict] = field(default_factory=list)
    # Recognised LRC tags, keyed by their Lyricsfile field name.
    metadata_tags: Dict[str, str] = field(default_factory=dict)
    # LRC tags that could not be mapped, keyed by the lower-cased LRC key.
    dropped_tags: Dict[str, str] = field(default_factory=dict)
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_lrc_timestamp(token: str) -> Optional[float]:
    """Convert an LRC timestamp such as ``01:23.45`` to seconds.

    Returns ``None`` when ``token`` (after stripping) does not match
    ``_TIMESTAMP_RE``. A two-digit fraction is read as hundredths of a
    second, a three-digit fraction as milliseconds.
    """
    match = _TIMESTAMP_RE.match(token.strip())
    if match is None:
        return None

    minutes_txt, seconds_txt, fraction_txt = match.groups()

    if fraction_txt is None:
        millis = 0
    elif len(fraction_txt) == 2:
        # Hundredths -> milliseconds.
        millis = int(fraction_txt) * 10
    else:
        millis = int(fraction_txt)

    return int(minutes_txt) * 60 + int(seconds_txt) + millis / 1000.0
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_lrc_length(value: str) -> Optional[int]:
|
||
|
|
"""Parse an LRC [length:...] value to milliseconds."""
|
||
|
|
value = value.strip()
|
||
|
|
m = re.match(r"^(\d+):(\d{2})(?:[.:](\d{2,3}))?$", value)
|
||
|
|
if not m:
|
||
|
|
return None
|
||
|
|
minutes = int(m.group(1))
|
||
|
|
seconds = int(m.group(2))
|
||
|
|
frac_raw = m.group(3) or "0"
|
||
|
|
if len(frac_raw) == 2:
|
||
|
|
frac_ms = int(frac_raw) * 10
|
||
|
|
else:
|
||
|
|
frac_ms = int(frac_raw)
|
||
|
|
return (minutes * 60 + seconds) * 1000 + frac_ms
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_lrc_offset(value: str) -> Optional[int]:
|
||
|
|
"""Parse an LRC [offset:...] value to milliseconds (integer, may be negative)."""
|
||
|
|
value = value.strip()
|
||
|
|
m = re.match(r"^([+-]?\d+)$", value)
|
||
|
|
if not m:
|
||
|
|
return None
|
||
|
|
return int(m.group(1))
|
||
|
|
|
||
|
|
|
||
|
|
def parse_lrc_file(content: str) -> ParsedLrc:
    """Parse LRC content into timed segments and metadata.

    Returns a :class:`ParsedLrc` containing:

    - ``segments``: list of ``{"start": float, "end": float, "text": str}``
      suitable for ``model.align_words()``. Gap markers (empty-text
      timestamps) contribute an ``end`` to the preceding segment but do not
      appear as segments themselves.
    - ``metadata_tags``: dict of recognised LRC metadata mapped to
      Lyricsfile-compatible keys (``[length:...]`` becomes ``duration_ms``
      and ``[offset:...]`` becomes ``offset_ms``, both stored as strings).
    - ``dropped_tags``: dict of LRC metadata tags that have no Lyricsfile
      equivalent, or whose value could not be parsed (warned about by the
      caller).

    A line may carry several leading timestamps
    (``[00:12.00][00:45.00]text``); the text is then emitted once per
    timestamp. Lines that do not start with a ``[...]`` tag are ignored.

    If the LRC contains an ``[offset:...]`` tag, the offset is added to
    all parsed timestamps (clamped at zero) so that the returned times
    are absolute.
    """
    result = ParsedLrc()

    raw_timed: List[Tuple[float, str]] = []

    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        m = _LINE_RE.match(raw_line)
        if not m:
            continue

        bracket_content = m.group(1)
        after_bracket = m.group(2)

        ts = _parse_lrc_timestamp(bracket_content)
        if ts is not None:
            # Consume any further leading timestamp tags so a repeated
            # line is not mistaken for text that starts with "[mm:ss.xx]".
            stamps = [ts]
            remainder = after_bracket.strip()
            while True:
                extra = _LINE_RE.match(remainder)
                if extra is None:
                    break
                extra_ts = _parse_lrc_timestamp(extra.group(1))
                if extra_ts is None:
                    # A bracket that is not a timestamp (e.g. "[Chorus]")
                    # is part of the lyric text.
                    break
                stamps.append(extra_ts)
                remainder = extra.group(2).strip()
            raw_timed.extend((stamp, remainder) for stamp in stamps)
            continue

        # Metadata tag: [key:value]
        if ":" in bracket_content:
            key, _, value = bracket_content.partition(":")
            key = key.strip().lower()
            value = value.strip()
            if not value:
                continue

            lyricsfile_key = _LRC_METADATA_KEYS_TO_LYRICSFILE.get(key)
            if lyricsfile_key is not None:
                result.metadata_tags[lyricsfile_key] = value
            elif key == "length":
                length_ms = _parse_lrc_length(value)
                if length_ms is not None:
                    result.metadata_tags["duration_ms"] = str(length_ms)
                else:
                    result.dropped_tags[key] = value
            elif key == "offset":
                offset_ms = _parse_lrc_offset(value)
                if offset_ms is not None:
                    result.metadata_tags["offset_ms"] = str(offset_ms)
                else:
                    result.dropped_tags[key] = value
            else:
                result.dropped_tags[key] = value

    if not raw_timed:
        return result

    # Stable sort: entries sharing a timestamp keep their file order.
    raw_timed.sort(key=lambda t: t[0])

    # Apply offset: shift all timestamps so output is absolute.
    # NOTE(review): the offset is *added* here; some LRC players treat a
    # positive offset as "lyrics appear earlier" -- confirm the intended
    # sign convention.
    offset_s = 0.0
    if "offset_ms" in result.metadata_tags:
        offset_s = int(result.metadata_tags["offset_ms"]) / 1000.0

    adjusted: List[Tuple[float, str]] = [
        (max(0.0, ts + offset_s), text) for ts, text in raw_timed
    ]

    # Build segments: non-empty text lines become segments; empty-text
    # lines (gap markers) contribute an end time to the preceding segment.
    pending_segments: List[dict] = []
    for ts, text in adjusted:
        if text:
            pending_segments.append({"start": ts, "end": None, "text": text})
        elif pending_segments:
            # Gap marker: set the preceding segment's end
            pending_segments[-1]["end"] = ts

    # Fill in missing end times: end of seg N = start of seg N+1
    for i in range(len(pending_segments) - 1):
        if pending_segments[i]["end"] is None:
            pending_segments[i]["end"] = pending_segments[i + 1]["start"]

    # Last segment: if no explicit end (no trailing gap marker), use
    # start + 5s as a reasonable upper bound; align_words will confine
    # within whatever audio is available.
    if pending_segments and pending_segments[-1]["end"] is None:
        pending_segments[-1]["end"] = pending_segments[-1]["start"] + 5.0

    result.segments = pending_segments
    return result
|
||
|
|
|
||
|
|
|
||
|
|
def merge_metadata(
    audio_path: str,
    lrc_parsed: ParsedLrc,
    cli_language: Optional[str] = None,
) -> TrackMetadata:
    """Combine LRC tags with the audio file's own tags.

    For title, artist, album and duration an LRC tag wins over the value
    read from the audio file (which already carries placeholder
    title/artist where needed). For language the order is
    ``cli_language``, then the LRC tag, then the audio metadata.

    One warning per entry of ``lrc_parsed.dropped_tags`` is written to
    stderr.
    """
    from_audio = extract_metadata(audio_path, language=cli_language)
    lrc_tags = lrc_parsed.metadata_tags

    # Start from the audio duration and override it with a parseable LRC value.
    duration_ms = from_audio.duration_ms
    if "duration_ms" in lrc_tags:
        try:
            duration_ms = int(lrc_tags["duration_ms"])
        except ValueError:
            pass

    for dropped_key, dropped_value in lrc_parsed.dropped_tags.items():
        print(
            f"  Warning: LRC tag [{dropped_key}:{dropped_value}] has no Lyricsfile equivalent; skipped",
            file=sys.stderr,
        )

    return TrackMetadata(
        title=lrc_tags.get("title") or from_audio.title,
        artist=lrc_tags.get("artist") or from_audio.artist,
        album=lrc_tags.get("album") or from_audio.album,
        duration_ms=duration_ms,
        language=cli_language or lrc_tags.get("language") or from_audio.language,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def _format_lrc_timestamp(seconds: float) -> str:
|
||
|
|
if seconds < 0:
|
||
|
|
seconds = 0.0
|
||
|
|
minutes = int(seconds // 60)
|
||
|
|
remainder = seconds - minutes * 60
|
||
|
|
return f"{minutes:02d}:{remainder:05.2f}"
|
||
|
|
|
||
|
|
|
||
|
|
def result_to_lrc(
    result: "stable_whisper.WhisperResult",
    gap_threshold: float = 1.5,
) -> str:
    """Serialise the segments of ``result`` as classic LRC text.

    Every segment becomes a ``[mm:ss.xx]text`` line. An empty timestamp
    line at the segment's end is added after the last segment, and after
    any segment followed by a silence longer than ``gap_threshold``
    seconds, so players clear the line during the pause. An empty string
    is returned when there are no segments.
    """
    segs = list(result.segments)
    if not segs:
        return ""

    # Pair each segment with its successor; the last one pairs with None.
    successors = segs[1:] + [None]

    out: List[str] = []
    for current, upcoming in zip(segs, successors):
        lyric = (current.text or "").strip()
        out.append(f"[{_format_lrc_timestamp(current.start)}]{lyric}")

        needs_clear = upcoming is None or upcoming.start - current.end > gap_threshold
        if needs_clear:
            out.append(f"[{_format_lrc_timestamp(current.end)}]")

    return "".join(line + "\n" for line in out)
|
||
|
|
|
||
|
|
|
||
|
|
def result_to_lyricsfile(
    result: "stable_whisper.WhisperResult",
    metadata: TrackMetadata,
) -> str:
    """Serialise ``result`` and ``metadata`` as a Lyricsfile YAML document.

    The document has ``version``, ``metadata`` and ``lines`` keys, in that
    order. No ``plain`` block is written. Each line carries its text and
    start/end in milliseconds, plus a ``words`` list when the segment has
    word timings.
    """
    meta_out = {"title": metadata.title, "artist": metadata.artist}

    # (key, value, include?) -- evaluated in the order they are emitted.
    optional_fields = (
        ("album", metadata.album, bool(metadata.album)),
        ("duration_ms", metadata.duration_ms, metadata.duration_ms is not None),
        ("language", metadata.language, bool(metadata.language)),
        ("offset_ms", metadata.offset_ms, metadata.offset_ms is not None),
    )
    for name, value, present in optional_fields:
        if present:
            meta_out[name] = value
    meta_out["instrumental"] = False

    rendered_lines = []
    for segment in result.segments:
        entry = {
            "text": (segment.text or "").strip(),
            "start_ms": _to_ms(segment.start),
            "end_ms": _to_ms(segment.end),
        }
        if segment.words:
            words = _words_to_lyricsfile_words(segment.words)
            if words:
                entry["words"] = words
        rendered_lines.append(entry)

    # sort_keys=False keeps the insertion order chosen above.
    return yaml.safe_dump(
        {"version": "1.0", "metadata": meta_out, "lines": rendered_lines},
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def _to_ms(seconds: float) -> int:
|
||
|
|
return int(round(seconds * 1000))
|
||
|
|
|
||
|
|
|
||
|
|
def _words_to_lyricsfile_words(word_timings) -> List[dict]:
    """Convert word-timing objects into Lyricsfile word dicts.

    Each input object provides ``word``, ``start`` and ``end``. Leading
    whitespace is removed from every word; when the *following* word
    started with whitespace, a single trailing space is appended instead.
    The last word therefore never gains a trailing space, and words that
    arrive without leading whitespace (e.g. unspaced scripts) are not
    separated.
    """
    raw_texts = [(w.word or "") for w in word_timings]
    # Each word is paired with the raw text of its successor ("" for the last).
    successors = raw_texts[1:] + [""]

    converted: List[dict] = []
    for timing, current, following in zip(word_timings, raw_texts, successors):
        text = current.lstrip()
        next_has_leading_space = following != following.lstrip()
        if next_has_leading_space:
            text += " "
        converted.append(
            {
                "text": text,
                "start_ms": _to_ms(timing.start),
                "end_ms": _to_ms(timing.end),
            }
        )
    return converted
|