Files
txtlyric-to-lrc/txtlyric_to_lrc/output.py
T

432 lines
13 KiB
Python
Raw Normal View History

"""Output writers, LRC parsing, and metadata extraction for txtlyric-to-lrc.
Writers:
- :func:`result_to_lrc` -- emits classic LRC, inserting a clear-display
marker whenever the gap between consecutive segments exceeds a threshold
so that lines do not visually persist through instrumental pauses.
- :func:`result_to_lyricsfile` -- emits the YAML-based Lyricsfile format
used by lrcget/lrclib, with word-level timings.
LRC input:
- :func:`parse_lrc_file` -- parses an LRC file into metadata tags and
timed segments (with ``start``/``end``/``text``) suitable for
``stable_whisper.align_words``.
"""
from __future__ import annotations
import os
import re
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
import mutagen
import stable_whisper
import yaml
# Extensions (lower-case, without the leading dot) treated as song files.
song_file_extensions = {
    "mp3",
    "wav",
    "mp4",
    "mpeg",
    "mpga",
    "m4a",
    "webm",
    "flac",
    "opus",
}


def get_file_extension(filename: str) -> str:
    """Return the extension of ``filename`` without the leading dot.

    The original letter case is preserved.  An empty string is returned
    when the name has no extension.
    """
    return os.path.splitext(filename)[1][1:]


def file_is_audio(filename: str) -> bool:
    """Return ``True`` when ``filename`` has a recognised audio extension.

    The comparison is case-insensitive so that files such as ``Track.MP3``
    or ``song.Flac`` are accepted alongside their lower-case spellings.
    """
    # The set only holds lower-case entries, so normalise before the lookup.
    return get_file_extension(filename).lower() in song_file_extensions
# ---------------------------------------------------------------------------
# Metadata
# ---------------------------------------------------------------------------
@dataclass
class TrackMetadata:
    """Track-level metadata carried alongside the synced lyrics.

    ``title`` and ``artist`` are mandatory; every other field is optional
    and defaults to ``None``.
    """

    # Required fields.
    title: str
    artist: str

    # Optional fields; ``None`` means "not known".
    album: Optional[str] = None
    duration_ms: Optional[int] = None
    language: Optional[str] = None
    offset_ms: Optional[int] = None
def extract_metadata(audio_path: str, language: Optional[str] = None) -> TrackMetadata:
    """Build a :class:`TrackMetadata` from the tags of an audio file.

    The file is opened with ``mutagen.File(..., easy=True)``.  Title and
    artist fall back to the placeholders ``"Unknown"`` and
    ``"Unknown Artist"`` when the tag is absent, because Lyricsfile requires
    ``metadata.title`` and ``metadata.artist``.  ``language`` is passed
    through unchanged.
    """
    audio = mutagen.File(audio_path, easy=True)

    title = _first_tag(audio, "title") or "Unknown"
    artist = _first_tag(audio, "artist") or "Unknown Artist"
    album = _first_tag(audio, "album")
    duration = _duration_ms(audio)

    return TrackMetadata(
        title=title,
        artist=artist,
        album=album,
        duration_ms=duration,
        language=language,
    )
def _first_tag(f, key: str) -> Optional[str]:
if f is None:
return None
val = f.get(key)
if not val:
return None
if isinstance(val, list):
val = val[0] if val else None
if val is None:
return None
val = str(val).strip()
return val or None
def _duration_ms(f) -> Optional[int]:
info = getattr(f, "info", None) if f is not None else None
length = getattr(info, "length", None) if info is not None else None
if length is None:
return None
return int(round(length * 1000))
# ---------------------------------------------------------------------------
# LRC parsing
# ---------------------------------------------------------------------------
_LRC_METADATA_KEYS_TO_LYRICSFILE = {
"ti": "title",
"ar": "artist",
"al": "album",
"la": "language",
"lang": "language",
"language": "language",
}
_TIMESTAMP_RE = re.compile(r"^(\d{1,3}):(\d{2})(?:[.:](\d{2,3}))?$")
_LINE_RE = re.compile(r"^\[([^\]]+)\](.*)$")
@dataclass
class ParsedLrc:
    """Everything extracted from one LRC document."""

    # Timed lyric lines.
    segments: List[dict] = field(default_factory=list)
    # Recognised LRC tags, keyed by their Lyricsfile-compatible name.
    metadata_tags: Dict[str, str] = field(default_factory=dict)
    # LRC tags with no Lyricsfile equivalent, keyed by the lower-cased tag.
    dropped_tags: Dict[str, str] = field(default_factory=dict)
def _parse_lrc_timestamp(token: str) -> Optional[float]:
    """Convert an LRC timestamp token to seconds.

    Returns ``None`` when ``token`` (after stripping whitespace) does not
    match ``_TIMESTAMP_RE``.
    """
    match = _TIMESTAMP_RE.match(token.strip())
    if match is None:
        return None

    minutes_raw, seconds_raw, fraction_raw = match.groups()
    minutes = int(minutes_raw)
    seconds = int(seconds_raw)

    # Right-padding to three digits turns a 2-digit (centisecond) fraction
    # into milliseconds, leaves a 3-digit one untouched, and maps a missing
    # fraction to zero.
    frac_ms = int((fraction_raw or "0").ljust(3, "0"))

    return minutes * 60 + seconds + frac_ms / 1000.0
def _parse_lrc_length(value: str) -> Optional[int]:
"""Parse an LRC [length:...] value to milliseconds."""
value = value.strip()
m = re.match(r"^(\d+):(\d{2})(?:[.:](\d{2,3}))?$", value)
if not m:
return None
minutes = int(m.group(1))
seconds = int(m.group(2))
frac_raw = m.group(3) or "0"
if len(frac_raw) == 2:
frac_ms = int(frac_raw) * 10
else:
frac_ms = int(frac_raw)
return (minutes * 60 + seconds) * 1000 + frac_ms
def _parse_lrc_offset(value: str) -> Optional[int]:
"""Parse an LRC [offset:...] value to milliseconds (integer, may be negative)."""
value = value.strip()
m = re.match(r"^([+-]?\d+)$", value)
if not m:
return None
return int(m.group(1))
def parse_lrc_file(content: str) -> ParsedLrc:
    """Parse LRC content into timed segments and metadata.

    Returns a :class:`ParsedLrc` containing:

    - ``segments``: list of ``{"start": float, "end": float, "text": str}``
      suitable for ``model.align_words()``. Gap markers (empty-text
      timestamps) contribute an ``end`` to the preceding segment but do not
      appear as segments themselves.
    - ``metadata_tags``: dict of recognised LRC metadata mapped to
      Lyricsfile-compatible keys.
    - ``dropped_tags``: dict of LRC metadata tags that have no Lyricsfile
      equivalent (warned about by the caller).

    A line may start with several timestamps
    (``[00:12.00][01:05.00]text``); each timestamp yields its own timed
    entry sharing the same text.  When several gap markers follow one
    segment, the earliest one defines that segment's ``end``.

    If the LRC contains an ``[offset:...]`` tag, the offset (milliseconds)
    is added to all parsed timestamps, clamped at zero, so that the
    returned times are absolute.
    """
    result = ParsedLrc()
    raw_timed: List[Tuple[float, str]] = []

    # A leading byte-order mark is not whitespace, so ``strip()`` would leave
    # it in place and the first line would fail to match ``_LINE_RE``.
    for raw_line in content.lstrip("\ufeff").splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        m = _LINE_RE.match(raw_line)
        if not m:
            continue
        bracket_content = m.group(1)
        after_bracket = m.group(2)

        ts = _parse_lrc_timestamp(bracket_content)
        if ts is not None:
            # Collect any further leading timestamps on the same line.  Stop
            # at the first bracket that is not a timestamp (for example a
            # lyric that literally starts with "[Chorus]").
            stamps = [ts]
            rest = after_bracket
            while True:
                extra = _LINE_RE.match(rest.lstrip())
                if not extra:
                    break
                extra_ts = _parse_lrc_timestamp(extra.group(1))
                if extra_ts is None:
                    break
                stamps.append(extra_ts)
                rest = extra.group(2)
            text = rest.strip()
            raw_timed.extend((stamp, text) for stamp in stamps)
            continue

        # Metadata tag: [key:value]
        if ":" in bracket_content:
            key, _, value = bracket_content.partition(":")
            key = key.strip().lower()
            value = value.strip()
            if not value:
                continue
            lyricsfile_key = _LRC_METADATA_KEYS_TO_LYRICSFILE.get(key)
            if lyricsfile_key is not None:
                result.metadata_tags[lyricsfile_key] = value
            elif key == "length":
                length_ms = _parse_lrc_length(value)
                if length_ms is not None:
                    result.metadata_tags["duration_ms"] = str(length_ms)
                else:
                    result.dropped_tags[key] = value
            elif key == "offset":
                offset_ms = _parse_lrc_offset(value)
                if offset_ms is not None:
                    result.metadata_tags["offset_ms"] = str(offset_ms)
                else:
                    result.dropped_tags[key] = value
            else:
                result.dropped_tags[key] = value

    if not raw_timed:
        return result

    # Stable sort: entries sharing a timestamp keep their file order.
    raw_timed.sort(key=lambda t: t[0])

    # Apply offset: shift all timestamps so output is absolute.
    # NOTE(review): the offset is *added* here.  Some LRC players treat a
    # positive offset as "show lyrics earlier" (i.e. subtract it); confirm
    # the sign matches the files this tool consumes.
    offset_s = 0.0
    if "offset_ms" in result.metadata_tags:
        offset_s = int(result.metadata_tags["offset_ms"]) / 1000.0
    adjusted: List[Tuple[float, str]] = [
        (max(0.0, ts + offset_s), text) for ts, text in raw_timed
    ]

    # Build segments: non-empty text lines become segments; empty-text
    # lines (gap markers) contribute an end time to the preceding segment.
    pending_segments: List[dict] = []
    for ts, text in adjusted:
        if text:
            pending_segments.append({"start": ts, "end": None, "text": text})
        elif pending_segments and pending_segments[-1]["end"] is None:
            # Only the first gap marker after a segment closes it; later
            # markers must not push the end further into the pause.
            pending_segments[-1]["end"] = ts

    # Fill in missing end times: end of seg N = start of seg N+1
    for i in range(len(pending_segments) - 1):
        if pending_segments[i]["end"] is None:
            pending_segments[i]["end"] = pending_segments[i + 1]["start"]

    # Last segment: if no explicit end (no trailing gap marker), use
    # start + 5s as a reasonable upper bound; align_words will confine
    # within whatever audio is available.
    if pending_segments and pending_segments[-1]["end"] is None:
        pending_segments[-1]["end"] = pending_segments[-1]["start"] + 5.0

    result.segments = pending_segments
    return result
def merge_metadata(
    audio_path: str,
    lrc_parsed: ParsedLrc,
    cli_language: Optional[str] = None,
) -> TrackMetadata:
    """Combine LRC tags with the audio file's own tags.

    For title, artist, album and duration the LRC value wins when present;
    otherwise the value read from the audio file (including its
    placeholders) is used.  For language the order is ``cli_language``,
    then the LRC tag, then the audio metadata.

    Every entry in ``lrc_parsed.dropped_tags`` is reported on stderr as a
    tag that has no Lyricsfile equivalent.
    """
    from_audio = extract_metadata(audio_path, language=cli_language)
    lrc_tags = lrc_parsed.metadata_tags

    # Start from the audio duration and override it only when the LRC value
    # is present and parses as an integer.
    duration_ms = from_audio.duration_ms
    if "duration_ms" in lrc_tags:
        try:
            duration_ms = int(lrc_tags["duration_ms"])
        except ValueError:
            pass

    for tag_name, tag_value in lrc_parsed.dropped_tags.items():
        print(
            f" Warning: LRC tag [{tag_name}:{tag_value}] has no Lyricsfile equivalent; skipped",
            file=sys.stderr,
        )

    return TrackMetadata(
        title=lrc_tags.get("title") or from_audio.title,
        artist=lrc_tags.get("artist") or from_audio.artist,
        album=lrc_tags.get("album") or from_audio.album,
        duration_ms=duration_ms,
        language=cli_language or lrc_tags.get("language") or from_audio.language,
    )
def _format_lrc_timestamp(seconds: float) -> str:
if seconds < 0:
seconds = 0.0
minutes = int(seconds // 60)
remainder = seconds - minutes * 60
return f"{minutes:02d}:{remainder:05.2f}"
def result_to_lrc(
    result: "stable_whisper.WhisperResult",
    gap_threshold: float = 1.5,
) -> str:
    """Serialise ``result.segments`` as LRC text.

    Each segment produces one ``[timestamp]text`` line.  An empty
    ``[timestamp]`` line at the segment's end is added after the final
    segment, and after any segment followed by a pause longer than
    ``gap_threshold`` seconds, so players clear the line during the gap.
    Returns an empty string when there are no segments.
    """
    segs = list(result.segments)
    if not segs:
        return ""

    last_index = len(segs) - 1
    out: List[str] = []
    for idx, current in enumerate(segs):
        lyric = (current.text or "").strip()
        out.append(f"[{_format_lrc_timestamp(current.start)}]{lyric}")

        # Clear the display after the final line or before a long pause.
        is_last = idx == last_index
        if is_last or segs[idx + 1].start - current.end > gap_threshold:
            out.append(f"[{_format_lrc_timestamp(current.end)}]")

    return "\n".join(out) + "\n"
def result_to_lyricsfile(
    result: "stable_whisper.WhisperResult",
    metadata: TrackMetadata,
) -> str:
    """Serialise ``result`` and ``metadata`` as a Lyricsfile YAML document.

    The document has ``version``, ``metadata`` and ``lines`` keys.  No
    ``plain`` block is written; each line carries its text and millisecond
    timings, plus a ``words`` list when the segment has word timings.
    """

    def _render_line(seg) -> dict:
        # One entry of the ``lines`` array.
        entry = {
            "text": (seg.text or "").strip(),
            "start_ms": _to_ms(seg.start),
            "end_ms": _to_ms(seg.end),
        }
        words = _words_to_lyricsfile_words(seg.words) if seg.words else None
        if words:
            entry["words"] = words
        return entry

    meta_block = {
        "title": metadata.title,
        "artist": metadata.artist,
    }
    # Album and language are skipped when empty; duration and offset are
    # skipped only when ``None`` so that a value of 0 is still written.
    if metadata.album:
        meta_block["album"] = metadata.album
    if metadata.duration_ms is not None:
        meta_block["duration_ms"] = metadata.duration_ms
    if metadata.language:
        meta_block["language"] = metadata.language
    if metadata.offset_ms is not None:
        meta_block["offset_ms"] = metadata.offset_ms
    meta_block["instrumental"] = False

    rendered_lines = [_render_line(seg) for seg in result.segments]

    return yaml.safe_dump(
        {
            "version": "1.0",
            "metadata": meta_block,
            "lines": rendered_lines,
        },
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    )
def _to_ms(seconds: float) -> int:
return int(round(seconds * 1000))
def _words_to_lyricsfile_words(word_timings) -> List[dict]:
    """Convert word-timing objects into Lyricsfile word dictionaries.

    Each input word may carry leading whitespace (e.g. ``" club"``).  The
    output moves that spacing to the end of the *previous* word: word ``i``
    gets a single trailing space when word ``i+1`` started with whitespace.
    Words without leading whitespace (as in text split without spaces)
    receive no injected spacing.  The final word never gets a trailing
    space added.
    """
    raw_words = [(timing.word or "") for timing in word_timings]
    stripped = [raw.lstrip() for raw in raw_words]
    # True where the original word began with whitespace.
    starts_with_space = [
        len(raw) != len(body) for raw, body in zip(raw_words, stripped)
    ]

    last_index = len(word_timings) - 1
    converted: List[dict] = []
    for idx, timing in enumerate(word_timings):
        needs_space = idx < last_index and starts_with_space[idx + 1]
        converted.append(
            {
                "text": stripped[idx] + (" " if needs_space else ""),
                "start_ms": _to_ms(timing.start),
                "end_ms": _to_ms(timing.end),
            }
        )
    return converted