# wiki/tasks/karaoke/transcribe.py

"""
transcribe.py — Whisper транскрипция аудио → [{start, end, text}]

Поддержка:
  - faster-whisper (CPU) по умолчанию
  - Готовые файлы .lrc, .srt (парсинг таймингов)
  - Простой .txt (forced alignment через Whisper word-timestamps)
"""

import os
import re
import sys
from pathlib import Path
from typing import Optional

# ---------------------------------------------------------------------------
# Whisper транскрипция
# ---------------------------------------------------------------------------

def transcribe_whisper(audio_path: str, model_size: str = "base",
                       language: Optional[str] = None,
                       device: str = "cpu") -> list[dict]:
    """Transcribe an audio file with faster-whisper.

    If the ``faster_whisper`` package cannot be imported, one attempt is made
    to install it with pip for the running interpreter, and the import is
    retried.

    Args:
        audio_path: Path to the audio file passed to the model.
        model_size: Whisper model name handed to ``WhisperModel``.
        language: Language code, or ``None`` to pass no language hint.
        device: Device string handed to ``WhisperModel``.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per segment with
        non-blank text; times are rounded to two decimals.

    Raises:
        ImportError: If ``faster_whisper`` is still unavailable after the
            installation attempt.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError:
        import importlib
        import subprocess

        print("[transcribe] faster-whisper не установлен. Ставим...", flush=True)
        # Argument list instead of a shell string: an interpreter path that
        # contains spaces must not be split by the shell.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "faster-whisper"],
            check=False,
        )
        # Let the import system notice the package installed a moment ago.
        importlib.invalidate_caches()
        from faster_whisper import WhisperModel

    model = WhisperModel(model_size, device=device, compute_type="int8")
    segments, _ = model.transcribe(audio_path, language=language,
                                   word_timestamps=True)

    results = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            # Skip segments that carry no visible text.
            continue
        results.append({
            "start": round(seg.start, 2),
            "end": round(seg.end, 2),
            "text": text,
        })
    return results


# ---------------------------------------------------------------------------
# Парсинг .lrc
# ---------------------------------------------------------------------------

# One leading "[mm:ss.xx]" / "[mm:ss.xxx]" tag followed by the rest of the line.
_RE_LRC = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2,3})\](.*)")

def parse_lrc(path: str) -> list[dict]:
    """Parse an .lrc lyrics file into timed lines.

    Lines without a leading timestamp tag (for example metadata tags) and
    lines whose text is empty are ignored. A line carrying several leading
    timestamps (``[t1][t2]text``) yields one entry per timestamp.

    Args:
        path: Path to the .lrc file (UTF-8, with or without a BOM).

    Returns:
        A list of ``{"start", "end", "text"}`` dicts ordered by start time.
        Each entry ends where the next one starts; the last entry is given a
        fixed length of 4 seconds.
    """
    # "utf-8-sig" drops a leading BOM, which would otherwise stop the first
    # lyric line from matching the timestamp pattern.
    content = Path(path).read_text(encoding="utf-8-sig")

    lines = []
    for raw in content.splitlines():
        rest = raw.strip()
        stamps = []
        # Peel off every leading timestamp tag; what remains is the lyric.
        while True:
            m = _RE_LRC.match(rest)
            if not m:
                break
            mins, secs, frac, rest = m.groups()
            # Normalise hundredths/thousandths to exactly three digits.
            frac = frac.ljust(3, "0")[:3]
            stamps.append(int(mins) * 60 + int(secs) + int(frac) / 1000)
            rest = rest.lstrip()
        text = rest.strip()
        if not stamps or not text:
            continue
        for timestamp in stamps:
            lines.append({"start": round(timestamp, 2),
                          "end": None,  # filled in below
                          "text": text})

    # Repeated-line timestamps may be out of order; the "end = next start"
    # rule below needs chronological order. The sort is stable.
    lines.sort(key=lambda item: item["start"])

    for current, following in zip(lines, lines[1:]):
        current["end"] = following["start"]
    if lines:
        # No following line to borrow an end from: assume 4 seconds.
        lines[-1]["end"] = round(lines[-1]["start"] + 4.0, 2)
    return lines


# ---------------------------------------------------------------------------
# Парсинг .srt
# ---------------------------------------------------------------------------

# "HH:MM:SS,mmm"; a "." before the milliseconds is accepted as well.
_RE_SRT_TIME = re.compile(r"(\d{2}):(\d{2}):(\d{2})[,.](\d{3})")

def _srt_ts_to_sec(h: str, m: str, s: str, ms: str) -> float:
    """Convert the captured hour/minute/second/millisecond fields to seconds."""
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000


def parse_srt(path: str) -> list[dict]:
    """Parse an .srt subtitle file into timed lines.

    Cues are separated by blank lines. The timing line is looked for in the
    first two lines of a cue, so cues without an index number are accepted.
    Cues with no recognisable timing line or with empty text are skipped.

    Args:
        path: Path to the .srt file (UTF-8, with or without a BOM).

    Returns:
        A list of ``{"start", "end", "text"}`` dicts in file order; the text
        lines of a cue are joined with single spaces and times are rounded to
        two decimals.
    """
    content = Path(path).read_text(encoding="utf-8-sig").strip()

    results = []
    for block in re.split(r"\n\s*\n", content):
        parts = block.strip().splitlines()

        # The timing line is normally the second line (after the index), but
        # may be the first when the index is missing.
        timing_idx = next(
            (i for i, line in enumerate(parts[:2]) if "-->" in line), None)
        if timing_idx is None:
            continue

        left, _, right = parts[timing_idx].partition("-->")
        m_from = _RE_SRT_TIME.match(left.strip())
        m_to = _RE_SRT_TIME.match(right.strip())
        if not m_from or not m_to:
            continue

        text = " ".join(parts[timing_idx + 1:]).strip()
        if not text:
            continue
        results.append({"start": round(_srt_ts_to_sec(*m_from.groups()), 2),
                        "end": round(_srt_ts_to_sec(*m_to.groups()), 2),
                        "text": text})
    return results


# ---------------------------------------------------------------------------
# .txt forced alignment через whisper word-timestamps
# ---------------------------------------------------------------------------

def align_txt(audio_path: str, txt_path: str,
              model_size: str = "base",
              device: str = "cpu") -> list[dict]:
    """Attach timings to plain-text lyrics using Whisper segment boundaries.

    The audio is transcribed, and the non-blank lines of the text file are
    handed out to the non-empty Whisper segments in order: each segment
    receives a number of lines proportional to its share of the transcribed
    characters (at least one while lines remain), and the last segment
    receives whatever is left. Lines assigned to one segment split its time
    span evenly. The recognised text itself is not compared with the lyrics.

    Args:
        audio_path: Path to the audio file passed to the model.
        txt_path: Path to the lyrics file (UTF-8, with or without a BOM).
        model_size: Whisper model name handed to ``WhisperModel``.
        device: Device string handed to ``WhisperModel``.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per lyric line, or
        an empty list when there are no lyric lines or no non-empty segments.
    """
    from faster_whisper import WhisperModel

    raw_lines = Path(txt_path).read_text(encoding="utf-8-sig").splitlines()
    txt_lines = [line.strip() for line in raw_lines if line.strip()]
    total_lines = len(txt_lines)
    if total_lines == 0:
        return []

    model = WhisperModel(model_size, device=device, compute_type="int8")
    segments, _ = model.transcribe(audio_path, word_timestamps=True)

    # `segments` may be a one-shot iterator, so walk it exactly once and keep
    # only what is needed; blank segments are dropped here so that "the last
    # segment" below always refers to a segment that can receive lines.
    spoken = []
    for seg in segments:
        seg_text = seg.text.strip()
        if seg_text:
            spoken.append((seg.start, seg.end, seg_text))
    if not spoken:
        return []

    # Length of the space-joined transcript; never zero because every kept
    # segment has non-empty text.
    total_chars = len(" ".join(seg_text for _, _, seg_text in spoken))

    results = []
    line_idx = 0
    last_idx = len(spoken) - 1
    for i, (seg_start, seg_end, seg_text) in enumerate(spoken):
        remaining = total_lines - line_idx
        if remaining <= 0:
            break

        if i == last_idx:
            # Last segment takes every line that is still unassigned.
            count = remaining
        else:
            # Share of lines proportional to the segment's character count.
            ratio = len(seg_text) / total_chars
            count = min(max(1, round(total_lines * ratio)), remaining)

        duration = seg_end - seg_start
        for j in range(count):
            results.append({
                "start": round(seg_start + j * duration / count, 2),
                "end": round(seg_start + (j + 1) * duration / count, 2),
                "text": txt_lines[line_idx],
            })
            line_idx += 1

    return results


# ---------------------------------------------------------------------------
# Главная функция
# ---------------------------------------------------------------------------

def transcribe(audio_path: str, text_path: Optional[str] = None,
               model_size: str = "base", device: str = "cpu",
               language: Optional[str] = None) -> list[dict]:
    """Produce timed lyric lines for an audio file.

    Without ``text_path`` the audio is transcribed with Whisper. With
    ``text_path`` the file extension selects the handler: ``.lrc`` and
    ``.srt`` are parsed for their timings, ``.txt`` is aligned against the
    audio. Any other extension prints a message and exits with status 1.

    Args:
        audio_path: Path to the audio file.
        text_path: Optional path to an .lrc/.srt/.txt file.
        model_size: Whisper model size (tiny, base, small, medium, large).
        device: "cpu" or "cuda".
        language: Language code (e.g. "ru", "en") or ``None`` for
            auto-detection; only forwarded on the pure Whisper path.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts.
    """
    # No lyrics file supplied: plain Whisper transcription.
    if not text_path:
        print(f"[transcribe] Запускаем Whisper ({model_size}, {device})…")
        return transcribe_whisper(audio_path, model_size, language, device)

    suffix = Path(text_path).suffix.lower()
    if suffix == ".lrc":
        return parse_lrc(text_path)
    if suffix == ".srt":
        return parse_srt(text_path)
    if suffix == ".txt":
        return align_txt(audio_path, text_path, model_size, device)

    # Unknown extension: report and terminate the process.
    print(f"[transcribe] Неподдерживаемый формат: {text_path}")
    sys.exit(1)