204 lines
7.6 KiB
Python
204 lines
7.6 KiB
Python
"""
|
|
transcribe.py — Whisper audio transcription → [{start, end, text}]

Supported inputs:
- faster-whisper (CPU) by default
- Ready-made .lrc and .srt files (timings are parsed from the file)
- Plain .txt (lines are aligned to the audio via a Whisper pass run with word timestamps enabled)
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Whisper транскрипция
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def transcribe_whisper(audio_path: str, model_size: str = "base",
                       language: Optional[str] = None,
                       device: str = "cpu") -> list[dict]:
    """Transcribe an audio file with faster-whisper.

    If ``faster_whisper`` cannot be imported, the function tries to install
    it with pip for the running interpreter and then retries the import.

    Args:
        audio_path: Path to the audio file handed to ``WhisperModel.transcribe``.
        model_size: Model name/size passed to ``WhisperModel``.
        language: Language code forwarded to the model, or ``None``.
        device: Device string passed to ``WhisperModel``.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per segment whose
        stripped text is non-empty. Times are rounded to two decimals.

    Raises:
        ImportError: If ``faster_whisper`` is still unavailable after the
            installation attempt.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError:
        import subprocess

        print("[transcribe] faster-whisper не установлен. Ставим...", flush=True)
        # Pass an argument list instead of a shell string so an interpreter
        # path containing spaces or shell metacharacters is handled correctly.
        # check=False: a failed install surfaces as ImportError just below.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "faster-whisper"],
            check=False,
        )
        from faster_whisper import WhisperModel

    model = WhisperModel(model_size, device=device, compute_type="int8")
    segments, _ = model.transcribe(audio_path, language=language,
                                   word_timestamps=True)

    results = []
    for seg in segments:
        text = seg.text.strip()
        if not text:
            # Skip segments that carry no text after stripping.
            continue
        results.append({
            "start": round(seg.start, 2),
            "end": round(seg.end, 2),
            "text": text,
        })
    return results
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Парсинг .lrc
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_RE_LRC = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2,3})\](.*)")


def parse_lrc(path: str) -> list[dict]:
    """Parse an .lrc lyrics file into timed lines.

    Each line may start with one or more ``[mm:ss.xx]`` tags; a line with
    several tags produces one entry per tag, all sharing the same text.
    Lines that do not start with such a tag, and tagged lines with no text,
    are ignored.

    Args:
        path: Path to the .lrc file (UTF-8, with or without a BOM).

    Returns:
        A list of ``{"start", "end", "text"}`` dicts sorted by ``start``.
        Each ``end`` is the next entry's ``start``; the final entry ends
        4 seconds after it starts. Times are rounded to two decimals.
    """
    entries = []
    # "utf-8-sig" reads plain UTF-8 too, but drops a leading BOM that would
    # otherwise keep the first line from matching the timestamp pattern.
    for raw in Path(path).read_text(encoding="utf-8-sig").splitlines():
        rest = raw.strip()
        stamps = []
        # Peel off every leading timestamp tag; what remains is the lyric text.
        while (m := _RE_LRC.match(rest)):
            mins, secs, frac, rest = m.groups()
            # Normalise hundredths ("50") and milliseconds ("500") to 3 digits.
            frac = frac.ljust(3, "0")[:3]
            stamps.append(int(mins) * 60 + int(secs) + int(frac) / 1000)
            rest = rest.strip()
        if stamps and rest:
            entries.extend(
                {"start": round(ts, 2), "end": None, "text": rest}
                for ts in stamps
            )

    # Repeated-line tags can put entries out of order; sort (stable) so that
    # "end = next start" never precedes the entry's own start.
    entries.sort(key=lambda entry: entry["start"])

    for current, following in zip(entries, entries[1:]):
        current["end"] = following["start"]
    if entries:
        # No following line to bound the last one: give it a fixed 4 s span.
        entries[-1]["end"] = round(entries[-1]["start"] + 4.0, 2)
    return entries
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Парсинг .srt
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_RE_SRT_TIME = re.compile(r"(\d{2}):(\d{2}):(\d{2}),(\d{3})")


def _srt_ts_to_sec(h, m, s, ms):
    """Convert SRT timestamp fields (hours, minutes, seconds, millis) to seconds."""
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000


def parse_srt(path: str) -> list[dict]:
    """Parse an .srt subtitle file into timed lines.

    The file is split into blocks on blank lines. In each block the first
    line is ignored, the second line must hold ``HH:MM:SS,mmm --> HH:MM:SS,mmm``
    and the remaining lines are joined with spaces to form the text.
    Blocks with fewer than three lines, with a malformed timing line, or
    with empty text are skipped.

    Args:
        path: Path to the .srt file (UTF-8).

    Returns:
        A list of ``{"start", "end", "text"}`` dicts in file order, with
        times in seconds rounded to two decimals.
    """
    results = []
    content = Path(path).read_text(encoding="utf-8").strip()
    for block in re.split(r"\n\s*\n", content):
        parts = block.strip().splitlines()
        if len(parts) < 3:
            continue

        # partition() never raises: a timing line without an arrow is
        # skipped like any other malformed block instead of crashing.
        start_raw, arrow, end_raw = parts[1].partition("-->")
        if not arrow:
            continue
        m_from = _RE_SRT_TIME.match(start_raw.strip())
        m_to = _RE_SRT_TIME.match(end_raw.strip())
        if not m_from or not m_to:
            continue

        text = " ".join(parts[2:]).strip()
        if text:
            results.append({
                "start": round(_srt_ts_to_sec(*m_from.groups()), 2),
                "end": round(_srt_ts_to_sec(*m_to.groups()), 2),
                "text": text,
            })
    return results
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# .txt forced alignment через whisper word-timestamps
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def align_txt(audio_path: str, txt_path: str,
              model_size: str = "base",
              device: str = "cpu") -> list[dict]:
    """Align plain-text lyrics to audio using Whisper segment timings.

    The non-empty lines of ``txt_path`` are handed out, in order, to the
    non-empty Whisper segments. Each segment receives a number of lines
    proportional to its share of the transcribed characters (at least one
    while lines remain); the last segment takes every line still left.
    Lines assigned to one segment split that segment's time span evenly.

    Args:
        audio_path: Path to the audio file to transcribe.
        txt_path: Path to the UTF-8 lyrics file, one lyric line per line.
        model_size: Model name/size passed to ``WhisperModel``.
        device: Device string passed to ``WhisperModel``.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per lyric line
        that could be placed, with times rounded to two decimals. Empty
        when the text file has no non-blank lines or Whisper produced no
        text.
    """
    lyric_lines = [
        stripped
        for stripped in (
            raw.strip()
            for raw in Path(txt_path).read_text(encoding="utf-8").splitlines()
        )
        if stripped
    ]
    total_lines = len(lyric_lines)
    if total_lines == 0:
        return []

    # Imported only once there is something to align, so the empty-lyrics
    # path does not need the model library at all.
    from faster_whisper import WhisperModel

    model = WhisperModel(model_size, device=device, compute_type="int8")
    segments, _ = model.transcribe(audio_path, word_timestamps=True)

    # `segments` is a one-shot generator: materialise it once. Iterating it
    # a second time yields nothing, which made the result always empty.
    # Segments without text are dropped here so that "last segment" below
    # really is the last one that can receive lines.
    spoken = [
        (seg.start, seg.end, text)
        for seg, text in ((seg, seg.text.strip()) for seg in segments)
        if text
    ]
    if not spoken:
        return []

    # Character length of the joined transcript; > 0 because every kept
    # segment has non-empty text.
    full_len = len(" ".join(text for _, _, text in spoken))

    results = []
    line_idx = 0
    last = len(spoken) - 1
    for i, (seg_start, seg_end, seg_text) in enumerate(spoken):
        remaining = total_lines - line_idx
        if remaining <= 0:
            break

        if i == last:
            # Last segment: take every line that is still unassigned.
            count = remaining
        else:
            # Share of lines proportional to the segment's share of characters.
            ratio = len(seg_text) / full_len
            count = min(max(1, round(total_lines * ratio)), remaining)

        span = seg_end - seg_start
        for j in range(count):
            results.append({
                "start": round(seg_start + j * span / count, 2),
                "end": round(seg_start + (j + 1) * span / count, 2),
                "text": lyric_lines[line_idx],
            })
            line_idx += 1

    return results
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Главная функция
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def transcribe(audio_path: str, text_path: Optional[str] = None,
               model_size: str = "base", device: str = "cpu",
               language: Optional[str] = None) -> list[dict]:
    """Produce timed lyric lines for an audio file.

    Without ``text_path`` the audio is transcribed with Whisper. With
    ``text_path`` the file extension (case-insensitive) selects the handler:
    ``.lrc`` and ``.srt`` are parsed directly, ``.txt`` is aligned against
    the audio.

    Args:
        audio_path: Path to the audio file.
        text_path: Optional path to a .lrc/.srt/.txt file.
        model_size: Whisper model size, used by the Whisper and .txt paths.
        device: Device string, used by the Whisper and .txt paths.
        language: Language code or ``None``; forwarded only on the
            Whisper-only path (it is not passed to the .txt alignment).

    Returns:
        A list of ``{"start", "end", "text"}`` dicts.

    Raises:
        SystemExit: With status 1 when ``text_path`` has an unsupported
            extension.
    """
    # No companion text file: plain Whisper transcription.
    if not text_path:
        print(f"[transcribe] Запускаем Whisper ({model_size}, {device})…")
        return transcribe_whisper(audio_path, model_size, language, device)

    suffix = Path(text_path).suffix.lower()
    if suffix == ".lrc":
        return parse_lrc(text_path)
    if suffix == ".srt":
        return parse_srt(text_path)
    if suffix == ".txt":
        return align_txt(audio_path, text_path, model_size, device)

    # Unknown extension: report it and terminate the process.
    print(f"[transcribe] Неподдерживаемый формат: {text_path}")
    sys.exit(1)
|