Files
wiki/tasks/snowbike-rag/scripts/reindex_safe.py
2026-04-12 21:55:33 +03:00

152 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Безопасная переиндексация ChromaDB с малыми батчами и GC.
Запускается в фоне не мешая Flask-сервису.
"""
import gc
import json
import os
import sys
import time
from pathlib import Path
# Everything this script reads or writes lives in the "data" directory that
# sits next to the directory containing this file.
_DATA_DIR = Path(__file__).parent.parent / "data"

# Filesystem locations. CHROMA_PATH is kept as a plain string; it is handed
# to chromadb.PersistentClient(path=...) in main().
CHROMA_PATH = str(_DATA_DIR / "chromadb")
DATA_FILE = _DATA_DIR / "messages.jsonl"
LOG_FILE = _DATA_DIR / "logs" / "reindex_safe.log"

# Target collection and the sentence-transformers model used for embeddings.
COLLECTION_NAME = "snowbike_embeddings"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Throttling / sizing knobs for the indexing loop.
BATCH_SIZE = 8  # Smaller batch to save memory
SLEEP_BETWEEN_BATCHES = 0.3  # Seconds to sleep between batches (reduce CPU pressure)
MAX_TEXT_LEN = 512  # Message text is cut to this many characters before encoding
def log(msg):
    """Print a timestamped message and append the same line to ``LOG_FILE``.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] <msg>``. It is written to
    stdout (flushed immediately) and then appended to the log file.

    Args:
        msg: Text to log; it is interpolated into the line with an f-string.

    Raises:
        OSError: If the log directory cannot be created or the file cannot
            be opened for appending.
    """
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    # The logs directory may not exist yet (e.g. on a fresh checkout); without
    # this the very first log call would fail with FileNotFoundError.
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Messages contain Cyrillic text, so pin UTF-8 instead of relying on the
    # platform's locale-dependent default encoding.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")
def load_messages(path=None):
    """Load indexable messages from a JSONL file.

    Every non-blank line is parsed as one JSON object. A message is kept only
    when its ``text`` value has length 5 or more; messages whose ``text`` is
    missing or null are dropped.

    Args:
        path: File to read. When omitted, ``DATA_FILE`` is used.

    Returns:
        list: The parsed message dicts that passed the filter, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    source = DATA_FILE if path is None else path
    messages = []
    with open(source, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            msg = json.loads(line)
            # `"text": null` makes .get() return None, and len(None) would
            # raise TypeError; treat a null text the same as a missing one.
            text = msg.get("text") or ""
            if len(text) >= 5:
                messages.append(msg)
    return messages
def _month_from_date(date_str):
    """Return the month parsed from characters 5-6 of *date_str*, or 0."""
    try:
        return int(date_str[5:7]) if len(date_str) >= 7 else 0
    except (ValueError, IndexError):
        return 0


def _build_metadata(m):
    """Build the metadata dict stored in the collection for one message."""
    # `"date": null` (or a non-string date) would break len()/slicing with a
    # TypeError that nothing catches; normalise to a string first.
    date_str = str(m.get("date") or "")
    return {
        "topic_id": m["topic_id"],
        "topic_title": m["topic_title"],
        "date": date_str,
        "from_id": str(m.get("from_id", "")),
        "month": _month_from_date(date_str),
    }


def main():
    """Embed messages that are not yet in the collection and add them.

    Steps:
      1. Load messages with ``load_messages()`` and the ``MODEL_NAME`` encoder.
      2. Open the ``COLLECTION_NAME`` collection in the persistent ChromaDB
         store at ``CHROMA_PATH`` (creating it when the lookup fails) and
         drop messages whose ``str(id)`` is already stored.
      3. Encode and add the remaining messages in batches of ``BATCH_SIZE``,
         sleeping ``SLEEP_BETWEEN_BATCHES`` seconds after each successful
         batch and forcing a GC pass every 100 batches.

    A failed batch is logged and skipped after a 1-second pause; once more
    than 10 batches have failed the loop stops. When the loop ends, a summary
    with the elapsed time, the collection size and the error count is logged.
    """
    log("=== Безопасная переиндексация ChromaDB ===")
    # Third-party imports are done here rather than at module import time.
    import chromadb
    from sentence_transformers import SentenceTransformer

    log(f"Загружаем сообщения из {DATA_FILE}...")
    messages = load_messages()
    log(f"Загружено {len(messages)} сообщений")

    log(f"Загружаем модель {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)
    log("Модель загружена")

    log(f"Подключаемся к ChromaDB: {CHROMA_PATH}")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        collection = client.get_collection(COLLECTION_NAME)
        existing_count = collection.count()
        log(f"Коллекция существует, {existing_count} документов")
    except Exception:
        # NOTE(review): any failure here is treated as "collection missing".
        collection = client.create_collection(
            name=COLLECTION_NAME,
            metadata={"description": "Snowbike Russia Telegram messages embeddings"},
        )
        existing_count = 0
        log("Коллекция создана")

    # Skip messages whose IDs are already indexed.
    if existing_count > 0:
        log(f"Получаем {existing_count} существующих ID...")
        existing_ids = set(collection.get(include=[])["ids"])
        messages = [m for m in messages if str(m["id"]) not in existing_ids]
        log(f"Осталось проиндексировать: {len(messages)} сообщений")

    if not messages:
        log("Всё уже проиндексировано!")
        return

    total = len(messages)
    indexed = 0
    errors = 0
    # Progress is reported each time `indexed` crosses a multiple of this
    # value, so the cadence does not depend on BATCH_SIZE dividing it evenly.
    progress_every = 1000
    next_report = progress_every
    start_time = time.time()
    log(f"Начинаем индексацию {total} сообщений (батч={BATCH_SIZE})...")

    for i in range(0, total, BATCH_SIZE):
        batch = messages[i:i + BATCH_SIZE]
        texts = [m["text"][:MAX_TEXT_LEN] for m in batch]
        ids = [str(m["id"]) for m in batch]
        metadatas = [_build_metadata(m) for m in batch]

        try:
            embeddings = model.encode(texts, show_progress_bar=False).tolist()
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=texts,
                metadatas=metadatas,
            )
        except Exception as e:
            errors += 1
            log(f"ОШИБКА батча {i}: {e}")
            if errors > 10:
                log("Слишком много ошибок, останавливаемся")
                break
            time.sleep(1)
            continue

        indexed += len(batch)
        # Also report after the final batch, even if earlier batches failed
        # and `indexed` therefore never reaches `total`.
        is_last_batch = i + BATCH_SIZE >= total
        if indexed >= next_report or is_last_batch:
            elapsed = time.time() - start_time
            speed = indexed / elapsed if elapsed > 0 else 0
            eta = (total - indexed) / speed if speed > 0 else 0
            progress = (indexed / total) * 100
            log(f"Прогресс: {indexed}/{total} ({progress:.1f}%) | {speed:.0f} msg/s | ETA: {eta:.0f}s")
            next_report = (indexed // progress_every + 1) * progress_every

        time.sleep(SLEEP_BETWEEN_BATCHES)

        # Force a GC pass every 100 batches (the first batch included).
        if (i // BATCH_SIZE) % 100 == 0:
            gc.collect()

    elapsed = time.time() - start_time
    final_count = collection.count()
    log(f"=== Индексация завершена за {elapsed:.0f}с ===")
    log(f"Итого в коллекции: {final_count} документов")
    log(f"Ошибок: {errors}")
# Script entry point: run the reindex only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()