#!/usr/bin/env python3
"""
Safe re-indexing of ChromaDB using small batches and periodic GC.

Intended to run in the background without disturbing the Flask service.
The run is resumable: ids already present in the collection are skipped.
"""

import gc
import json
import os
import sys
import time
from pathlib import Path

CHROMA_PATH = str(Path(__file__).parent.parent / "data" / "chromadb")
DATA_FILE = Path(__file__).parent.parent / "data" / "messages.jsonl"
COLLECTION_NAME = "snowbike_embeddings"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

BATCH_SIZE = 8  # Smaller batch to save memory
SLEEP_BETWEEN_BATCHES = 0.3  # Seconds to sleep between batches (reduce CPU pressure)
MAX_TEXT_LEN = 512
MIN_TEXT_LEN = 5  # Messages with shorter text are not indexed
PROGRESS_EVERY = 1000  # Log a progress line each time this many messages are indexed
GC_EVERY_BATCHES = 100  # Force a garbage collection every N batches
MAX_ERRORS = 10  # Abort once more than this many batches have failed

LOG_FILE = Path(__file__).parent.parent / "data" / "logs" / "reindex_safe.log"


def log(msg: str) -> None:
    """Print a timestamped line to stdout and append it to LOG_FILE.

    The log directory is created on demand, and the file is written as UTF-8
    because the messages contain Cyrillic text that a non-UTF-8 default
    locale encoding could fail to encode.
    """
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")


def load_messages(path=None) -> list:
    """Load indexable messages from a JSON Lines file.

    Args:
        path: File to read; defaults to DATA_FILE when omitted.

    Returns:
        The decoded JSON objects, in file order, whose "text" value is a
        string of at least MIN_TEXT_LEN characters. Blank lines, non-object
        lines and records whose text is missing, null or not a string are
        skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    source = DATA_FILE if path is None else path
    messages = []
    with open(source, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            msg = json.loads(line)
            if not isinstance(msg, dict):
                continue
            text = msg.get("text")
            # "text" may be present but null (or a non-string value); len()
            # on None would raise and a non-string cannot be embedded.
            if isinstance(text, str) and len(text) >= MIN_TEXT_LEN:
                messages.append(msg)
    return messages


def _parse_month(date_str) -> int:
    """Return int(date_str[5:7]) for a 'YYYY-MM...' string, or 0 if unavailable."""
    if not isinstance(date_str, str) or len(date_str) < 7:
        return 0
    try:
        return int(date_str[5:7])
    except ValueError:
        return 0


def _build_metadata(msg: dict) -> dict:
    """Build the metadata record stored alongside one message's embedding.

    Raises KeyError when "topic_id" or "topic_title" is absent; the caller
    treats that as a failed batch.
    """
    date_str = msg.get("date") or ""  # a null date is stored as ""
    return {
        "topic_id": msg["topic_id"],
        "topic_title": msg["topic_title"],
        "date": date_str,
        "from_id": str(msg.get("from_id", "")),
        "month": _parse_month(date_str),
    }


def _select_pending(messages: list, existing_ids) -> list:
    """Return messages whose id is neither already indexed nor repeated in the input.

    Only the first occurrence of each id is kept, so that a repeated id
    cannot end up twice in one batch and make the whole batch fail.
    """
    seen = set(existing_ids)
    pending = []
    for msg in messages:
        msg_id = str(msg["id"])
        if msg_id in seen:
            continue
        seen.add(msg_id)
        pending.append(msg)
    return pending


def main():
    """Embed every not-yet-indexed message and add it to the ChromaDB collection.

    Work is done in batches of BATCH_SIZE with a pause between batches.
    A failing batch is logged and skipped; the loop stops once more than
    MAX_ERRORS batches have failed. Skipped messages are picked up by the
    next run because only ids present in the collection are excluded.
    """
    log("=== Безопасная переиндексация ChromaDB ===")

    # Imported here rather than at module level, so importing this module
    # (e.g. to reuse load_messages) does not require these packages.
    import chromadb
    from sentence_transformers import SentenceTransformer

    log(f"Загружаем сообщения из {DATA_FILE}...")
    messages = load_messages()
    log(f"Загружено {len(messages)} сообщений")

    log(f"Загружаем модель {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)
    log("Модель загружена")

    log(f"Подключаемся к ChromaDB: {CHROMA_PATH}")
    client = chromadb.PersistentClient(path=CHROMA_PATH)

    # NOTE(review): the exception type raised for a missing collection
    # presumably differs between chromadb versions, hence the broad catch —
    # confirm against the installed version.
    try:
        collection = client.get_collection(COLLECTION_NAME)
        existing_count = collection.count()
        log(f"Коллекция существует, {existing_count} документов")
    except Exception:
        collection = client.create_collection(
            name=COLLECTION_NAME,
            metadata={"description": "Snowbike Russia Telegram messages embeddings"},
        )
        existing_count = 0
        log("Коллекция создана")

    # Fetch the ids that are already indexed (ids only, no payload).
    existing_ids = set()
    if existing_count > 0:
        log(f"Получаем {existing_count} существующих ID...")
        existing_ids = set(collection.get(include=[])["ids"])

    messages = _select_pending(messages, existing_ids)
    if existing_count > 0:
        log(f"Осталось проиндексировать: {len(messages)} сообщений")

    if not messages:
        log("Всё уже проиндексировано!")
        return

    total = len(messages)
    indexed = 0
    start_time = time.time()
    errors = 0

    log(f"Начинаем индексацию {total} сообщений (батч={BATCH_SIZE})...")

    for batch_no, start in enumerate(range(0, total, BATCH_SIZE)):
        batch = messages[start:start + BATCH_SIZE]

        try:
            # Batch preparation is inside the try so that one malformed
            # record costs a single batch instead of aborting the run.
            texts = [m["text"][:MAX_TEXT_LEN] for m in batch]
            ids = [str(m["id"]) for m in batch]
            metadatas = [_build_metadata(m) for m in batch]
            embeddings = model.encode(texts, show_progress_bar=False).tolist()
            collection.add(
                ids=ids,
                embeddings=embeddings,
                documents=texts,
                metadatas=metadatas,
            )
        except Exception as e:
            errors += 1
            log(f"ОШИБКА батча {start}: {e}")
            if errors > MAX_ERRORS:
                log("Слишком много ошибок, останавливаемся")
                break
            time.sleep(1)
            continue

        previously_indexed = indexed
        indexed += len(batch)

        # Report whenever a PROGRESS_EVERY boundary is crossed (independent
        # of BATCH_SIZE dividing it evenly) and after the final batch, even
        # if earlier batches were skipped.
        crossed_boundary = (
            indexed // PROGRESS_EVERY > previously_indexed // PROGRESS_EVERY
        )
        is_last_batch = start + BATCH_SIZE >= total
        if crossed_boundary or is_last_batch:
            elapsed = time.time() - start_time
            speed = indexed / elapsed if elapsed > 0 else 0
            eta = (total - indexed) / speed if speed > 0 else 0
            progress = (indexed / total) * 100
            log(f"Прогресс: {indexed}/{total} ({progress:.1f}%) | {speed:.0f} msg/s | ETA: {eta:.0f}s")

        time.sleep(SLEEP_BETWEEN_BATCHES)

        # Periodic GC to keep memory growth bounded.
        if batch_no % GC_EVERY_BATCHES == 0:
            gc.collect()

    elapsed = time.time() - start_time
    final_count = collection.count()
    log(f"=== Индексация завершена за {elapsed:.0f}с ===")
    log(f"Итого в коллекции: {final_count} документов")
    log(f"Ошибок: {errors}")


if __name__ == "__main__":
    main()