#!/usr/bin/env python3
"""
Minimal incremental indexer for the ChromaDB collection.

Embeds and adds only the messages found in a single JSONL file; it never
rebuilds the whole collection. Messages whose id is already present in the
collection are left untouched, so the script can be re-run safely.

Usage:
    script.py [path/to/incremental.jsonl]

Without an argument the file named by ``INCREMENTAL_FILE`` is used.
"""
import json
import sys
import time
from pathlib import Path
from typing import List, Optional, Sequence, Set, Union

CHROMA_PATH = str(Path(__file__).parent.parent / "data" / "chromadb")
COLLECTION_NAME = "snowbike_embeddings"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
BATCH_SIZE = 64
MAX_TEXT_LEN = 512
# Messages whose text is shorter than this are not indexed.
MIN_TEXT_LEN = 5
# How many candidate ids are sent to the collection per existence lookup.
# Kept modest so a single lookup never turns into an oversized query.
ID_LOOKUP_CHUNK = 500
INCREMENTAL_FILE = Path(__file__).parent.parent / "data" / "incremental_20260407.jsonl"


def load_messages(path: Union[str, Path]) -> List[dict]:
    """Read indexable messages from a JSONL file.

    A line is kept only when it parses to a JSON object, has a string
    ``text`` of at least ``MIN_TEXT_LEN`` characters and carries a non-null
    ``id``. When the same id appears more than once, the first occurrence
    wins: sending a repeated id to ``collection.add`` in one batch makes
    Chroma reject the batch.

    Malformed lines and records without an id are reported on stderr and
    skipped instead of aborting the run.

    Args:
        path: Location of the JSONL file (one JSON object per line).

    Returns:
        The accepted messages, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    messages: List[dict] = []
    seen_ids: Set[str] = set()
    duplicates = 0

    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue

            try:
                msg = json.loads(line)
            except json.JSONDecodeError as err:
                print(
                    f"Строка {line_no}: некорректный JSON, пропущена ({err})",
                    file=sys.stderr,
                )
                continue
            if not isinstance(msg, dict):
                print(
                    f"Строка {line_no}: ожидался JSON-объект, пропущена",
                    file=sys.stderr,
                )
                continue

            # ``text`` may be missing or JSON null; both mean "nothing to index".
            text = msg.get("text")
            if not isinstance(text, str) or len(text) < MIN_TEXT_LEN:
                continue

            if msg.get("id") is None:
                print(f"Строка {line_no}: нет поля id, пропущена", file=sys.stderr)
                continue

            # Ids are compared as strings because that is how they are stored.
            key = str(msg["id"])
            if key in seen_ids:
                duplicates += 1
                continue
            seen_ids.add(key)
            messages.append(msg)

    if duplicates:
        print(f"Пропущено дубликатов id: {duplicates}", file=sys.stderr)
    return messages


def extract_month(date_str: object) -> int:
    """Return the month number taken from characters 5-6 of a date string.

    For a ``YYYY-MM...`` formatted string this is the ``MM`` part. Returns 0
    when the value is not a string, is shorter than 7 characters, or those
    two characters are not an integer.
    """
    if not isinstance(date_str, str) or len(date_str) < 7:
        return 0
    try:
        return int(date_str[5:7])
    except ValueError:
        return 0


def build_metadata(msg: dict) -> dict:
    """Build the metadata record stored alongside one message.

    Args:
        msg: A message dict; ``topic_id`` and ``topic_title`` are required,
            ``date`` and ``from_id`` are optional.

    Returns:
        A dict with ``topic_id``, ``topic_title``, ``date``, ``from_id``
        (as a string) and ``month`` (0 when it cannot be derived).

    Raises:
        KeyError: If ``topic_id`` or ``topic_title`` is missing.
    """
    date_value = msg.get("date")
    # A JSON null date is stored as an empty string, like a missing one.
    date_str = "" if date_value is None else str(date_value)
    return {
        "topic_id": msg["topic_id"],
        "topic_title": msg["topic_title"],
        "date": date_str,
        "from_id": str(msg.get("from_id", "")),
        "month": extract_month(date_value),
    }


def find_existing_ids(
    collection,
    candidate_ids: Sequence[str],
    chunk_size: int = ID_LOOKUP_CHUNK,
) -> Set[str]:
    """Return the subset of ``candidate_ids`` already stored in ``collection``.

    Only the candidates are looked up, in chunks, instead of fetching every
    id of the collection: the collection is expected to be far larger than
    one incremental file.

    Args:
        collection: A Chroma collection (anything whose ``get(ids=...,
            include=[])`` returns a mapping with an ``"ids"`` list).
        candidate_ids: String ids to check.
        chunk_size: Maximum number of ids per lookup.
    """
    existing: Set[str] = set()
    for start in range(0, len(candidate_ids), chunk_size):
        chunk = list(candidate_ids[start:start + chunk_size])
        # include=[] asks for ids only: no documents, embeddings or metadata.
        existing.update(collection.get(ids=chunk, include=[])["ids"])
    return existing


def main(incremental_file: Optional[Union[str, Path]] = None) -> None:
    """Index the messages of one incremental JSONL file into ChromaDB.

    Args:
        incremental_file: Path of the JSONL file to index. Defaults to
            ``INCREMENTAL_FILE`` when omitted.
    """
    print("=== Инкрементальная индексация ChromaDB ===")

    # Imported lazily so that importing this module stays cheap.
    import chromadb

    source = INCREMENTAL_FILE if incremental_file is None else Path(incremental_file)

    # Load messages from the incremental file only.
    messages = load_messages(source)
    print(f"Загружено из incremental: {len(messages)} сообщений")

    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(COLLECTION_NAME)
    existing_count = collection.count()
    print(f"Текущий размер коллекции: {existing_count}")

    # Check which of the candidate ids already exist.
    existing_ids = find_existing_ids(collection, [str(m["id"]) for m in messages])
    to_add = [m for m in messages if str(m["id"]) not in existing_ids]
    print(f"Уже есть: {len(messages) - len(to_add)}, добавляем: {len(to_add)}")

    if not to_add:
        print("Нечего добавлять!")
        return

    # The model is expensive to load, so load it only when there is work.
    from sentence_transformers import SentenceTransformer

    print(f"Загружаем модель {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)
    print("Модель загружена")

    total = len(to_add)
    indexed = 0
    start_time = time.time()

    for i in range(0, total, BATCH_SIZE):
        batch = to_add[i:i + BATCH_SIZE]
        texts = [m["text"][:MAX_TEXT_LEN] for m in batch]
        ids = [str(m["id"]) for m in batch]
        metadatas = [build_metadata(m) for m in batch]

        embeddings = model.encode(texts, show_progress_bar=False).tolist()
        collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)

        indexed += len(batch)
        # "\r" does not trigger a flush on line-buffered stdout, so flush
        # explicitly to make the progress line actually appear.
        print(f" {indexed}/{total} ({100*indexed/total:.1f}%)", end="\r", flush=True)

    elapsed = time.time() - start_time
    print(f"\nГотово за {elapsed:.0f}с. Итого в коллекции: {collection.count()}")


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else None)