Files
wiki/tasks/snowbike-rag/scripts/index_incremental_chroma.py
2026-04-12 21:55:33 +03:00

85 lines
3.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Minimal incremental indexer: adds only the messages found in the given file.
Does not perform a full rebuild of the index.
"""
import json, sys, time
from pathlib import Path
# Shared "data" directory, located two levels above this file.
_DATA_DIR = Path(__file__).parent.parent / "data"

# On-disk location of the persistent Chroma store (kept as a plain string).
CHROMA_PATH = str(_DATA_DIR / "chromadb")
# Name of the collection that main() opens and appends to.
COLLECTION_NAME = "snowbike_embeddings"
# Sentence-Transformers model used to embed message texts.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Number of messages embedded and written per iteration in main().
BATCH_SIZE = 64
# Message texts are cut to this many characters before embedding.
MAX_TEXT_LEN = 512
# JSON Lines file holding the messages to add.
INCREMENTAL_FILE = _DATA_DIR / "incremental_20260407.jsonl"
# Messages whose text is shorter than this many characters are not indexed.
_MIN_TEXT_LEN = 5
# How many ids are sent to Chroma per existence lookup. Kept moderate so a
# single lookup query stays small (NOTE(review): tune if the backend allows more).
_ID_LOOKUP_CHUNK = 500


def _load_messages(path):
    """Read indexable message records from a JSON Lines file.

    Blank lines are skipped. A record is kept only when it is a JSON object
    whose ``text`` is a string of at least ``_MIN_TEXT_LEN`` characters; a
    missing, ``null`` or non-string ``text`` is treated as "nothing to index"
    instead of raising ``TypeError``.

    Args:
        path: location of the ``.jsonl`` file.

    Returns:
        List of message dicts in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    messages = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            msg = json.loads(line)
            if not isinstance(msg, dict):
                continue
            text = msg.get("text")
            if isinstance(text, str) and len(text) >= _MIN_TEXT_LEN:
                messages.append(msg)
    return messages


def _month_from_date(date_str):
    """Return the month (1-12) found at positions 5-6 of *date_str*, else 0.

    0 is the "unknown" value: it is returned for strings shorter than seven
    characters, for non-numeric month fields and for out-of-range numbers.
    """
    if len(date_str) < 7:
        return 0
    try:
        month = int(date_str[5:7])
    except ValueError:
        return 0
    return month if 1 <= month <= 12 else 0


def _build_metadata(msg):
    """Build the Chroma metadata dict for one message.

    ``topic_id`` and ``topic_title`` are required (``KeyError`` if absent).
    ``date`` and ``from_id`` are optional; a missing or ``null`` value becomes
    an empty string so that no ``None`` is stored in the metadata.
    """
    raw_date = msg.get("date")
    date_str = "" if raw_date is None else str(raw_date)
    from_id = msg.get("from_id")
    return {
        "topic_id": msg["topic_id"],
        "topic_title": msg["topic_title"],
        "date": date_str,
        "from_id": "" if from_id is None else str(from_id),
        "month": _month_from_date(date_str),
    }


def _find_existing_ids(collection, ids, chunk_size=_ID_LOOKUP_CHUNK):
    """Return the subset of *ids* that is already stored in *collection*.

    Only the candidate ids are looked up, in chunks, rather than fetching
    every id held by the collection.
    """
    found = set()
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        found.update(collection.get(ids=chunk, include=[])["ids"])
    return found


def main(incremental_file=None):
    """Append messages from a JSON Lines file to the existing Chroma collection.

    Messages whose id is already in the collection are skipped, so the run can
    be repeated safely. Nothing is rebuilt; the collection must already exist.

    Args:
        incremental_file: path to the ``.jsonl`` file to index. Defaults to
            the module-level ``INCREMENTAL_FILE``.

    Raises:
        OSError: if the input file cannot be opened.
        json.JSONDecodeError: if the input file contains an invalid JSON line.
        KeyError: if a kept message lacks ``id``, ``topic_id`` or ``topic_title``.
    """
    print("=== Инкрементальная индексация ChromaDB ===")

    source = INCREMENTAL_FILE if incremental_file is None else Path(incremental_file)
    messages = _load_messages(source)
    print(f"Загружено из incremental: {len(messages)} сообщений")

    # Keep the first occurrence of every id: a batch containing the same id
    # twice would be rejected by collection.add().
    by_id = {}
    for msg in messages:
        by_id.setdefault(str(msg["id"]), msg)
    duplicates = len(messages) - len(by_id)
    if duplicates:
        print(f"Дубликатов id в файле пропущено: {duplicates}")

    if not by_id:
        print("Нечего добавлять!")
        return

    # Heavy third-party imports are deferred until there is something to
    # index, so an empty input finishes immediately.
    import chromadb
    from sentence_transformers import SentenceTransformer

    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(COLLECTION_NAME)
    print(f"Текущий размер коллекции: {collection.count()}")

    existing_ids = _find_existing_ids(collection, list(by_id))
    to_add = [(mid, msg) for mid, msg in by_id.items() if mid not in existing_ids]
    print(f"Уже есть: {len(by_id) - len(to_add)}, добавляем: {len(to_add)}")
    if not to_add:
        print("Нечего добавлять!")
        return

    # The model is loaded only now, when at least one message needs embedding.
    print(f"Загружаем модель {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)
    print("Модель загружена")

    total = len(to_add)
    indexed = 0
    start_time = time.time()
    for i in range(0, total, BATCH_SIZE):
        batch = to_add[i:i + BATCH_SIZE]
        ids = [mid for mid, _ in batch]
        texts = [msg["text"][:MAX_TEXT_LEN] for _, msg in batch]
        metadatas = [_build_metadata(msg) for _, msg in batch]
        embeddings = model.encode(texts, show_progress_bar=False).tolist()
        collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas)
        indexed += len(batch)
        # "\r" keeps the progress counter on a single console line.
        print(f" {indexed}/{total} ({100*indexed/total:.1f}%)", end="\r")

    elapsed = time.time() - start_time
    print(f"\nГотово за {elapsed:.0f}с. Итого в коллекции: {collection.count()}")
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()