#!/usr/bin/env python3
"""
Minimal incremental indexer: adds only the messages from the specified file.
Does not perform a full rebuild.
"""
|
||
import json, sys, time
|
||
from pathlib import Path
|
||
|
||
# All data lives under <parent of this script's directory>/data.
_DATA_DIR = Path(__file__).parent.parent / "data"

# JSONL file holding the messages to index in this run.
INCREMENTAL_FILE = _DATA_DIR / "incremental_20260407.jsonl"

# Location of the persistent ChromaDB store, kept as a plain string.
CHROMA_PATH = str(_DATA_DIR / "chromadb")

# Name of the existing collection that new embeddings are added to.
COLLECTION_NAME = "snowbike_embeddings"

# Sentence-transformers model used to embed message texts.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Number of messages embedded and written per batch.
BATCH_SIZE = 64

# Message texts are cut to this many characters before embedding.
MAX_TEXT_LEN = 512
|
||
|
||
def main():
    """Add messages from INCREMENTAL_FILE that the collection does not hold yet.

    Steps:
      1. Read the JSONL file, keeping messages whose "text" is a string of at
         least 5 characters.
      2. Drop repeated ids within the file, then ask the collection which of
         the remaining ids it already stores (only those ids are looked up,
         not the whole collection).
      3. Embed the new messages in batches of BATCH_SIZE and add them together
         with their metadata.

    The embedding model is imported and loaded only when there is something to
    add. Progress and totals are printed to stdout; nothing is returned.

    Raises:
        OSError: if INCREMENTAL_FILE cannot be opened.
        json.JSONDecodeError: if a non-empty line is not valid JSON.
        KeyError: if a message lacks "id", "topic_id" or "topic_title".
    """
    print("=== Инкрементальная индексация ChromaDB ===")

    # Third-party imports stay local so that importing this module is cheap.
    import chromadb

    messages = _load_messages(INCREMENTAL_FILE)
    print(f"Загружено из incremental: {len(messages)} сообщений")

    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(COLLECTION_NAME)
    existing_count = collection.count()
    print(f"Текущий размер коллекции: {existing_count}")

    # De-duplicate first: ids repeated inside the file would otherwise end up
    # in a single add() call.
    candidates = _dedupe_by_id(messages)
    existing_ids = _find_existing_ids(
        collection, [str(m["id"]) for m in candidates]
    )
    to_add = [m for m in candidates if str(m["id"]) not in existing_ids]
    # "Already present" also counts in-file repeats that were dropped above.
    print(f"Уже есть: {len(messages) - len(to_add)}, добавляем: {len(to_add)}")

    if not to_add:
        print("Нечего добавлять!")
        return

    # Loading the model is the expensive part, so do it only now that we know
    # there is work to do.
    from sentence_transformers import SentenceTransformer

    print(f"Загружаем модель {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)
    print("Модель загружена")

    total = len(to_add)
    indexed = 0
    # Monotonic clock: elapsed time is unaffected by system clock adjustments.
    start_time = time.monotonic()

    for i in range(0, total, BATCH_SIZE):
        batch = to_add[i:i + BATCH_SIZE]
        texts = [m["text"][:MAX_TEXT_LEN] for m in batch]
        ids = [str(m["id"]) for m in batch]
        metadatas = [_build_metadata(m) for m in batch]

        embeddings = model.encode(texts, show_progress_bar=False).tolist()
        collection.add(
            ids=ids, embeddings=embeddings, documents=texts, metadatas=metadatas
        )

        indexed += len(batch)
        # flush=True: the line ends with "\r", not "\n", so a line-buffered
        # stdout would otherwise never show the progress.
        print(
            f" {indexed}/{total} ({100*indexed/total:.1f}%)",
            end="\r",
            flush=True,
        )

    elapsed = time.monotonic() - start_time
    print(f"\nГотово за {elapsed:.0f}с. Итого в коллекции: {collection.count()}")


def _load_messages(path):
    """Return the JSONL records from *path* whose "text" is a str of >= 5 chars."""
    messages = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            msg = json.loads(line)
            text = msg.get("text")
            # A null or non-string text cannot be sliced and embedded later,
            # so such records are skipped along with the too-short ones.
            if isinstance(text, str) and len(text) >= 5:
                messages.append(msg)
    return messages


def _dedupe_by_id(messages):
    """Return *messages* without repeated ids, keeping the first occurrence."""
    seen = set()
    unique = []
    for msg in messages:
        msg_id = str(msg["id"])
        if msg_id not in seen:
            seen.add(msg_id)
            unique.append(msg)
    return unique


def _find_existing_ids(collection, ids, chunk_size=500):
    """Return the subset of *ids* (unique strings) already stored in *collection*.

    The lookup is done in chunks of *chunk_size* so a large incremental file
    does not produce one oversized query.
    """
    existing = set()
    for start in range(0, len(ids), chunk_size):
        found = collection.get(ids=ids[start:start + chunk_size], include=[])
        existing.update(found["ids"])
    return existing


def _month_from_date(date_str):
    """Return characters 5..6 of *date_str* as an int, or 0 if unavailable.

    NOTE(review): presumably dates look like "YYYY-MM-..." - confirm against
    the data source.
    """
    if len(date_str) < 7:
        return 0
    try:
        return int(date_str[5:7])
    except ValueError:
        return 0


def _build_metadata(msg):
    """Build the metadata dict stored alongside one message's embedding."""
    # A missing or null date becomes "" so len() and slicing are always safe.
    date_str = str(msg.get("date") or "")
    return {
        "topic_id": msg["topic_id"],
        "topic_title": msg["topic_title"],
        "date": date_str,
        "from_id": str(msg.get("from_id", "")),
        "month": _month_from_date(date_str),
    }
|
||
|
||
|
||
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|