299 lines
10 KiB
Python
299 lines
10 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Гибридный поиск: Meilisearch + ChromaDB + LLM суммаризация.
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
# Загружаем .env
|
||
def load_env(env_file: Optional[Path] = None) -> None:
    """Load KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Variables that already exist in the environment are never overwritten
    (``os.environ.setdefault``), so real environment settings win over the
    file. Blank lines, ``#`` comment lines and lines without ``=`` are
    skipped. A missing file is silently ignored.

    Args:
        env_file: File to read. Defaults to ``~/.openclaw/.env``.
    """
    if env_file is None:
        env_file = Path.home() / ".openclaw" / ".env"
    env_file = Path(env_file)
    if not env_file.is_file():
        return

    # Read as UTF-8 regardless of the platform locale; "utf-8-sig" also
    # tolerates a BOM so the first key is not polluted by it.
    with open(env_file, encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue

            # Split on the first "=" only, so values may contain "=".
            key, _, val = line.partition("=")
            key = key.strip()
            val = val.strip()
            if not key:
                # A line such as "=value" has no variable name; skip it.
                continue

            # Drop one pair of matching surrounding quotes: KEY="value" -> value.
            if len(val) >= 2 and val[0] == val[-1] and val[0] in "\"'":
                val = val[1:-1]

            os.environ.setdefault(key, val)
|
||
|
||
# Populate os.environ from the .env file before any setting below is read.
load_env()

# Base URL handed to the Meilisearch client.
MEILI_URL = "http://127.0.0.1:7700"
# Path handed to the ChromaDB persistent client: two directory levels above
# this file, then data/chromadb.
CHROMA_PATH = str(Path(__file__).parent.parent / "data" / "chromadb")
# Name of the ChromaDB collection that is queried.
COLLECTION_NAME = "snowbike_embeddings"
# Name (uid) of the Meilisearch index that is queried.
INDEX_NAME = "snowbike_messages"
# SentenceTransformer model used to embed the query text.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# OpenRouter API key; an empty string when OPENROUTER_API_KEY is not set.
OPENROUTER_KEY = os.environ.get("OPENROUTER_API_KEY", "")
# Model identifier sent in the OpenRouter chat-completions request.
LLM_MODEL = "anthropic/claude-sonnet-4-5"
|
||
|
||
# System message for the LLM. The prompt text is intentionally Russian: it
# tells the model to answer in Russian, to use only the supplied chat
# messages, to admit when information is missing, and to cite sources.
SYSTEM_PROMPT = """Ты — помощник по сноубайкам. Отвечаешь на вопросы на основе сообщений из Telegram-группы «Сноубайк Россия».

Правила:
1. Отвечай только на русском языке
2. Используй только информацию из предоставленных сообщений
3. Если информации недостаточно — честно скажи об этом
4. Всегда указывай источники: дата, топик, автор
5. Агрегируй мнения: если несколько человек говорят одно и то же — обобщи
6. Будь конкретен и краток"""

# User message template; filled via str.format with ``question`` (the user's
# query) and ``context`` (the formatted list of retrieved messages).
USER_PROMPT_TEMPLATE = """Вопрос: {question}

Найденные сообщения из чата:
{context}

Дай полный ответ на вопрос, основываясь только на этих сообщениях. Укажи источники (дата, топик, ID сообщения)."""
|
||
|
||
|
||
# Global client objects (initialised lazily): each starts as None and is
# assigned by its get_*() accessor on the first call.
_meili_client = None
_chroma_collection = None
_embed_model = None
|
||
|
||
|
||
def get_meili_client():
    """Return the shared Meilisearch client, creating it on first use.

    The ``meilisearch`` package is imported only when the client actually
    has to be built; later calls return the cached instance.
    """
    global _meili_client
    if _meili_client is not None:
        return _meili_client

    import meilisearch

    _meili_client = meilisearch.Client(MEILI_URL)
    return _meili_client
|
||
|
||
|
||
def get_chroma_collection():
    """Return the shared ChromaDB collection, opening it on first use.

    On the first call a persistent client is opened at ``CHROMA_PATH`` and
    the collection named ``COLLECTION_NAME`` is fetched; later calls return
    the cached collection object.
    """
    global _chroma_collection
    if _chroma_collection is not None:
        return _chroma_collection

    import chromadb

    store = chromadb.PersistentClient(path=CHROMA_PATH)
    _chroma_collection = store.get_collection(COLLECTION_NAME)
    return _chroma_collection
|
||
|
||
|
||
def get_embed_model():
    """Return the shared SentenceTransformer model, loading it on first use.

    ``sentence_transformers`` is imported and ``MODEL_NAME`` is instantiated
    only on the first call; later calls return the cached model.
    """
    global _embed_model
    if _embed_model is not None:
        return _embed_model

    from sentence_transformers import SentenceTransformer

    _embed_model = SentenceTransformer(MODEL_NAME)
    return _embed_model
|
||
|
||
|
||
def search_meilisearch(query: str, topic_ids: Optional[list] = None, limit: int = 20) -> list:
    """Run a full-text search against the Meilisearch index.

    Args:
        query: Search text.
        topic_ids: Optional topic ids. When given, the search is filtered to
            documents whose ``topic_id`` equals any of them.
        limit: Maximum number of hits to request.

    Returns:
        The list of hit documents. If anything fails (client creation,
        network, query), the error is printed to stderr and ``[]`` is
        returned, so a broken backend does not abort the hybrid search.
    """
    try:
        # client.index() only builds a local handle; get_index() would spend
        # an extra HTTP round-trip fetching index metadata before each search.
        index = get_meili_client().index(INDEX_NAME)

        params = {"limit": limit, "attributesToRetrieve": ["*"]}
        if topic_ids:
            params["filter"] = " OR ".join(f"topic_id = {t}" for t in topic_ids)

        result = index.search(query, params)
        return result.get("hits", [])
    except Exception as e:  # deliberate best-effort boundary
        print(f"Meilisearch ошибка: {e}", file=sys.stderr)
        return []
|
||
|
||
|
||
def search_chromadb(query: str, topic_ids: Optional[list] = None, limit: int = 20) -> list:
    """Run a semantic (embedding) search against the ChromaDB collection.

    Args:
        query: Search text; it is embedded with the shared model.
        topic_ids: Optional topic ids. One id becomes an equality filter on
            ``topic_id``; several ids become an ``$or`` of equality filters.
        limit: Number of nearest results to request.

    Returns:
        A list of dicts with keys ``id`` (int), ``text``, ``topic_id``,
        ``topic_title``, ``date``, ``from_id`` and ``_chroma_distance``.
        If anything fails, the error is printed to stderr and ``[]`` is
        returned, so a broken backend does not abort the hybrid search.
    """
    try:
        model = get_embed_model()
        collection = get_chroma_collection()

        embedding = model.encode(query).tolist()

        kwargs = {
            "query_embeddings": [embedding],
            "n_results": limit,
            "include": ["documents", "metadatas", "distances"],
        }
        if topic_ids:
            if len(topic_ids) == 1:
                kwargs["where"] = {"topic_id": topic_ids[0]}
            else:
                kwargs["where"] = {"$or": [{"topic_id": t} for t in topic_ids]}

        result = collection.query(**kwargs)

        # Results are nested per query embedding; only one embedding is sent,
        # so take the first (and only) inner list of each field.
        ids = result.get("ids", [[]])[0]
        docs = result.get("documents", [[]])[0]
        metas = result.get("metadatas", [[]])[0]
        dists = result.get("distances", [[]])[0]

        hits = []
        for msg_id, doc, meta, dist in zip(ids, docs, metas, dists):
            # A record stored without metadata comes back as None; treat it
            # as empty instead of failing the whole search.
            meta = meta or {}
            hits.append({
                "id": int(msg_id),
                "text": doc,
                "topic_id": meta.get("topic_id"),
                "topic_title": meta.get("topic_title", ""),
                "date": meta.get("date", ""),
                "from_id": meta.get("from_id", ""),
                "_chroma_distance": dist,
            })

        return hits
    except Exception as e:  # deliberate best-effort boundary
        print(f"ChromaDB ошибка: {e}", file=sys.stderr)
        return []
|
||
|
||
|
||
def merge_results(meili_hits: list, chroma_hits: list, limit: int = 15) -> list:
    """Merge and de-duplicate hits from both backends.

    Meilisearch hits come first and therefore win when the same message id
    appears in both lists; ChromaDB hits only fill in ids not seen yet.

    Args:
        meili_hits: Hit dicts from the full-text search.
        chroma_hits: Hit dicts from the semantic search.
        limit: Maximum number of merged hits to return.

    Returns:
        New hit dicts (the inputs are not modified), each with an extra
        ``_source`` key set to ``"meilisearch"`` or ``"chromadb"``, cut to
        ``limit`` entries. Hits without an id are dropped.
    """
    seen_keys = set()
    merged = []

    # Order matters: the first backend listed has priority on duplicates.
    for source, hits in (("meilisearch", meili_hits), ("chromadb", chroma_hits)):
        for hit in hits or ():
            msg_id = hit.get("id")
            # Only a missing id is skipped; 0 is a valid id and must survive.
            if msg_id is None or msg_id == "":
                continue
            # Compare ids as text so that "123" and 123 count as one message.
            key = str(msg_id)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            # Copy instead of tagging the caller's dict in place.
            merged.append({**hit, "_source": source})

    return merged[:limit]
|
||
|
||
|
||
def format_context(messages: list) -> str:
    """Render retrieved messages as the text context passed to the LLM.

    Each message produces a numbered header line (id, date, topic, author),
    a line with the message text, and an empty separator line.

    Args:
        messages: Hit dicts; the keys ``id``, ``date``, ``topic_title``,
            ``from_id`` and ``text`` are read, all optional.

    Returns:
        The lines joined with newlines; ``""`` for an empty list.
    """
    lines = []
    for i, msg in enumerate(messages, 1):
        # Keep the first 10 characters (the date part of an ISO timestamp).
        # str()/or guard against None or non-string values, which cannot be
        # sliced.
        date = str(msg.get("date") or "")[:10]
        topic = msg.get("topic_title", "?")
        author = msg.get("from_id", "?")
        # A missing or None text is rendered as empty, not as "None".
        text = msg.get("text") or ""
        msg_id = msg.get("id", "?")

        lines.append(f"[{i}] ID:{msg_id} | {date} | {topic} | автор:{author}")
        lines.append(f" {text}")
        lines.append("")

    return "\n".join(lines)
|
||
|
||
|
||
def ask_llm(question: str, context: str) -> str:
    """Send the question plus retrieved context to the LLM via OpenRouter.

    Args:
        question: The user's question.
        context: Formatted retrieved messages inserted into the user prompt.

    Returns:
        The content of the first choice's message, or an error string when
        no OpenRouter API key is configured.

    Raises:
        requests.HTTPError: If the response has an error status
            (``raise_for_status``).
    """
    import requests

    if not OPENROUTER_KEY:
        return "ОШИБКА: OPENROUTER_API_KEY не задан в ~/.openclaw/.env"

    user_msg = USER_PROMPT_TEMPLATE.format(question=question, context=context)

    endpoint = "https://openrouter.ai/api/v1/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://snowbike-rag.local",
        "X-Title": "Snowbike RAG",
    }
    payload = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ],
        "max_tokens": 1500,
        "temperature": 0.3,
    }

    response = requests.post(endpoint, headers=request_headers, json=payload, timeout=30)
    response.raise_for_status()

    first_choice = response.json()["choices"][0]
    return first_choice["message"]["content"]
|
||
|
||
|
||
def search(query: str, topic_ids: Optional[list] = None, limit: int = 15) -> dict:
    """Run the hybrid search and have the LLM summarise the results.

    Both backends are queried for up to 20 hits each, the hits are merged
    and de-duplicated down to ``limit``, and the merged messages are passed
    to the LLM as context. If nothing is found, the LLM is not called.

    Args:
        query: The user's question.
        topic_ids: Optional topic ids used to filter both backends.
        limit: Maximum number of merged messages used as context.

    Returns:
        A dict with the keys ``query``, ``answer``, ``sources`` (list of
        dicts with ``id``, ``date``, ``topic``, ``author``,
        ``text_preview``), ``count`` and ``time_ms``.
    """
    start_time = time.time()

    # Retrieval: full-text and semantic search, then merge.
    meili_hits = search_meilisearch(query, topic_ids, limit=20)
    chroma_hits = search_chromadb(query, topic_ids, limit=20)
    merged = merge_results(meili_hits, chroma_hits, limit=limit)

    if not merged:
        return {
            "query": query,
            "answer": "К сожалению, по вашему запросу ничего не найдено в базе сообщений.",
            "sources": [],
            "count": 0,
            "time_ms": int((time.time() - start_time) * 1000),
        }

    # Build the LLM context and ask for the answer.
    context = format_context(merged)
    answer = ask_llm(query, context)

    # Source list for the caller. str()/or guard the slices against None or
    # non-string field values, which would otherwise raise here after the
    # LLM call has already been made.
    sources = [
        {
            "id": msg.get("id"),
            "date": str(msg.get("date") or "")[:10],
            "topic": msg.get("topic_title", ""),
            "author": str(msg.get("from_id", "")),
            "text_preview": str(msg.get("text") or "")[:100],
        }
        for msg in merged
    ]

    return {
        "query": query,
        "answer": answer,
        "sources": sources,
        "count": len(merged),
        "time_ms": int((time.time() - start_time) * 1000),
    }
|
||
|
||
|
||
if __name__ == "__main__":
    # Sample questions; the first one is used when no CLI argument is given.
    test_queries = [
        "Какое масло рекомендуют для Polaris 850?",
        "Где лучше кататься зимой в Подмосковье?",
        "Какие гусеницы подходят на Timber S800?",
        "Кто продавал запчасти для Lynx в Китае?",
        "Какие проблемы бывают с Yamaha Mountain Max?",
    ]

    cli_args = sys.argv[1:]
    query = cli_args[0] if cli_args else test_queries[0]

    print(f"Запрос: {query}\n")
    result = search(query)

    # Summary, then the answer, then at most five sources.
    print(f"Найдено источников: {result['count']}")
    print(f"Время: {result['time_ms']} мс\n")
    print("=== ОТВЕТ ===")
    print(result["answer"])
    print("\n=== ИСТОЧНИКИ ===")
    for src in result["sources"][:5]:
        preview = src["text_preview"][:80]
        print(f" [{src['id']}] {src['date']} | {src['topic']} | {preview}...")
|