90 lines
3.1 KiB
Python
90 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
Parse raw Telegram batch files into a flat JSONL file.

Input:  /data/telegram-collector/raw/1242788123/{topic_id}/batch_*.json
Output: tasks/snowbike-rag/data/messages.jsonl
"""
|
|
|
|
import json
|
|
import os
|
|
import glob
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Root of the raw collector dump; it holds meta.json plus one sub-directory
# per topic id (presumably the trailing number is the chat id - TODO confirm).
RAW_BASE = "/home/node/.openclaw/workspace/data/telegram-collector/raw/1242788123"

# Topic metadata lives directly inside the raw dump root.
META_FILE = os.path.join(RAW_BASE, "meta.json")

# Output goes to the "data" directory that is a sibling of this script's
# own directory.
_SCRIPT_DIR = Path(__file__).parent
OUTPUT_DIR = _SCRIPT_DIR.parent / "data"
OUTPUT_FILE = OUTPUT_DIR.joinpath("messages.jsonl")
|
|
|
|
|
|
def load_meta():
    """Load the JSON document stored at ``META_FILE``.

    Returns:
        The decoded JSON value. The caller in this file reads a ``"topics"``
        key from it, so a JSON object is expected.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    meta_path = Path(META_FILE)
    raw_text = meta_path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
|
|
|
|
|
def _read_batch(batch_file):
    """Load one batch file and return its list of messages, or None on failure.

    Read and decode errors are reported and swallowed so that a single
    damaged batch file does not abort the whole run.
    """
    try:
        with open(batch_file, "r", encoding="utf-8") as f:
            messages = json.load(f)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
        print(f" ОШИБКА в {batch_file}: {e}")
        return None

    # Iterating anything other than a list of messages would fail later on.
    if not isinstance(messages, list):
        print(f" ОШИБКА в {batch_file}: ожидался список сообщений")
        return None
    return messages


def parse_all(output_file=None):
    """Flatten every raw batch file into one JSONL file of text messages.

    Topics are taken from the ``"topics"`` mapping of the metadata returned
    by ``load_meta()`` (topic id -> topic title). For each topic, the
    ``batch_*.json`` files in ``RAW_BASE/<topic_id>`` are read in sorted
    order and every message with non-blank text is written as one JSON
    object per line.

    Args:
        output_file: Destination path. Defaults to ``OUTPUT_FILE``. Its
            parent directory is created if it does not exist.

    Returns:
        The number of messages written.

    Raises:
        OSError: If the metadata file or the output file cannot be opened.
        json.JSONDecodeError: If the metadata file is not valid JSON.
    """
    if output_file is None:
        output_file = OUTPUT_FILE

    meta = load_meta()
    topics = meta.get("topics", {})

    # Create the directory of the file actually being written, so a custom
    # output_file outside OUTPUT_DIR works as well.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    total = 0
    skipped = 0

    print(f"Топиков: {len(topics)}")
    print(f"Выходной файл: {output_file}")

    with open(output_file, "w", encoding="utf-8") as out:
        for topic_id, topic_title in topics.items():
            topic_dir = os.path.join(RAW_BASE, topic_id)
            if not os.path.isdir(topic_dir):
                print(f" Пропуск топика {topic_id} ({topic_title}): директория не найдена")
                continue

            # Convert once per topic instead of once per message; a
            # non-numeric id skips the topic rather than aborting the run.
            try:
                numeric_topic_id = int(topic_id)
            except ValueError:
                print(f" Пропуск топика {topic_id} ({topic_title}): некорректный идентификатор")
                continue

            batch_files = sorted(glob.glob(os.path.join(topic_dir, "batch_*.json")))
            topic_count = 0

            for batch_file in batch_files:
                messages = _read_batch(batch_file)
                if messages is None:
                    continue

                malformed = 0
                for msg in messages:
                    if not isinstance(msg, dict):
                        malformed += 1
                        continue

                    text = msg.get("text", "")
                    if not isinstance(text, str) or not text.strip():
                        skipped += 1
                        continue

                    # Only the offending message is dropped; the rest of
                    # the batch is still processed.
                    if "id" not in msg:
                        malformed += 1
                        continue

                    record = {
                        "id": msg["id"],
                        "text": text.strip(),
                        "date": msg.get("date", ""),
                        "topic_id": numeric_topic_id,
                        "topic_title": topic_title,
                        "from_id": msg.get("from_id"),
                        "reply_to_msg_id": msg.get("reply_to_msg_id"),
                        "has_media": bool(msg.get("media")),
                    }

                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    topic_count += 1
                    total += 1

                if malformed:
                    print(
                        f" ОШИБКА в {batch_file}: пропущено {malformed} "
                        "некорректных сообщений (не объект или нет id)"
                    )

            print(f" Топик {topic_id} ({topic_title}): {topic_count} сообщений")

    print(f"\nИтого: {total} сообщений сохранено, {skipped} пропущено (без текста)")
    print(f"Файл: {output_file}")
    return total
|
|
|
|
|
|
if __name__ == "__main__":
    # Exit status 0 when at least one message was written, 1 otherwise.
    written = parse_all()
    exit_code = 0 if written > 0 else 1
    sys.exit(exit_code)
|