#!/usr/bin/env python3
"""
Parse raw Telegram batch files into a flat JSONL file.

Input:  {RAW_BASE}/meta.json and {RAW_BASE}/{topic_id}/batch_*.json
Output: <directory of this script>/../data/messages.jsonl
        (tasks/snowbike-rag/data/messages.jsonl in the original layout)
"""

import json
import os
import glob
import sys
from pathlib import Path

RAW_BASE = "/home/node/.openclaw/workspace/data/telegram-collector/raw/1242788123"
OUTPUT_DIR = Path(__file__).parent.parent / "data"
OUTPUT_FILE = OUTPUT_DIR / "messages.jsonl"
META_FILE = os.path.join(RAW_BASE, "meta.json")


def load_meta(meta_file=None):
    """Return the parsed contents of the collector's meta.json.

    Args:
        meta_file: Path of the JSON file to read. Defaults to META_FILE.

    Returns:
        Whatever JSON value the file contains (parse_all expects a dict
        with an optional "topics" mapping).

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file is not valid JSON.
    """
    if meta_file is None:
        meta_file = META_FILE
    with open(meta_file, "r", encoding="utf-8") as f:
        return json.load(f)


def _build_record(msg, topic_id, topic_title):
    """Convert one raw message into an output record, or None if it has no text.

    A message is considered text-less when it is not a dict, or when its
    "text" value is missing, not a string, or blank after stripping.

    Raises:
        KeyError: The message has text but no "id" key.
    """
    if not isinstance(msg, dict):
        return None
    text = msg.get("text", "")
    # Non-string values would crash on .strip(); treat them like missing text.
    if not isinstance(text, str) or not text.strip():
        return None
    return {
        "id": msg["id"],
        "text": text.strip(),
        "date": msg.get("date", ""),
        "topic_id": topic_id,
        "topic_title": topic_title,
        "from_id": msg.get("from_id"),
        "reply_to_msg_id": msg.get("reply_to_msg_id"),
        "has_media": bool(msg.get("media")),
    }


def parse_all(output_file=None, raw_base=None):
    """Flatten every topic's batch files into one JSONL file.

    For each topic listed under "topics" in meta.json, the batch_*.json
    files in the topic's directory are read in sorted filename order and
    every message with non-blank text is written as one JSON object per
    line. Progress and errors are reported on stdout.

    Args:
        output_file: Destination path. Defaults to OUTPUT_FILE. Its parent
            directory is created when missing. The file is overwritten.
        raw_base: Directory holding meta.json and the per-topic
            sub-directories. Defaults to RAW_BASE (with meta.json read
            from META_FILE).

    Returns:
        The number of messages written.

    Raises:
        OSError: meta.json, a batch file, or the output file cannot be
            opened.
        json.JSONDecodeError: meta.json is not valid JSON. (Malformed
            batch files are reported and skipped instead.)
    """
    if output_file is None:
        output_file = OUTPUT_FILE

    if raw_base is None:
        base = RAW_BASE
        meta_file = META_FILE
    else:
        base = os.fspath(raw_base)
        meta_file = os.path.join(base, "meta.json")

    meta = load_meta(meta_file)
    # "topics": null in the metadata is treated the same as a missing key.
    topics = meta.get("topics") or {}

    # Create the directory of the file actually being written, so that a
    # custom output_file outside OUTPUT_DIR works as well.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    total = 0
    skipped = 0

    print(f"Топиков: {len(topics)}")
    print(f"Выходной файл: {output_file}")

    with open(output_file, "w", encoding="utf-8") as out:
        for topic_id, topic_title in topics.items():
            topic_dir = os.path.join(base, topic_id)
            if not os.path.isdir(topic_dir):
                print(f"  Пропуск топика {topic_id} ({topic_title}): директория не найдена")
                continue

            # Convert once per topic; a non-numeric key must not abort the
            # whole run and leave a truncated output file behind.
            try:
                numeric_topic_id = int(topic_id)
            except ValueError:
                print(f"  Пропуск топика {topic_id} ({topic_title}): нечисловой идентификатор")
                continue

            batch_files = sorted(glob.glob(os.path.join(topic_dir, "batch_*.json")))
            topic_count = 0

            for batch_file in batch_files:
                try:
                    with open(batch_file, "r", encoding="utf-8") as f:
                        messages = json.load(f)

                    if not isinstance(messages, list):
                        print(f"  ОШИБКА в {batch_file}: ожидался список сообщений")
                        continue

                    for msg in messages:
                        record = _build_record(msg, numeric_topic_id, topic_title)
                        if record is None:
                            skipped += 1
                            continue
                        out.write(json.dumps(record, ensure_ascii=False) + "\n")
                        topic_count += 1
                        total += 1

                except (json.JSONDecodeError, UnicodeDecodeError, KeyError) as e:
                    # A KeyError (message without "id") drops the rest of
                    # this batch; records already written are kept.
                    print(f"  ОШИБКА в {batch_file}: {e}")
                    continue

            print(f"  Топик {topic_id} ({topic_title}): {topic_count} сообщений")

    print(f"\nИтого: {total} сообщений сохранено, {skipped} пропущено (без текста)")
    print(f"Файл: {output_file}")
    return total


if __name__ == "__main__":
    count = parse_all()
    # A run that produced no messages is reported as a failure exit code.
    sys.exit(0 if count > 0 else 1)