# Files
# wiki/tasks/flightradar24/ingest/schedule/backfill.py
# 2026-04-20 14:20:01 +03:00
#
# 122 lines
# 3.8 KiB
# Python

"""
Backfill CLI — loads historical schedule data for a date range.

Progress is saved to fr24_ext.load_state after each completed day, so an
interrupted run can be continued by passing --resume.

Usage:
    python backfill.py --start-date 2026-04-01 --end-date 2026-04-19
    python backfill.py --start-date 2026-04-01 --end-date 2026-04-19 --skip-opensky
    python backfill.py --start-date 2026-04-01 --end-date 2026-04-19 --resume
"""
import argparse
import json
import logging
import sys
from datetime import date, timedelta
import psycopg2
from config import config
from yandex_worker import fetch_day as yandex_fetch_day
from opensky_worker import enrich_day as opensky_enrich_day
# Module-level logger; its handlers and level are configured in the
# ``if __name__ == "__main__"`` guard at the bottom of this file.
log = logging.getLogger(__name__)
# state_key of the fr24_ext.load_state row that main() reads (with --resume)
# and rewrites after every completed day.
STATE_KEY = "backfill_last_date"
def load_state(conn, key: str):
    """Fetch the saved state stored under *key* in ``fr24_ext.load_state``.

    Args:
        conn: Open DB-API connection (psycopg2 in this script) whose cursors
            can be used as context managers.
        key: Value matched against the ``state_key`` column.

    Returns:
        The ``state_value`` of the matching row, or ``None`` when no row
        exists. NOTE(review): ``save_state`` writes the value with a
        ``::jsonb`` cast, so this is presumably a jsonb column that psycopg2
        hands back as an already-decoded Python object — confirm against the
        table definition.
    """
    query = "SELECT state_value FROM fr24_ext.load_state WHERE state_key = %s"
    with conn.cursor() as cur:
        cur.execute(query, (key,))
        row = cur.fetchone()
    # fetchone() yields None when the key has never been saved.
    if not row:
        return None
    return row[0]
def save_state(conn, key: str, value: dict):
    """Upsert *value* as JSON under *key* in ``fr24_ext.load_state`` and commit.

    Inserts a new row or, when ``state_key`` already exists, overwrites its
    ``state_value`` and sets ``updated_at`` to ``now()``. The commit applies to
    the connection's whole current transaction, so any other uncommitted work
    on *conn* is committed together with the state.

    Args:
        conn: Open DB-API connection (psycopg2 in this script).
        key: Value for the ``state_key`` column.
        value: JSON-serializable mapping stored in ``state_value``.

    Raises:
        TypeError: If *value* cannot be serialized by ``json.dumps``.
        Exception: Any database error from the upsert or the commit; the
            transaction is rolled back before the error is re-raised.
    """
    # Serialize first so a bad value fails before the transaction is touched.
    payload = json.dumps(value)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO fr24_ext.load_state (state_key, state_value)
                VALUES (%s, %s::jsonb)
                ON CONFLICT (state_key) DO UPDATE
                SET state_value = EXCLUDED.state_value,
                    updated_at = now()
                """,
                (key, payload),
            )
        conn.commit()
    except Exception:
        # A failed statement leaves the transaction aborted; roll back so the
        # connection stays usable for whoever handles the error.
        conn.rollback()
        raise
def _resume_start(conn, start):
    """Return the day the run should start from, honouring saved progress.

    Reads the state stored under ``STATE_KEY``; when it names a last completed
    day, the run starts on the following day — but never earlier than *start*.
    """
    state = load_state(conn, STATE_KEY)
    # Tolerate a missing row or a state value that is not a JSON object.
    last_raw = state.get("last_date") if isinstance(state, dict) else None
    if not last_raw:
        return start
    last = date.fromisoformat(last_raw)
    resume_from = last + timedelta(days=1)
    if resume_from <= start:
        return start
    log.info("Resuming from %s (last completed: %s)", resume_from, last)
    return resume_from


def main():
    """Run the backfill CLI.

    Parses ``--start-date`` / ``--end-date`` (both inclusive, ISO format),
    optionally moves the start forward from the state saved under
    ``STATE_KEY`` when ``--resume`` is given, then handles one day at a time:
    Yandex fetch, OpenSky enrichment unless ``--skip-opensky``, and a state
    save that marks the day as completed.

    Processing stops at the first failing day or on Ctrl-C. The process exits
    with status 1 when the date range is invalid or a day fails, so schedulers
    and shell pipelines can detect an incomplete run. The database connection
    is closed on every exit path.
    """
    parser = argparse.ArgumentParser(description="Backfill fr24_ext.schedule")
    # type=date.fromisoformat makes argparse reject malformed dates with a
    # usage error instead of letting a ValueError traceback escape.
    parser.add_argument("--start-date", required=True,
                        type=date.fromisoformat, help="YYYY-MM-DD")
    parser.add_argument("--end-date", required=True,
                        type=date.fromisoformat, help="YYYY-MM-DD")
    parser.add_argument("--skip-opensky", action="store_true",
                        help="Skip OpenSky enrichment (faster, no icao24)")
    parser.add_argument("--resume", action="store_true",
                        help="Resume from the day after the last saved state "
                             "(when that is later than --start-date)")
    args = parser.parse_args()

    start = args.start_date
    end = args.end_date
    if start > end:
        log.error("start-date must be <= end-date")
        sys.exit(1)

    total_flights = 0
    total_enriched = 0
    last_completed = None  # last day whose state was saved during this run
    failed = False

    conn = psycopg2.connect(config.DB_DSN)
    try:
        # Resume from saved state if requested
        if args.resume:
            start = _resume_start(conn, start)
        if start > end:
            # Saved progress already covers the whole requested range.
            log.info("Nothing to do: resume point %s is past end-date %s",
                     start, end)
            return

        log.info("Backfill: %s → %s (%d days)",
                 start, end, (end - start).days + 1)
        current = start
        while current <= end:
            log.info("── Processing %s ──", current)
            try:
                yandex_count = yandex_fetch_day(current, conn)
                total_flights += yandex_count
                log.info("Yandex: %d flights", yandex_count)
                if not args.skip_opensky:
                    opensky_count = opensky_enrich_day(current, conn)
                    total_enriched += opensky_count
                    log.info("OpenSky: %d enriched", opensky_count)
                # The day counts as done only once its state is committed.
                save_state(conn, STATE_KEY, {"last_date": current.isoformat()})
                last_completed = current
            except KeyboardInterrupt:
                if last_completed is None:
                    log.info("Interrupted on %s before any day completed "
                             "in this run", current)
                else:
                    log.info("Interrupted. Progress saved up to %s",
                             last_completed)
                break
            except Exception as e:
                # log.exception keeps the traceback for post-mortem debugging.
                log.exception("Failed on %s: %s — stopping", current, e)
                failed = True
                break
            current += timedelta(days=1)
    finally:
        conn.close()

    if failed:
        log.error("Backfill stopped early. Flights: %d, Enriched: %d",
                  total_flights, total_enriched)
        sys.exit(1)
    log.info("Backfill done. Flights: %d, Enriched: %d",
             total_flights, total_enriched)
if __name__ == "__main__":
    # Script entry point: route INFO-and-above records to stdout with an
    # ISO-like timestamp and a "[backfill]" tag, then run the CLI.
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        handlers=[stdout_handler],
        level=logging.INFO,
        datefmt="%Y-%m-%dT%H:%M:%S",
        format="%(asctime)s [backfill] %(levelname)s %(message)s",
    )
    main()