Files
wiki/tasks/flightradar24/ingest/mart/build_mart.py
2026-04-22 00:10:02 +03:00

644 lines
24 KiB
Python

"""
Mart builder: merges all track sources into fr24_mart.
Priority: RTL-SDR > FR24 > FlightAware
For each flight in fr24_ext.schedule:
1. Find matching tracks from each source
2. Pick best available track
3. Copy points to fr24_mart.track_points with noise_score
4. Update fr24_mart.noise_grid (0.01° cells)
5. Update fr24_mart.source_coverage
"""
import logging
import re
from datetime import date
from typing import Dict, List, Optional, Tuple
import psycopg2
import psycopg2.extras
from noise_model import altitude_to_noise_db
log = logging.getLogger("build_mart")
# ft → m conversion: multiply a value in feet by this factor to get metres.
FT_TO_M = 0.3048
# ICAO → IATA airport mapping (for matching schedule IATA to track ICAO).
# Track tables carry ICAO airport codes while the schedule carries IATA codes;
# the matchers translate a track's ICAO code through this table before
# comparing it with the schedule's origin/destination.
# NOTE(review): KZN (UUEM, UWKD), PES (ULWW, ULPB) and MMK (ULMM, UHMM) each
# appear under two different ICAO keys — at least one entry of each pair is
# presumably wrong; verify against an authoritative airport list.
ICAO_TO_IATA = {
    "UUEE": "SVO", "UUDD": "DME", "UUWW": "VKO", "UUBW": "ZIA",
    "ULLI": "LED", "USSS": "SVX", "UNNT": "OVB", "UUEM": "KZN",
    "UWGG": "GOJ", "UWUR": "MCX", "URSS": "AER", "URKK": "KRR",
    "UMMS": "MSQ", "UKBB": "KBP", "UKLL": "LWO", "UTTT": "TAS",
    "UTAA": "ASB", "LTFM": "IST", "EDDF": "FRA", "LFPG": "CDG",
    "EGLL": "LHR", "LEMD": "MAD", "LIRF": "FCO", "EHAM": "AMS",
    "LPPT": "LIS", "EDDM": "MUC", "LOWW": "VIE", "LKPR": "PRG",
    "EPWA": "WAW", "EVRA": "RIX", "EYVI": "VNO", "EETN": "TLL",
    "UACC": "TSE", "UATG": "GUW", "UAII": "CIT", "UTNU": "UGC",
    "UTSB": "BHK", "UTSS": "SKD", "UTST": "TJK", "OEGS": "GIZ",
    "RJTT": "HND", "RJBB": "KIX", "ZBAA": "PEK", "ZSSS": "SHA",
    "ZSPD": "PVG", "VIDP": "DEL", "VABB": "BOM", "OMDB": "DXB",
    "OTHH": "DOH", "OJAI": "AMM", "LLBG": "TLV",
    # Russian domestic
    "URWW": "VOG", "UNKL": "KJA", "USCC": "CEK", "UWUU": "UFA",
    "XWGS": "PEE", "UWKD": "KZN", "ULMM": "MMK", "ULWW": "PES",
    "ULPB": "PES", "UUYY": "SYK", "USDD": "TOF", "UNOO": "OMS",
    "UNBB": "BAX", "UNWW": "NOZ", "ULWC": "KEM", "UIII": "IKT",
    "UITT": "UUS", "UIBB": "BQS", "UHPP": "PKC", "UHMD": "GDX",
    "UHMM": "MMK", "UASK": "IGT", "UATT": "AKX", "UAOO": "KSN",
    "UKDD": "DNK", "UKDE": "ZAP",
    # Turkey
    "LTBA": "ISL", "LTAI": "AYT", "LTBS": "DLM", "LTBJ": "ADB",
    # Spain / Canaries / Portugal
    "LEAL": "ALC", "LEPA": "PMI", "GCTS": "TFS", "GCRR": "ACE",
    "GCFV": "FUE", "GCLA": "SPC", "LPFR": "FAO",
    # Cyprus
    "LCLK": "LCA", "LCPH": "PFO",
    # Africa
    "HECA": "CAI", "HTDA": "DAR", "HAAB": "ADD",
    "HESH": "SSH", "HEGN": "HRG",
    # Southeast Asia
    "VDPP": "PNH", "VVNB": "HAN", "VVTS": "SGN",
    "WSSS": "SIN", "VTBD": "DMK", "VTBS": "BKK",
    # East Asia
    "RJAA": "NRT", "RKSI": "ICN",
    "ZBAD": "PKX", "ZGGG": "CAN", "ZHHH": "WUH", "ZWWW": "URC",
    # Gulf
    "OMAA": "AUH", "OERK": "RUH", "OEDF": "DMM",
}
# Reverse lookup (IATA → ICAO). Where one IATA code is listed under several
# ICAO keys above, the key that appears last in ICAO_TO_IATA wins here.
IATA_TO_ICAO = {v: k for k, v in ICAO_TO_IATA.items()}
# IATA airline code → ICAO airline code (for RTL-SDR callsign conversion)
AIRLINE_IATA_TO_ICAO = {
"SU": "AFL", # Аэрофлот
"FV": "SDM", # Россия
"DP": "PBD", # Победа
"S7": "SBI", # S7
"U6": "SVR", # Уральские авиалинии
"UT": "UTS", # UTair
"N4": "NWS", # Nordwind
"5N": "AUL", # Smartavia
"7K": "KYV", # Ямал
"6W": "TZA", # Saratov (исторический)
"ZX": "AZS", # Azimuth
"RT": "RLT", # РУСЛАЙН
"TK": "THY", # Turkish Airlines
"LH": "DLH", # Lufthansa
"AF": "AFR", # Air France
"BA": "BAW", # British Airways
"EK": "UAE", # Emirates
"QR": "QTR", # Qatar Airways
"SV": "SVA", # Saudia
"ET": "ETH", # Ethiopian
"FZ": "FDB", # flydubai
"CZ": "CSN", # China Southern
"CA": "CCA", # Air China
"MU": "CES", # China Eastern
"HU": "CHH", # Hainan Airlines
"9C": "CQH", # Spring Airlines
"MS": "MSR", # EgyptAir
"AT": "RAM", # Royal Air Maroc
"IR": "IRA", # Iran Air
"W5": "IRM", # Mahan Air
"KC": "KZR", # Air Astana
"HY": "UZB", # Uzbekistan Airways
"T5": "TUA", # Turkmenistan Airlines
"J2": "AHY", # Azerbaijan Airlines
"A9": "TGZ", # Georgian Airways
"QN": "RLU", # Royal Flight
}
def _flight_number_to_callsign(flight_number: str) -> Optional[str]:
"""Convert 'SU 1057' (IATA) to 'AFL1057' (ICAO callsign) for RTL-SDR matching."""
m = re.match(r'^([A-Z0-9]{1,3})\s*(\d+)$', flight_number.strip())
if not m:
return None
iata_code, num = m.group(1), m.group(2)
icao_code = AIRLINE_IATA_TO_ICAO.get(iata_code)
if icao_code:
return f"{icao_code}{num}"
return None
def _ft_to_m(ft: Optional[int]) -> Optional[int]:
    """Convert feet to whole metres (truncated by int()); None passes through."""
    return None if ft is None else int(ft * FT_TO_M)
# ── source matchers ───────────────────────────────────────────
def find_rtlsdr_flight(conn, flight_number: str, flight_date: date) -> Optional[int]:
    """Look up the fr24.flights.flight_id of the RTL-SDR recording of a flight.

    The IATA flight number (e.g. 'SU 1057') is first converted to an ICAO
    callsign ('AFL1057'); when no callsign can be derived the lookup is
    skipped and None is returned. Among rows whose callsign matches and whose
    started_at falls on flight_date, the earliest one is returned.
    """
    icao_callsign = _flight_number_to_callsign(flight_number)
    if not icao_callsign:
        return None

    query = (
        "SELECT f.flight_id FROM fr24.flights f "
        "WHERE f.callsign = %s AND f.started_at::date = %s "
        "ORDER BY f.started_at LIMIT 1"
    )
    with conn.cursor() as cursor:
        cursor.execute(query, (icao_callsign, flight_date))
        hit = cursor.fetchone()

    if hit:
        return hit[0]
    return None
def _extract_flight_num(flight_number: str) -> str:
"""Extract numeric part: 'FV 6807''6807', 'SU6807''6807'."""
digits = re.sub(r'[^0-9]', '', flight_number)
return digits
def find_fr24_track(conn, flight_number: str, flight_date: date,
                    origin_iata: str = None, destination_iata: str = None
                    ) -> Optional[Tuple[int, str]]:
    """Find the FR24 track for a scheduled flight.

    Returns (id, aircraft_type) from fr24_ext.flight_tracks_fr24, or None.

    Two lookups are made, newest fetched_at first in both:
    1. Exact flight_number match. A single row is returned as is. With
       several rows, the first one whose route (ICAO translated to IATA)
       equals origin/destination wins; otherwise the newest row is returned.
    2. Match on the digits of the flight number only. Here a row is returned
       only if its route matches (full route first, then origin only);
       without a route match the result is None.
    """
    numeric_part = _extract_flight_num(flight_number)
    if not numeric_part:
        return None

    def route_hit(candidates, check_destination):
        # First candidate whose origin (and optionally destination) agrees
        # with the schedule once translated from ICAO to IATA.
        for cand in candidates:
            if ICAO_TO_IATA.get(cand[2]) != origin_iata:
                continue
            if check_destination and ICAO_TO_IATA.get(cand[3]) != destination_iata:
                continue
            return (cand[0], cand[1])
        return None

    exact_sql = """
SELECT id, aircraft_type, origin_icao, destination_icao
FROM fr24_ext.flight_tracks_fr24
WHERE flight_number = %s AND flight_date = %s
ORDER BY fetched_at DESC
"""
    # FR24 flight_number format: 'SU6807' (code + digits)
    # Schedule format: 'FV 6807' (IATA code + space + digits)
    # so the fallback compares the numeric part only.
    numeric_sql = """
SELECT id, aircraft_type, origin_icao, destination_icao
FROM fr24_ext.flight_tracks_fr24
WHERE regexp_replace(flight_number, '[^0-9]', '', 'g') = %s
AND flight_date = %s
ORDER BY fetched_at DESC
"""

    with conn.cursor() as cursor:
        cursor.execute(exact_sql, (flight_number, flight_date))
        exact = cursor.fetchall()
        if exact:
            # Several exact matches: try to disambiguate by route first.
            if len(exact) > 1 and origin_iata and destination_iata:
                matched = route_hit(exact, True)
                if matched is not None:
                    return matched
            newest = exact[0]
            return (newest[0], newest[1])

        cursor.execute(numeric_sql, (numeric_part, flight_date))
        loose = cursor.fetchall()

    if not loose:
        return None

    # Full route match (preferred)
    if origin_iata and destination_iata:
        matched = route_hit(loose, True)
        if matched is not None:
            return matched

    # Fallback: origin only; None when nothing matches
    if origin_iata:
        return route_hit(loose, False)
    return None
def find_fa_track(conn, flight_number: str, flight_date: date,
                  origin_iata: str = None, destination_iata: str = None
                  ) -> Optional[Tuple[int, str]]:
    """Find the FlightAware track for a scheduled flight.

    Returns (id, aircraft_type) from fr24_ext.flight_tracks_fa, or None.

    Two lookups are made, newest fetched_at first in both:
    1. ident_iata equal to the flight number with spaces removed. A single
       row is returned as is. With several rows, the first one whose route
       (ICAO translated to IATA) equals origin/destination wins; otherwise
       the newest row is returned.
    2. Match on the digits of ident_iata only. Here a row is returned only
       if its route matches (full route first, then origin only); without a
       route match the result is None.
    """
    numeric_part = _extract_flight_num(flight_number)
    if not numeric_part:
        return None
    compact_ident = flight_number.replace(" ", "")

    def as_result(record):
        return (record[0], record[1])

    def full_route_matches(record):
        return (ICAO_TO_IATA.get(record[2]) == origin_iata
                and ICAO_TO_IATA.get(record[3]) == destination_iata)

    def origin_matches(record):
        return ICAO_TO_IATA.get(record[2]) == origin_iata

    route_known = bool(origin_iata and destination_iata)

    exact_sql = """
SELECT id, aircraft_type, origin_icao, destination_icao
FROM fr24_ext.flight_tracks_fa
WHERE ident_iata = %s AND flight_date = %s
ORDER BY fetched_at DESC
"""
    numeric_sql = """
SELECT id, aircraft_type, origin_icao, destination_icao
FROM fr24_ext.flight_tracks_fa
WHERE regexp_replace(ident_iata, '[^0-9]', '', 'g') = %s
AND flight_date = %s
ORDER BY fetched_at DESC
"""

    with conn.cursor() as cursor:
        # Exact match on ident_iata
        cursor.execute(exact_sql, (compact_ident, flight_date))
        exact = cursor.fetchall()
        if exact:
            if len(exact) > 1 and route_known:
                for record in exact:
                    if full_route_matches(record):
                        return as_result(record)
            return as_result(exact[0])

        # Try by numeric ident + route
        cursor.execute(numeric_sql, (numeric_part, flight_date))
        loose = cursor.fetchall()

    if not loose:
        return None

    # Full route match (preferred)
    if route_known:
        for record in loose:
            if full_route_matches(record):
                return as_result(record)

    # Fallback: match by origin only (full route match failed)
    if origin_iata:
        for record in loose:
            if origin_matches(record):
                return as_result(record)

    # No match
    return None
# ── point fetchers ────────────────────────────────────────────
def get_rtlsdr_points(conn, flight_id: int) -> List[Dict]:
    """Fetch the RTL-SDR track points of one fr24.flights row.

    Each point is a dict with observed_at, lat, lon, altitude_m, speed_kt
    and heading, ordered by observed_at. lat/lon are read from the point
    geometry via ST_Y/ST_X.
    """
    points_sql = """
SELECT tp.observed_at,
ST_Y(tp.geom) AS lat,
ST_X(tp.geom) AS lon,
tp.altitude_m,
tp.ground_speed_kt AS speed_kt,
tp.heading_deg AS heading
FROM fr24.track_points tp
JOIN fr24.tracks t ON t.track_id = tp.track_id
WHERE t.flight_id = %s
ORDER BY tp.observed_at
"""
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cursor:
        cursor.execute(points_sql, (flight_id,))
        return list(map(dict, cursor.fetchall()))
def get_fr24_points(conn, track_id: int) -> List[Dict]:
    """Fetch the points of one FR24 track, ordered by observed_at.

    Each point is a dict with observed_at, lat, lon, speed_kt, heading and
    altitude_m; the stored altitude_ft is replaced by altitude_m.
    """
    points_sql = """
SELECT observed_at, lat, lon,
altitude_ft, gspeed_kt AS speed_kt, heading
FROM fr24_ext.track_points_fr24
WHERE track_id = %s
ORDER BY observed_at
"""
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cursor:
        cursor.execute(points_sql, (track_id,))
        fetched = cursor.fetchall()

    points = []
    for record in fetched:
        point = dict(record)
        # convert ft → m
        point["altitude_m"] = _ft_to_m(point.pop("altitude_ft", None))
        points.append(point)
    return points
def get_fa_points(conn, track_id: int) -> List[Dict]:
    """Fetch the points of one FlightAware track, ordered by observed_at.

    Each point is a dict with observed_at, lat, lon, speed_kt, heading and
    altitude_m; the stored altitude_ft is replaced by altitude_m.
    """
    points_sql = """
SELECT observed_at, lat, lon,
altitude_ft, gspeed_kt AS speed_kt, heading
FROM fr24_ext.track_points_fa
WHERE track_id = %s
ORDER BY observed_at
"""
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cursor:
        cursor.execute(points_sql, (track_id,))
        fetched = cursor.fetchall()

    points = []
    for record in fetched:
        point = dict(record)
        # convert ft → m
        point["altitude_m"] = _ft_to_m(point.pop("altitude_ft", None))
        points.append(point)
    return points
# ── mart writers ──────────────────────────────────────────────
def upsert_mart_flight(conn, sched: Dict, source_info: Dict) -> int:
    """Insert or update one row of fr24_mart.flights and return its id.

    Rows are keyed by (flight_date, callsign); the callsign falls back to the
    flight number when the schedule row has none. On conflict every column is
    overwritten except icao24 (left as stored) and aircraft_type, which keeps
    the stored value when the new one is NULL.

    sched must contain "flight_date"; all other keys of sched and source_info
    are optional.
    """
    upsert_sql = """
INSERT INTO fr24_mart.flights
(flight_number, callsign, icao24, airline_iata,
origin_iata, destination_iata, aircraft_type,
flight_date, scheduled_dep,
has_schedule, has_rtlsdr, has_fr24, has_fa,
track_source, track_points,
schedule_id, fr24_track_id, fa_track_id, rtlsdr_flight_id,
updated_at)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())
ON CONFLICT (flight_date, callsign) DO UPDATE SET
flight_number = EXCLUDED.flight_number,
airline_iata = EXCLUDED.airline_iata,
origin_iata = EXCLUDED.origin_iata,
destination_iata = EXCLUDED.destination_iata,
aircraft_type = COALESCE(EXCLUDED.aircraft_type, fr24_mart.flights.aircraft_type),
scheduled_dep = EXCLUDED.scheduled_dep,
has_schedule = EXCLUDED.has_schedule,
has_rtlsdr = EXCLUDED.has_rtlsdr,
has_fr24 = EXCLUDED.has_fr24,
has_fa = EXCLUDED.has_fa,
track_source = EXCLUDED.track_source,
track_points = EXCLUDED.track_points,
schedule_id = EXCLUDED.schedule_id,
fr24_track_id = EXCLUDED.fr24_track_id,
fa_track_id = EXCLUDED.fa_track_id,
rtlsdr_flight_id = EXCLUDED.rtlsdr_flight_id,
updated_at = now()
RETURNING id
"""
    with conn.cursor() as cursor:
        flight_number = sched.get("flight_number")
        # Values in the exact order of the INSERT column list.
        values = (
            flight_number,                              # flight_number
            sched.get("callsign") or sched.get("flight_number"),  # callsign
            None,                                       # icao24
            sched.get("airline_iata"),                  # airline_iata
            sched.get("origin_iata"),                   # origin_iata
            sched.get("destination_iata"),              # destination_iata
            source_info.get("aircraft_type"),           # aircraft_type
            sched["flight_date"],                       # flight_date
            sched.get("scheduled_at"),                  # scheduled_dep
            True,                                       # has_schedule
            source_info.get("has_rtlsdr", False),       # has_rtlsdr
            source_info.get("has_fr24", False),         # has_fr24
            source_info.get("has_fa", False),           # has_fa
            source_info.get("track_source"),            # track_source
            source_info.get("track_points", 0),         # track_points
            sched.get("schedule_id"),                   # schedule_id
            source_info.get("fr24_track_id"),           # fr24_track_id
            source_info.get("fa_track_id"),             # fa_track_id
            source_info.get("rtlsdr_flight_id"),        # rtlsdr_flight_id
        )
        cursor.execute(upsert_sql, values)
        return cursor.fetchone()[0]
def insert_mart_points(conn, mart_flight_id: int, points: List[Dict],
                       source: str, aircraft_type: str):
    """Replace the mart track points of one flight.

    All existing fr24_mart.track_points rows of the flight are deleted first
    (also when *points* is empty), then one row per point is inserted with a
    noise_score derived from the point's altitude and the aircraft type
    ("default" when the type is empty).

    A missing altitude is treated as 0 m, both for the stored altitude_m and
    for the noise calculation.
    """
    insert_sql = """
INSERT INTO fr24_mart.track_points
(flight_id, observed_at, lat, lon, altitude_m,
speed_kt, heading, source, noise_score)
VALUES %s
"""
    with conn.cursor() as cursor:
        cursor.execute("DELETE FROM fr24_mart.track_points WHERE flight_id = %s", (mart_flight_id,))
        if not points:
            return

        def as_row(point):
            altitude_m = float(point.get("altitude_m") or 0)
            # The noise model takes feet, the mart stores metres.
            noise_db = altitude_to_noise_db(altitude_m / FT_TO_M, aircraft_type or "default")
            return (
                mart_flight_id,
                point["observed_at"],
                point["lat"],
                point["lon"],
                altitude_m,
                point.get("speed_kt"),
                point.get("heading"),
                source,
                round(noise_db, 2),
            )

        rows = [as_row(point) for point in points]
        psycopg2.extras.execute_values(cursor, insert_sql, rows)
def update_noise_grid(conn, flight_date: date):
    """Aggregate the date's mart track points into 0.01° noise_grid cells.

    Points of all flights with the given flight_date are grouped by lat/lon
    rounded to two decimals. Per cell the number of distinct flights, the
    average noise_score and the average altitude are upserted on
    (grid_lat, grid_lon, period_date). Cells already stored for the date that
    receive no points in this run are not touched by this statement.
    """
    grid_sql = """
INSERT INTO fr24_mart.noise_grid
(grid_lat, grid_lon, period_date, flight_count, noise_score, avg_altitude_m, updated_at)
SELECT
round(lat::numeric, 2) AS grid_lat,
round(lon::numeric, 2) AS grid_lon,
%s AS period_date,
COUNT(DISTINCT flight_id) AS flight_count,
AVG(tp.noise_score) AS noise_score,
AVG(altitude_m) AS avg_altitude_m,
now()
FROM fr24_mart.track_points tp
JOIN fr24_mart.flights f ON f.id = tp.flight_id
WHERE f.flight_date = %s
GROUP BY grid_lat, grid_lon
ON CONFLICT (grid_lat, grid_lon, period_date) DO UPDATE SET
flight_count = EXCLUDED.flight_count,
noise_score = EXCLUDED.noise_score,
avg_altitude_m = EXCLUDED.avg_altitude_m,
updated_at = now()
"""
    # The date is bound twice: once as period_date, once as the filter.
    bound = (flight_date, flight_date)
    with conn.cursor() as cursor:
        cursor.execute(grid_sql, bound)
def update_source_coverage(conn, flight_date: date):
    """Recompute the fr24_mart.source_coverage row of one date.

    Counts the date's mart flights in total and per source flag
    (has_rtlsdr / has_fr24 / has_fa), derives the flights without any track
    source and the percentage per source, and upserts the result on
    coverage_date. The total / FR24 / RTL-SDR counts are also logged at
    INFO level before the upsert.
    """
    counts_sql = (
        "SELECT count(*), count(*) FILTER(WHERE has_fr24), count(*) FILTER(WHERE has_rtlsdr)"
        " FROM fr24_mart.flights WHERE flight_date = %s"
    )
    coverage_sql = """
INSERT INTO fr24_mart.source_coverage
(coverage_date, total_schedule, with_rtlsdr, with_fr24, with_fa,
schedule_only, rtlsdr_pct, fr24_pct, fa_pct, updated_at)
SELECT
%s,
COUNT(*) AS total_schedule,
COUNT(*) FILTER (WHERE has_rtlsdr) AS with_rtlsdr,
COUNT(*) FILTER (WHERE has_fr24) AS with_fr24,
COUNT(*) FILTER (WHERE has_fa) AS with_fa,
COUNT(*) FILTER (WHERE NOT has_rtlsdr AND NOT has_fr24 AND NOT has_fa) AS schedule_only,
ROUND(100.0 * COUNT(*) FILTER (WHERE has_rtlsdr) / NULLIF(COUNT(*),0), 1),
ROUND(100.0 * COUNT(*) FILTER (WHERE has_fr24) / NULLIF(COUNT(*),0), 1),
ROUND(100.0 * COUNT(*) FILTER (WHERE has_fa) / NULLIF(COUNT(*),0), 1),
now()
FROM fr24_mart.flights
WHERE flight_date = %s
ON CONFLICT (coverage_date) DO UPDATE SET
total_schedule = EXCLUDED.total_schedule,
with_rtlsdr = EXCLUDED.with_rtlsdr,
with_fr24 = EXCLUDED.with_fr24,
with_fa = EXCLUDED.with_fa,
schedule_only = EXCLUDED.schedule_only,
rtlsdr_pct = EXCLUDED.rtlsdr_pct,
fr24_pct = EXCLUDED.fr24_pct,
fa_pct = EXCLUDED.fa_pct,
updated_at = now()
"""
    with conn.cursor() as cursor:
        cursor.execute(counts_sql, (flight_date,))
        total, with_fr24, with_rtlsdr = cursor.fetchone()
        log.info("source_coverage debug: total=%s fr24=%s rtlsdr=%s",
                 total, with_fr24, with_rtlsdr)
        # The date is bound twice: once as coverage_date, once as the filter.
        cursor.execute(coverage_sql, (flight_date, flight_date))
# ── main ──────────────────────────────────────────────────────
def _collect_track(conn, sched: Dict, target_date: date
                   ) -> Tuple[Dict, List[Dict], Optional[str]]:
    """Probe all track sources for one schedule row and pick the best track.

    Sources are tried in priority order RTL-SDR > FR24 > FlightAware. Every
    source is looked up (so the has_* flags and ids are always recorded), but
    points are taken only from the first source that actually has any.

    Returns (source_info, points, source_label); source_label is None and
    points is empty when no source has points.
    """
    flight_number = sched["flight_number"]
    origin = sched.get("origin_iata")
    destination = sched.get("destination_iata")

    source_info: Dict = {
        "has_rtlsdr": False, "has_fr24": False, "has_fa": False,
        "track_source": None, "track_points": 0,
        "aircraft_type": None,
        "fr24_track_id": None, "fa_track_id": None, "rtlsdr_flight_id": None,
    }
    points: List[Dict] = []
    source_label = None

    # 1. Try RTL-SDR
    rtlsdr_id = find_rtlsdr_flight(conn, flight_number, target_date)
    if rtlsdr_id:
        source_info["has_rtlsdr"] = True
        source_info["rtlsdr_flight_id"] = rtlsdr_id
        pts = get_rtlsdr_points(conn, rtlsdr_id)
        if pts:
            points = pts
            source_label = "rtlsdr"

    # 2. Try FR24
    fr24_result = find_fr24_track(
        conn, flight_number, target_date,
        origin_iata=origin,
        destination_iata=destination,
    )
    if fr24_result:
        source_info["has_fr24"] = True
        source_info["fr24_track_id"] = fr24_result[0]
        if not points:
            pts = get_fr24_points(conn, fr24_result[0])
            if pts:
                points = pts
                source_label = "fr24"
                source_info["aircraft_type"] = fr24_result[1]

    # 3. Try FlightAware
    fa_result = find_fa_track(
        conn, flight_number, target_date,
        origin_iata=origin,
        destination_iata=destination,
    )
    if fa_result:
        source_info["has_fa"] = True
        source_info["fa_track_id"] = fa_result[0]
        if not points:
            pts = get_fa_points(conn, fa_result[0])
            if pts:
                points = pts
                source_label = "fa"
                source_info["aircraft_type"] = fa_result[1]

    # The chosen source may not carry an aircraft type (RTL-SDR never does
    # here); borrow it from another matched track so the noise score is not
    # computed with the "default" type when a real one is known.
    if not source_info["aircraft_type"]:
        for result in (fr24_result, fa_result):
            if result and result[1]:
                source_info["aircraft_type"] = result[1]
                break

    source_info["track_source"] = source_label
    source_info["track_points"] = len(points)
    return source_info, points, source_label


def build(target_date: date, conn) -> Dict:
    """Build the mart for one date from the schedule and all track sources.

    For every flight in fr24_ext.schedule on target_date (one row per
    flight_number and direction, earliest scheduled_at first) the best track
    is selected, the flight is upserted into fr24_mart.flights and its points
    are written to fr24_mart.track_points. Afterwards the noise grid and the
    source coverage of the date are recomputed.

    Each flight is committed on its own: a failing flight is rolled back,
    counted in stats["errors"] and skipped without discarding the flights
    processed before it. The grid/coverage update is committed separately.

    Returns a stats dict with the keys date, schedule_flights, mart_flights,
    with_track and errors.
    """
    log.info("Mart build: starting for %s", target_date)
    stats = {
        "date": str(target_date),
        "schedule_flights": 0,
        "mart_flights": 0,
        "with_track": 0,
        "errors": 0,
    }

    # Load schedule for the date
    # NOTE(review): rows are distinct per (flight_number, direction) while the
    # mart upsert is keyed by (flight_date, callsign) with callsign =
    # flight_number, so two directions of one flight number presumably end up
    # in the same mart row — confirm this is intended.
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(
            """
SELECT DISTINCT ON (flight_number, direction)
schedule_id, flight_number, airline_iata, origin_iata, destination_iata,
scheduled_at, aircraft_type, flight_date,
flight_number AS callsign
FROM fr24_ext.schedule
WHERE flight_date = %s
ORDER BY flight_number, direction, scheduled_at
""",
            (target_date,),
        )
        schedule = [dict(r) for r in cur.fetchall()]
    stats["schedule_flights"] = len(schedule)
    log.info("Mart build: %d schedule flights", len(schedule))

    for sched in schedule:
        flight_number = sched["flight_number"]
        try:
            source_info, points, source_label = _collect_track(conn, sched, target_date)
            mart_id = upsert_mart_flight(conn, sched, source_info)
            if points:
                insert_mart_points(
                    conn, mart_id, points, source_label,
                    source_info.get("aircraft_type") or "default",
                )
            # Commit per flight so that the rollback in the error path below
            # only undoes the failing flight, not everything before it.
            conn.commit()
        except Exception as e:
            conn.rollback()
            stats["errors"] += 1
            log.exception("Mart: error processing %s: %s", flight_number, e)
            continue
        if points:
            stats["with_track"] += 1
        stats["mart_flights"] += 1

    try:
        update_noise_grid(conn, target_date)
        update_source_coverage(conn, target_date)
        conn.commit()
    except Exception as e:
        conn.rollback()
        log.exception("Mart: error updating grid/coverage: %s", e)
        stats["errors"] += 1

    log.info("Mart build done: %s", stats)
    return stats