Files
wiki/tasks/flightradar24/monitoring/main.py
2026-04-19 18:30:01 +03:00

115 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""FR24 monitoring service — checks disk, DB size, capture lag, throughput every 60s."""
import os
import time
import logging
import subprocess
import psycopg2
# Log records carry a second-resolution timestamp, the level name and the message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
    level=logging.INFO,
)
log = logging.getLogger("monitor")

# Space-separated "keyword=value" connection string handed to psycopg2.connect.
# Every field is taken from the environment, with the listed fallback used
# when the variable is not set.
DB_DSN = " ".join(
    f"{keyword}={os.environ.get(env_name, fallback)}"
    for keyword, env_name, fallback in (
        ("host", "POSTGRES_HOST", "postgres"),
        ("port", "POSTGRES_PORT", "5432"),
        ("dbname", "POSTGRES_DB", "fr24"),
        ("user", "POSTGRES_USER", "fr24"),
        ("password", "POSTGRES_PASSWORD", "change-me"),
    )
)

# Pause between two check rounds, in seconds.
INTERVAL = int(os.environ.get("MONITORING_INTERVAL_SECONDS", "60"))
# Disk usage (percent) above which a warning is logged.
DISK_WARN_PCT = 80
# Capture lag (seconds) above which a warning is logged.
LAG_WARN_SEC = 300  # 5 minutes
def get_disk_usage() -> str:
    """Return the usage percentage of the root filesystem as a digit string.

    Runs ``df -P /`` and reads the Capacity column of the last output line.

    Returns:
        The percentage without the trailing ``%`` (e.g. ``'45'``), or ``'?'``
        when ``df`` cannot be run, exits non-zero, times out, or reports a
        capacity that is not a plain number. Never raises.
    """
    try:
        result = subprocess.run(
            ["df", "-P", "/"],
            capture_output=True, text=True, timeout=5,
            check=True,  # a failing df must not be parsed as if it succeeded
        )
        # POSIX output columns:
        #   Filesystem 1024-blocks Used Available Capacity Mounted-on
        # The last line is the data row for "/".
        fields = result.stdout.strip().splitlines()[-1].split()
        pct = fields[4].rstrip("%")
        # Callers convert this with int(); reject placeholders such as "-"
        # here so they only ever see digits or "?".
        if not pct.isdecimal():
            raise ValueError(f"unexpected capacity field: {fields[4]!r}")
        return pct
    except Exception as e:
        log.warning("disk check failed: %s", e)
        return "?"
def run_checks():
    """Run one round of checks, print a metrics line, and log threshold warnings.

    Collects:
      * disk usage of ``/`` via ``get_disk_usage()``;
      * the size of the current database;
      * capture lag: seconds since the newest ``observed_at`` in
        ``fr24.raw_packets`` (``N/A`` when the table has no rows);
      * throughput: number of rows observed in the last 5 minutes.

    Any database error is logged and the DB-derived fields are reported as
    ``ERR``; the metrics line is still printed. The connection is always
    closed, including when a query fails part-way through.
    """
    disk_pct_str = get_disk_usage()

    conn = None
    try:
        conn = psycopg2.connect(DB_DSN, connect_timeout=5)
        conn.autocommit = True
        # The cursor context manager closes the cursor even if a query raises.
        with conn.cursor() as cur:
            # DB size
            cur.execute("SELECT pg_database_size(current_database())")
            db_bytes = cur.fetchone()[0]
            db_size_gb = db_bytes / (1024 ** 3)
            # Report in GB once the database reaches 1 GB, otherwise in MB.
            if db_size_gb >= 1:
                db_size_str = f"{db_size_gb:.2f}GB"
            else:
                db_size_str = f"{db_bytes / (1024 ** 2):.1f}MB"

            # Capture lag
            cur.execute(
                "SELECT EXTRACT(EPOCH FROM (now() - MAX(observed_at))) FROM fr24.raw_packets"
            )
            row = cur.fetchone()
            # MAX() over an empty table yields NULL, i.e. no lag to report.
            lag_sec = int(row[0]) if row and row[0] is not None else None
            lag_str = f"{lag_sec}s" if lag_sec is not None else "N/A"

            # Throughput: packets in last 5 minutes
            cur.execute(
                "SELECT COUNT(*) FROM fr24.raw_packets "
                "WHERE observed_at >= now() - INTERVAL '5 minutes'"
            )
            throughput = cur.fetchone()[0]
        db_ok = True
    except Exception as e:
        log.warning("db check failed: %s", e)
        db_size_str = "ERR"
        lag_sec = None
        lag_str = "ERR"
        throughput = "ERR"
        db_ok = False
    finally:
        # Release the connection on every path so failed rounds do not
        # leave connections open on the server.
        if conn is not None:
            try:
                conn.close()
            except Exception as e:
                log.warning("db close failed: %s", e)

    # Emit metrics line
    disk_display = f"{disk_pct_str}%" if disk_pct_str != "?" else "?"
    print(
        f"[monitor] disk={disk_display} db_size={db_size_str} "
        f"capture_lag={lag_str} throughput={throughput}pkt/5min",
        flush=True,
    )

    # Warnings
    # isdecimal() skips both the "?" sentinel and any other non-numeric value,
    # so int() cannot raise here.
    if disk_pct_str.isdecimal() and int(disk_pct_str) > DISK_WARN_PCT:
        log.warning("DISK USAGE HIGH: %s%%", disk_pct_str)
    if db_ok and lag_sec is not None and lag_sec > LAG_WARN_SEC:
        log.warning("CAPTURE LAG HIGH: %ds (threshold %ds)", lag_sec, LAG_WARN_SEC)
def main():
    """Create the readiness marker file, then run checks forever.

    One round of ``run_checks()`` is executed every ``INTERVAL`` seconds
    (measured from the end of the previous round). An unexpected exception in
    a round is logged with its traceback and does not stop the loop.
    """
    log.info("FR24 monitoring started (interval=%ds)", INTERVAL)
    # Signal readiness: create (or truncate) an empty marker file.
    with open("/tmp/monitoring-ready", "w"):
        pass
    while True:
        try:
            run_checks()
        except Exception as e:
            # Top-level boundary: keep the service alive, but record the full
            # traceback so the failure can be diagnosed from the logs.
            log.exception("unexpected error in run_checks: %s", e)
        time.sleep(INTERVAL)


if __name__ == "__main__":
    main()