Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair.

F1a (ORCH-099) exposes the raw signal at GET /metrics; F1b reads it, augments it with host / container / dependency probes, runs each signal through a generalised pure decision function (`decide(signal_active, prev, now, cooldown)`, a strict superset of `disk_watchdog.decide_action`) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel.

Key properties (ADR-001):
- Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup).
- Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe.
- Never-raise on three levels (per-source / per-tick / per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit).
- Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; the sidecar carries orch_down + an opt-in 97% ceiling (default off).
- NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched.

env_file is optional so that a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator.

Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14).
Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage.

Refs: ORCH-100
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
112 lines
4.9 KiB
YAML
112 lines
4.9 KiB
YAML
services:
  # Production orchestrator instance.
  orchestrator:
    build: .
    container_name: orchestrator
    restart: unless-stopped
    # ORCH-040: run as the host uid:gid (slin=1000:1000) rather than root, so
    # that pipeline artifacts (worktree + docs) are created as slin:slin and git
    # on the host works without a manual chown. Access to docker.sock is kept
    # via group_add: ["999"] (LANDMINE 1 — do NOT remove). See ADR-001 ORCH-040.
    user: "1000:1000"
    # init: true injects docker-init (tini) as PID 1 so reparented grandchild
    # processes from the claude/node subprocess tree are reaped (no zombies, B-2).
    init: true
    network_mode: host
    volumes:
      - ./data:/app/data
      - /home/slin/repos:/repos
      - /var/run/docker.sock:/var/run/docker.sock
      - /usr/lib/node_modules/@anthropic-ai/claude-code:/opt/claude-code:ro
      - /usr/bin/node:/usr/bin/node:ro
      - /home/slin/.claude:/home/slin/.claude
      - /home/slin/.claude.json:/home/slin/.claude.json:ro
      # ORCH-040: target matches HOME=/home/slin (launcher), not /root/.ssh.
      - /home/slin/.orchestrator-ssh:/home/slin/.ssh:ro
    env_file: .env
    environment:
      - ORCH_REPOS_DIR=/repos
      - ORCH_HOST_REPOS_DIR=/home/slin/repos
      # legacy enduro deployer (read via os.environ, keep as-is):
      - DEPLOY_SSH_USER=slin
      - DEPLOY_SSH_HOST=127.0.0.1
      - DEPLOY_HOOK_SCRIPT=/home/slin/bin/enduro-deploy-hook.sh
      # ORCH-036 self-deploy (read via pydantic ORCH_ prefix; host-network ->
      # 127.0.0.1, ssh key mounted):
      - ORCH_DEPLOY_SSH_USER=slin
      - ORCH_DEPLOY_SSH_HOST=127.0.0.1
      - ORCH_DEPLOY_HOOK_SCRIPT=scripts/orchestrator-deploy-hook.sh
      - ORCH_DEPLOY_HOST_REPO_PATH=/home/slin/repos/orchestrator
    group_add:
      - "999"

  # ORCH-100 (FND/F1b): sidecar-watchdog — the monitoring brain in a SEPARATE
  # container (observer separated from observed, ADR-001 D2). Deploying it builds
  # ONLY this service — the prod `orchestrator` is NOT rebuilt/restarted.
  #   * network_mode: host -> /metrics reachable at http://127.0.0.1:8500/metrics
  #     and host interfaces visible for memory/disk reads.
  #   * docker.sock is mounted :ro and the code is GET-only.
  #     NOTE(review): :ro on a bind-mounted Unix socket only protects the socket
  #     file itself (no unlink/chmod/replace); it does NOT restrict which Docker
  #     API calls a connected client may issue. The GET-only code path is the
  #     effective read-only guard — do not rely on :ro alone.
  #   * host disk paths bind-mounted :ro so shutil.disk_usage sees the host FS but
  #     can never write (opt-in disk ceiling, D6).
  #   * mem_limit caps the thin stdlib daemon (D2): OOM = early "sidecar grew"
  #     signal.
  #   * WATCHDOG_ENABLED=false (or simply not starting the service) -> inert.
  orchestrator-watchdog:
    build:
      context: .
      dockerfile: watchdog/Dockerfile
    container_name: orchestrator-watchdog
    restart: unless-stopped
    init: true
    network_mode: host
    mem_limit: 128m
    mem_reservation: 32m
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /home/slin/repos:/repos:ro
      - ./data:/app/data:ro
    # Optional env_file (required: false): a missing .env.watchdog must NOT fail
    # `docker compose up` for the prod orchestrator (self-hosting safety). Absent
    # file -> WATCHDOG_* defaults, no token -> fail-safe (logs, does not send).
    # NOTE(review): the long path/required form needs Docker Compose v2.24.0+;
    # confirm the deploy host's compose version.
    env_file:
      - path: .env.watchdog
        required: false
    group_add:
      - "999"

  # ORCH-31: staging instance (port 8501, isolated DB).
  # Starts ONLY with: docker compose --profile staging up -d orchestrator-staging
  # Normal "docker compose up -d" does NOT start this service.
  orchestrator-staging:
    profiles:
      - staging
    build: .
    container_name: orchestrator-staging
    restart: unless-stopped
    # ORCH-040: same host uid as prod (see the comment on the `orchestrator`
    # service / ADR-001).
    user: "1000:1000"
    init: true
    network_mode: host
    command: ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8501"]
    volumes:
      - ./data/staging:/app/data
      - /home/slin/repos:/repos
      - /var/run/docker.sock:/var/run/docker.sock
      - /usr/lib/node_modules/@anthropic-ai/claude-code:/opt/claude-code:ro
      - /usr/bin/node:/usr/bin/node:ro
      - /home/slin/.claude:/home/slin/.claude
      - /home/slin/.claude.json:/home/slin/.claude.json:ro
      # ORCH-040: target matches HOME=/home/slin (launcher), not /root/.ssh.
      - /home/slin/.orchestrator-ssh:/home/slin/.ssh:ro
    env_file: .env.staging
    environment:
      - ORCH_REPOS_DIR=/repos
      - ORCH_HOST_REPOS_DIR=/home/slin/repos
      - DEPLOY_SSH_USER=slin
      - DEPLOY_SSH_HOST=127.0.0.1
      - DEPLOY_HOOK_SCRIPT=/home/slin/bin/enduro-deploy-hook.sh
      # Staging DB is isolated via the ./data/staging volume mount.
      # Inside the container the path remains /app/data/orchestrator.db (same
      # default), but on the host it physically lives at
      # ./data/staging/orchestrator.db — completely separate from prod
      # ./data/orchestrator.db.
      - ORCH_DB_PATH=/app/data/orchestrator.db
    group_add:
      - "999"