fix(staging): host-side ssh execution + env classification for staging-runner (ORCH-123)
The ORCH-115 deterministic staging-runner ran `docker exec` FROM INSIDE the prod `orchestrator` container, which ships only `openssh-client git curl` — no `docker` CLI (Dockerfile:11). `Popen(["docker", ...])` hit FileNotFoundError -> a PERMANENT environment defect that was mis-routed as a code-fail rollback `deploy-staging -> development` (burning developer-retries). Incident ORCH-116: every self-hosting task reaching deploy-staging was doomed to a false rollback. Fix (adr-0049, additive, flag-gated, never-raise, self-hosting scope; the gate / artifact contract / STAGE_TRANSITIONS / DB schema are byte-for-byte unchanged): - D1: build_staging_command() wraps the SAME `docker exec ... staging_check.py ... --mode stub` in `ssh <user@host> '<...>'` so it runs HOST-SIDE over the existing trusted ssh channel (mirror self_deploy / image_freshness). New flag staging_runner_exec_host_side (default True). No docker CLI/SDK added to the image, docker.sock not used in-container (D2 security). - D3: three-way classify_staging_outcome (suite-ran / permanent-env / transient-infra), disambiguating the exit=1 collision by scanning stderr. - D4: invariant "infra != code-fail" — permanent-env / exhausted transient-infra end in an infra-HOLD (no rollback, no developer-retry), NOT a false FAILED rollback (supersedes ORCH-115 D5). A really-executed failing suite still rolls back (anti-over-tolerance). R-2 verified: a held deploy-staging task is not rolled back by the reconciler. - D5: prod-like preflight() of the host-side channel at startup (main.lifespan, best-effort, never blocks). - D8: snapshot adds permanent_env / exec_host_side / preflight. Docs (golden source, same PR): INFRA.md execution-boundary section, architecture/README.md, CLAUDE.md, CHANGELOG.md, .env.example. Tests: tests/test_orch123_staging_runner_exec.py (TC-01 mandatory regression red->green; TC-02..TC-14 + R-2). ORCH-115 anti-drift green (3 tests updated for the D1/D4/D8 supersession). Full suite: 2131 passed. 
Refs: ORCH-123 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
15
src/main.py
15
src/main.py
@@ -79,6 +79,21 @@ async def lifespan(app: FastAPI):
|
||||
except Exception as e:
|
||||
log.warning(f"Transition-lease recovery skipped: {e}")
|
||||
|
||||
# ORCH-123 (adr-0049 / D5 / FR-4): prod-like preflight of the host-side staging
|
||||
# execution channel. The deploy-staging staging-runner (ORCH-115) runs the suite
|
||||
# HOST-SIDE over ssh because the prod container ships no docker CLI (Dockerfile:11);
|
||||
# probe the channel at startup so a broken environment (no docker on host / staging
|
||||
# down / ssh unreachable / no ssh target) surfaces HERE — not after the fact as a false
|
||||
# rollback of a real task (incident ORCH-116). Purely observational: it never blocks
|
||||
# startup or gates the pipeline, and it never raises (runs after requeue + lease-recovery).
|
||||
try:
|
||||
from . import staging_runner
|
||||
ok, reason = staging_runner.preflight()
|
||||
if not ok:
|
||||
log.warning(f"Staging-runner preflight: {reason}")
|
||||
except Exception as e:
|
||||
log.warning(f"Staging-runner preflight skipped: {e}")
|
||||
|
||||
# ORCH-065: proactive startup reclaim of dead/stale merge-leases, next to the
|
||||
# queue-recovery above. A lease held by the previous (now dead) process pid is
|
||||
# released at once instead of waiting for the TTL / a foreign acquire so the
|
||||
|
||||
@@ -178,26 +178,74 @@ def should_intercept(job: dict) -> bool:
|
||||
# ---------------------------------------------------------------------------
|
||||
# Suite execution (D3 / FR-2 / NFR-3 / AC-8 / AC-9)
|
||||
# ---------------------------------------------------------------------------
|
||||
def build_staging_command() -> list[str]:
|
||||
"""Build the canonical staging-suite argv (same command the LLM-deployer ran).
|
||||
def _ssh_target() -> str:
    """Return the ssh destination used for host-side execution.

    The result is ``user@host`` when both ``settings.deploy_ssh_user`` and
    ``settings.deploy_ssh_host`` are non-blank, the bare host when only the host is
    set, and ``""`` when no host is configured. Callers treat the empty string as
    "no host-side channel" and fall back to the in-container command.
    """
    hostname = (settings.deploy_ssh_host or "").strip()
    if hostname:
        # The user is only consulted once a host is known.
        login = (settings.deploy_ssh_user or "").strip()
        if login:
            return f"{login}@{hostname}"
        return hostname
    return ""
|
||||
|
||||
``docker exec <STAGING_SERVICE> python3 <repos_dir>/<self-repo>/scripts/staging_check.py
|
||||
--base-url http://localhost:<staging_port> --mode stub``. Host-specifics come from
|
||||
config (ORCH-101, no host hardcodes). Self-hosting safety (BR-7 / AC-8 / TC-12):
|
||||
NO restart of 8500, NO ``docker compose up orchestrator`` / ``--build``, NO
|
||||
force-push, NO ``.env`` edit — the runner only reads/executes the staging suite
|
||||
(8501) and writes a log.
|
||||
|
||||
def _channel_ssh_configured() -> bool:
    """Report whether the execution channel has the ssh configuration it needs.

    The value is passed to ``classify_staging_outcome``. With host-side execution
    switched off (``staging_runner_exec_host_side`` falsy) the channel always counts
    as configured; with it on, a non-empty ``_ssh_target()`` is required. Any
    exception while reading the configuration yields ``True`` so the caller relies on
    the process result signals instead (never-raise).
    """
    try:
        host_side = bool(getattr(settings, "staging_runner_exec_host_side", True))
        # In-container mode has no ssh requirement at all.
        return bool(_ssh_target()) if host_side else True
    except Exception:  # noqa: BLE001 - never-raise
        return True
|
||||
|
||||
|
||||
def build_staging_command() -> list[str]:
    """Build the staging-suite argv (same suite command the LLM-deployer ran).

    The inner command is ``docker exec <STAGING_SERVICE> python3
    <repos_dir>/<self-repo>/scripts/staging_check.py --base-url
    http://localhost:<staging_port> --mode stub``; every host-specific value comes
    from ``settings`` (no host hardcodes).

    ORCH-123 (D1, adr-0049): when ``staging_runner_exec_host_side`` (default True) is
    set AND ``_ssh_target()`` is non-empty, the inner command is shell-quoted into a
    single string and wrapped as ``ssh -o StrictHostKeyChecking=no <target>
    '<docker exec ...>'`` so that it runs on the host. With the flag off or no ssh
    target configured, the plain in-container ``docker exec`` argv is returned.

    Self-hosting safety (BR-7 / AC-8 / TC-08): the argv carries only the
    ``docker exec ... staging_check.py ... --mode stub`` command — no restart, no
    ``docker compose up`` / ``--build``, no force-push, no ``.env`` edit.

    Returns:
        The argv list, either ssh-wrapped (5 elements) or the bare inner command.
    """
    from .qg.checks import SELF_HOSTING_REPO

    repos_dir = (settings.repos_dir or "/repos").rstrip("/")
    script = f"{repos_dir}/{SELF_HOSTING_REPO}/scripts/staging_check.py"
    base_url = f"http://localhost:{int(settings.staging_port)}"
    inner_argv = [
        "docker", "exec", STAGING_SERVICE,
        "python3", script,
        "--base-url", base_url,
        "--mode", "stub",
    ]
    # ORCH-123 (D1): host-side ssh-wrap when enabled AND a target is configured.
    if bool(getattr(settings, "staging_runner_exec_host_side", True)):
        target = _ssh_target()
        if target:
            # ssh hands the remote command to the remote shell as one string, so
            # every argument must be shell-quoted before joining.
            remote = shlex.join(inner_argv)
            return ["ssh", "-o", "StrictHostKeyChecking=no", target, remote]
    # Fallback: prior in-container docker exec (no ssh target / flag off).
    return inner_argv
|
||||
|
||||
|
||||
def _resolve_timeout() -> int:
|
||||
@@ -255,6 +303,71 @@ def map_exit_code_to_status(exit_code) -> str:
|
||||
return _map(exit_code)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Three-way outcome classification (D3 / FR-2 / FR-3 / AC-3, AC-4, AC-6)
|
||||
# ---------------------------------------------------------------------------
|
||||
def classify_staging_outcome(result, ssh_configured: bool) -> str:
    """Classify a staging-suite process result into one of three classes (D3).

    Pure and never-raise. ``result`` is read only through ``getattr`` for
    ``returncode``, ``timed_out`` and ``stderr``, so any object (or ``None``) is
    accepted.

    * ``"suite-ran"``       — an int return code other than 126/127/255, no
                              environment marker in stderr, and NOT timed out: the
                              suite demonstrably executed, so the caller trusts the
                              code (``0 -> SUCCESS``, ``!= 0 -> FAILED``).
    * ``"permanent-env"``   — a stderr environment marker (``_ENV_MARKERS``), a
                              shell "cannot execute / command not found" code
                              (126/127), no ssh target configured, or a bare spawn
                              error (``returncode is None`` without a timeout).
                              Retrying is pointless -> immediate infra-HOLD.
    * ``"transient-infra"`` — a timeout, an ssh transport failure (255), or any
                              unknown signal: a retry is meaningful -> bounded DEFER.

    The exit=1 collision (``docker exec`` "No such container" = 1 vs a real suite
    fail = 1) is resolved by scanning stderr for markers, never by the bare code.

    A timeout is tested BEFORE the "recognised exit-code" rule: a child killed by
    the timeout may still carry an int return code (a signal number or a stale
    value), and trusting it would route an infra hiccup into a code-fail rollback.

    Args:
        result: Process result object exposing ``returncode`` / ``timed_out`` /
            ``stderr`` (missing attributes are treated as None / False / "").
        ssh_configured: Whether the execution channel has a usable ssh config.

    Returns:
        One of ``"suite-ran"``, ``"permanent-env"``, ``"transient-infra"``. Any
        internal error yields ``"transient-infra"`` (never a false rollback).
    """
    try:
        rc = getattr(result, "returncode", None)
        timed_out = bool(getattr(result, "timed_out", False))
        raw_stderr = getattr(result, "stderr", "") or ""
        if isinstance(raw_stderr, (bytes, bytearray)):
            # A bytes stderr would raise in the marker scan below and turn every
            # result (even a clean exit 0) into transient-infra.
            raw_stderr = bytes(raw_stderr).decode("utf-8", "replace")
        stderr = raw_stderr.lower()

        # 1. env-marker in stderr -> permanent, REGARDLESS of the exit-code (this
        #    separates a `docker exec` "No such container" exit=1 from a real
        #    suite fail=1 — R-3).
        if any(m in stderr for m in _ENV_MARKERS):
            return "permanent-env"
        # 2. shell "command not found" (127) / "cannot execute" (126) -> permanent.
        if rc in (126, 127):
            return "permanent-env"
        # 3. timeout -> transient, whatever return code the killed child left behind.
        if timed_out:
            return "transient-infra"
        # 4. a recognised exit-code (any int except the ssh transport code 255)
        #    WITHOUT an env-marker -> the suite executed; trust it (BR-3). Checked
        #    BEFORE the channel guards so a real suite fail is never masked.
        if isinstance(rc, int) and rc != 255:
            return "suite-ran"
        # --- below this line the suite did NOT produce a trustworthy exit-code ---
        # 5. ssh transport failure (255) -> transient: a retry is meaningful.
        if rc == 255:
            return "transient-infra"
        # 6. host-side ssh target missing (R-6) OR a bare local spawn-error
        #    (rc None without a timeout) -> permanent env defect.
        if not ssh_configured:
            return "permanent-env"
        if rc is None:
            return "permanent-env"
        # 7. unknown signal -> fail-safe DEFER (never a silent suite-ran).
        return "transient-infra"
    except Exception:  # noqa: BLE001 - never-raise; unknown -> transient (no false rollback)
        return "transient-infra"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Artifact 15-staging-log.md (D6 / FR-4 / AC-2 / AC-8) — mirror write_deploy_log
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -439,17 +552,21 @@ def _infra_retry_count(task_id) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _handle_tool_error(
|
||||
def _handle_transient_infra(
|
||||
task_id, repo: str, work_item_id: str, branch: str, result: proc_group.ProcResult
|
||||
) -> None:
|
||||
"""Suite did NOT execute (tool-error) -> bounded DEFER, then fail-closed (D5).
|
||||
"""Transient infra (timeout / ssh transport) — the suite did not execute but a
|
||||
retry is meaningful (D4). Bounded DEFER: re-queue a fresh ``deployer`` job (which
|
||||
re-enters this runner) with a delay + a restart-safe marker, instead of an
|
||||
immediate rollback that would burn a developer-retry.
|
||||
|
||||
Anti ORCH-110: an infra fault is NOT a code fault, so we re-queue a fresh
|
||||
``deployer`` job (which re-enters this runner) with a delay instead of an
|
||||
immediate FAILED-rollback that would burn a developer-retry. On budget exhaustion
|
||||
-> write ``staging_status: FAILED`` + advance (the existing rollback) + an
|
||||
INFRA-specific alert (explicitly "not a code defect"). Never a silent advance /
|
||||
false green; never wedges the queue. never-raise."""
|
||||
On budget exhaustion -> **infra-HOLD + alert** (D4, SUPERSEDES ORCH-115 D5): NOT
|
||||
the prior fail-closed ``write_staging_log("FAILED") + advance``, which falsely
|
||||
rolled an unresolved infra hiccup back to ``development`` as a code-fail (BR-2,
|
||||
anti-pattern ORCH-110). The task is HELD on ``deploy-staging`` (a red/missing gate
|
||||
keeps the reconciler's ``advance_if_gate_passed`` from rolling it back — R-2); the
|
||||
operator re-drives it after fixing the stand. Never a silent advance / false green;
|
||||
never wedges the queue. never-raise."""
|
||||
retries = _infra_retry_count(task_id)
|
||||
try:
|
||||
max_retries = int(settings.staging_runner_infra_max_retries)
|
||||
@@ -462,7 +579,7 @@ def _handle_tool_error(
|
||||
|
||||
if retries < max_retries:
|
||||
_bump("deferred")
|
||||
reason = "timeout" if result.timed_out else "suite did not execute (tool-error)"
|
||||
reason = "timeout" if result.timed_out else "ssh transport/connection error"
|
||||
task_desc = (
|
||||
f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
|
||||
f"Stage: deploy-staging\nNote: {_INFRA_RETRY_MARKER} "
|
||||
@@ -474,40 +591,77 @@ def _handle_tool_error(
|
||||
"deployer", repo, task_desc, task_id=task_id, available_at_delay_s=delay,
|
||||
)
|
||||
logger.warning(
|
||||
"Task %s (%s): staging suite did not execute (%s) -> infra-DEFER "
|
||||
"(job_id=%s, attempt %d/%d)",
|
||||
"Task %s (%s): staging suite did not execute (%s) -> transient-infra "
|
||||
"DEFER (job_id=%s, attempt %d/%d)",
|
||||
task_id, work_item_id, reason, new_job, retries + 1, max_retries,
|
||||
)
|
||||
except Exception as e: # noqa: BLE001 - never-raise
|
||||
logger.error("staging_runner: infra-DEFER enqueue failed for %s: %s", task_id, e)
|
||||
return
|
||||
|
||||
# Budget exhausted -> fail-closed FAILED (terminal, never a false green).
|
||||
_bump("failed")
|
||||
# Budget exhausted -> infra-HOLD (D4): NO write_staging_log("FAILED"), NO advance,
|
||||
# NO developer-retry. The task is held on deploy-staging for the operator.
|
||||
_bump("permanent_env")
|
||||
logger.error(
|
||||
"Task %s (%s): staging tool-error DEFER budget exhausted (%d) -> fail-closed FAILED",
|
||||
"Task %s (%s): staging transient-infra DEFER budget exhausted (%d) -> "
|
||||
"infra-HOLD on deploy-staging (NOT a code defect, no rollback)",
|
||||
task_id, work_item_id, max_retries,
|
||||
)
|
||||
write_staging_log(repo, work_item_id, branch, result.returncode, "FAILED",
|
||||
result.stdout, tool_error=True)
|
||||
_alert_infra_exhausted(work_item_id, max_retries)
|
||||
_advance(task_id, repo, work_item_id, branch)
|
||||
|
||||
|
||||
def _handle_permanent_env(
    task_id, repo: str, work_item_id: str, branch: str, result: proc_group.ProcResult
) -> None:
    """React to a permanent environment defect (D4) with an infra-HOLD.

    This handler only reports: it logs one structured error and sends an operator
    alert. It writes no staging log and triggers no stage advance, so the task stays
    where it is. ``repo`` and ``branch`` are accepted for signature parity with the
    other handlers and are not used here.

    Args:
        task_id: Task identifier, used in the log line.
        repo: Repository name (unused).
        work_item_id: Work item identifier, used in the log line and the alert.
        branch: Branch name (unused).
        result: Process result; the first 300 characters of its stripped ``stderr``
            become the defect detail.
    """
    raw_stderr = getattr(result, "stderr", "") or ""
    detail = raw_stderr.strip()[:300]
    shown = detail if detail else "<no stderr>"
    logger.error(
        "Task %s (%s): PERMANENT staging environment defect -> infra-HOLD on "
        "deploy-staging (NOT a code defect, no rollback, no developer-retry): %s",
        task_id,
        work_item_id,
        shown,
    )
    # The alert receives the raw (possibly empty) detail, not the placeholder.
    _alert_permanent_env(work_item_id, detail)
|
||||
|
||||
|
||||
def _alert_infra_exhausted(work_item_id: str, max_retries: int) -> None:
    """Send a best-effort Telegram alert that the retry budget is exhausted.

    The message states that the staging suite never executed (infra, NOT a code
    defect) after ``max_retries`` attempts and that the task is held on
    deploy-staging without a rollback to development. Any failure to import or send
    is logged as a warning and swallowed (never-raise).

    Args:
        work_item_id: Work item identifier, rendered through ``link_for``.
        max_retries: Number of attempts that were made, shown in the message.
    """
    try:
        from .notifications import send_telegram, link_for

        send_telegram(
            f"\U0001f6a8 {link_for(work_item_id)}: staging-сюита не запустилась "
            f"(инфра, НЕ дефект кода) после {max_retries} попыток — задача удержана на "
            f"deploy-staging (без отката на development). Нужно проверить staging-стенд."
        )
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("staging_runner: infra-exhausted alert failed for %s: %s", work_item_id, e)
|
||||
|
||||
|
||||
def _alert_permanent_env(work_item_id: str, detail: str) -> None:
    """Send a best-effort Telegram alert about a permanent environment defect.

    The message says the staging suite could not execute because of the environment
    (NOT a code defect) and that the task is held on deploy-staging. A non-empty
    ``detail`` is appended after a colon. Any failure to import or send is logged as
    a warning and swallowed (never-raise).

    Args:
        work_item_id: Work item identifier, rendered through ``link_for``.
        detail: Short defect description; may be empty.
    """
    try:
        from .notifications import send_telegram, link_for

        suffix = ""
        if detail:
            suffix = f": {detail}"
        message = (
            f"\U0001f6a8 {link_for(work_item_id)}: staging-сюита не смогла исполниться — "
            f"постоянный дефект окружения (инфра, НЕ дефект кода){suffix}. Задача удержана "
            f"на deploy-staging до починки окружения (без отката на development)."
        )
        send_telegram(message)
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("staging_runner: permanent-env alert failed for %s: %s", work_item_id, e)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point (D2) — owns the full deterministic flow, mirror run_deploy_finalizer
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -517,10 +671,12 @@ def run_staging_gate(job: dict) -> None:
|
||||
Flow (mirror of ``stage_engine.run_deploy_finalizer``):
|
||||
1. resolve ``work_item_id`` / ``branch`` by ``task_id``;
|
||||
2. execute the staging suite (D3) -> ProcResult;
|
||||
3. suite EXECUTED -> map exit-code -> ``staging_status:``, write
|
||||
``15-staging-log.md``, initiate the existing gate via ``advance_stage`` (D7);
|
||||
4. suite did NOT execute (tool-error) -> bounded DEFER / fail-closed (D5);
|
||||
5. observability counters + one structured verdict log (D10).
|
||||
3. classify the outcome three ways (D3): ``suite-ran`` -> map exit-code ->
|
||||
``staging_status:``, write ``15-staging-log.md``, initiate the existing gate
|
||||
via ``advance_stage`` (D7);
|
||||
4. ``permanent-env`` -> immediate infra-HOLD (no rollback, no developer-retry);
|
||||
``transient-infra`` -> bounded DEFER, then infra-HOLD on exhaustion (D4);
|
||||
5. observability counters + one structured verdict log (D8).
|
||||
Never raises into the caller (the launcher marks the job done/failed)."""
|
||||
started = time.time()
|
||||
_bump("runs")
|
||||
@@ -552,9 +708,10 @@ def run_staging_gate(job: dict) -> None:
|
||||
try:
|
||||
result = run_staging_suite()
|
||||
duration_s = round(time.time() - started, 1)
|
||||
suite_ran = (result.returncode is not None) and (not result.timed_out)
|
||||
# ORCH-123 (D3): three-way classification (env != transient infra != code-fail).
|
||||
outcome = classify_staging_outcome(result, _channel_ssh_configured())
|
||||
|
||||
if suite_ran:
|
||||
if outcome == "suite-ran":
|
||||
# 3. trust the exit-code (ORCH-061 already inside staging_check.py).
|
||||
status = map_exit_code_to_status(result.returncode)
|
||||
_bump("success" if status == "SUCCESS" else "failed")
|
||||
@@ -568,14 +725,27 @@ def run_staging_gate(job: dict) -> None:
|
||||
_advance(task_id, repo, work_item_id, branch)
|
||||
return
|
||||
|
||||
# 4. tool-error (suite did not execute) -> DEFER / fail-closed (D5).
|
||||
# The suite did NOT execute. Count it as a tool-error, then route by class
|
||||
# (D4): permanent-env -> immediate infra-HOLD; transient-infra -> bounded DEFER.
|
||||
# NEITHER rolls back to development or burns a developer-retry (BR-2 invariant).
|
||||
_bump("tool_error")
|
||||
if outcome == "permanent-env":
|
||||
_bump("permanent_env")
|
||||
logger.warning(
|
||||
"staging_runner verdict: work_item=%s repo=%s exit_code=%s "
|
||||
"duration_s=%s outcome=permanent-env (ssh_configured=%s)",
|
||||
work_item_id, repo, result.returncode, duration_s, _channel_ssh_configured(),
|
||||
)
|
||||
_handle_permanent_env(task_id, repo, work_item_id, branch, result)
|
||||
return
|
||||
|
||||
# transient-infra (timeout / ssh transport / unknown) -> bounded DEFER.
|
||||
logger.warning(
|
||||
"staging_runner verdict: work_item=%s repo=%s exit_code=%s status=%s "
|
||||
"duration_s=%s outcome=tool-error (timed_out=%s)",
|
||||
work_item_id, repo, result.returncode, "TOOL-ERROR", duration_s, result.timed_out,
|
||||
"staging_runner verdict: work_item=%s repo=%s exit_code=%s "
|
||||
"duration_s=%s outcome=transient-infra (timed_out=%s)",
|
||||
work_item_id, repo, result.returncode, duration_s, result.timed_out,
|
||||
)
|
||||
_handle_tool_error(task_id, repo, work_item_id, branch, result)
|
||||
_handle_transient_infra(task_id, repo, work_item_id, branch, result)
|
||||
except Exception as e: # noqa: BLE001 - never-raise into the worker (AC-7)
|
||||
logger.error(
|
||||
"staging_runner.run_staging_gate: unexpected error for task %s (%s): %s",
|
||||
@@ -584,24 +754,107 @@ def run_staging_gate(job: dict) -> None:
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Observability (D10 / FR-7 / AC-10)
|
||||
# Prod-like preflight of the host-side execution channel (D5 / FR-4 / AC-5)
|
||||
# ---------------------------------------------------------------------------
|
||||
def _alert_preflight(reason: str) -> None:
    """Send a best-effort Telegram alert that the startup preflight failed.

    The message carries ``reason`` and warns that deploy-staging tasks will be held
    because the host-side execution channel is unusable. Any failure to import or
    send is logged as a warning and swallowed (never-raise).

    Args:
        reason: Human-readable preflight failure reason.
    """
    try:
        from .notifications import send_telegram

        message = (
            f"⚠️ staging-runner preflight: {reason}. Хост-сторона исполнения "
            f"staging-сюиты неработоспособна — задачи на deploy-staging будут удержаны "
            f"(инфра, НЕ дефект кода). Проверьте ssh/docker/staging-стенд."
        )
        send_telegram(message)
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("staging_runner: preflight alert failed: %s", e)
|
||||
|
||||
|
||||
def preflight() -> tuple[bool, str]:
    """Probe the host-side staging execution channel (D5 / AC-5).

    Runs one bounded ssh command (``command -v docker`` followed by
    ``docker inspect -f '{{.State.Running}}' <staging-service>``), records the
    verdict in ``_PREFLIGHT_STATE`` and alerts on failure. It never raises.

    Returns:
        ``(ok, reason)``. ``ok`` is True when the runner does not apply to the
        self-hosting repo, when host-side execution is disabled, when the probe
        exits 0 and prints ``true``, or when the preflight itself errored. ``ok`` is
        False when no ssh target is configured, the probe timed out or could not be
        spawned, or the probe reported anything else.
    """

    def _remember(ok, reason):
        # Single place that writes the last verdict exposed by snapshot().
        _PREFLIGHT_STATE.update(ok=ok, reason=reason)

    def _unhealthy(reason):
        # Failure path: record, alert the operator, hand the verdict back.
        _remember(False, reason)
        _alert_preflight(reason)
        return False, reason

    try:
        from .qg.checks import SELF_HOSTING_REPO

        if not applies(SELF_HOSTING_REPO):
            _remember(None, "n/a (runner disabled / out of scope)")
            return True, "n/a"

        # In-container mode (host-side disabled): nothing to probe host-side.
        host_side = bool(getattr(settings, "staging_runner_exec_host_side", True))
        if not host_side:
            _remember(True, "in-container mode (host-side disabled)")
            return True, "in-container mode"

        destination = _ssh_target()
        if not destination:
            return _unhealthy(
                "no ssh target configured (deploy_ssh_host empty) — host-side staging unworkable"
            )

        probe = (
            "command -v docker >/dev/null 2>&1 && "
            "docker inspect -f '{{.State.Running}}' " + shlex.quote(STAGING_SERVICE)
        )
        argv = ["ssh", "-o", "StrictHostKeyChecking=no", destination, probe]
        try:
            completed = subprocess.run(
                argv, capture_output=True, text=True, timeout=_PREFLIGHT_TIMEOUT_S
            )
        except subprocess.TimeoutExpired:
            return _unhealthy("preflight ssh probe timed out")
        except (subprocess.SubprocessError, OSError) as exc:
            return _unhealthy(f"preflight ssh probe error: {exc}")

        running = (completed.stdout or "").strip().lower()
        if completed.returncode == 0 and running == "true":
            _remember(True, "host-side channel ok (docker present, staging running)")
            return True, "ok"
        return _unhealthy(
            f"host-side staging channel not ready (rc={completed.returncode}, running={running!r})"
        )
    except Exception as e:  # noqa: BLE001 - never-raise; preflight must never block start
        logger.warning("staging_runner.preflight error: %s", e)
        _PREFLIGHT_STATE.update(ok=None, reason=f"preflight error: {e}")
        return True, "preflight skipped (error)"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Observability (D8 / FR-7 / AC-10)
|
||||
# ---------------------------------------------------------------------------
|
||||
def snapshot() -> dict:
|
||||
"""Read-only staging-runner summary for ``GET /queue`` (FR-7 / AC-10).
|
||||
|
||||
Additive block; existing ``/queue`` keys are untouched. never-raise: any error ->
|
||||
a minimal dict with the kill-switch state."""
|
||||
Additive block; existing ``/queue`` keys are untouched. ORCH-123 adds
|
||||
``exec_host_side`` (the strategy flag), ``permanent_env`` (the infra-HOLD counter,
|
||||
distinct from ``failed``=code-fail and ``deferred``=transient-infra) and the last
|
||||
``preflight`` verdict. never-raise: any error -> a minimal dict with the
|
||||
kill-switch state."""
|
||||
try:
|
||||
return {
|
||||
"enabled": bool(settings.staging_runner_enabled),
|
||||
"repos": getattr(settings, "staging_runner_repos", "") or "",
|
||||
"timeout_s": getattr(settings, "staging_runner_timeout_s", _DEFAULT_TIMEOUT_S),
|
||||
"infra_max_retries": getattr(settings, "staging_runner_infra_max_retries", 2),
|
||||
"exec_host_side": bool(getattr(settings, "staging_runner_exec_host_side", True)),
|
||||
"runs": _STAGING_RUNNER_COUNTERS["runs"],
|
||||
"success": _STAGING_RUNNER_COUNTERS["success"],
|
||||
"failed": _STAGING_RUNNER_COUNTERS["failed"],
|
||||
"tool_error": _STAGING_RUNNER_COUNTERS["tool_error"],
|
||||
"deferred": _STAGING_RUNNER_COUNTERS["deferred"],
|
||||
"permanent_env": _STAGING_RUNNER_COUNTERS["permanent_env"],
|
||||
"preflight": dict(_PREFLIGHT_STATE),
|
||||
}
|
||||
except Exception as e: # noqa: BLE001 - never-raise -> minimal dict
|
||||
logger.warning("staging_runner.snapshot error: %s", e)
|
||||
|
||||
Reference in New Issue
Block a user