From e1872e3d94e5527ab3876c673dcbe9a2c8d1177a Mon Sep 17 00:00:00 2001 From: claude-bot Date: Tue, 16 Jun 2026 08:17:25 +0300 Subject: [PATCH] developer(ET): auto-commit from developer run_id=749 --- src/config.py | 17 +++++++++++++++++ src/staging_runner.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/config.py b/src/config.py index 3db293d..5c7d9f8 100644 --- a/src/config.py +++ b/src/config.py @@ -452,11 +452,28 @@ class Settings(BaseSettings): # until the budget is exhausted (D5, anti ORCH-110). # staging_runner_infra_retry_delay_s-> delay before the re-queued deployer job # (env ORCH_STAGING_RUNNER_INFRA_RETRY_DELAY_S). + # ORCH-123 (D6, adr-0049): the staging suite (`docker exec orchestrator-staging + # ... staging_check.py`) MUST run host-side over the existing trusted ssh channel + # (ORCH-036/058), because the prod container ships only `openssh-client git curl` + # — NOT a `docker` CLI (Dockerfile:11) — so `docker exec` spawned FROM INSIDE the + # prod container hit FileNotFoundError -> a permanent environment defect that + # ORCH-115 mis-routed as a code-fail rollback (incident ORCH-116). The execution + # CHANNEL changes (host-side ssh wrap), the suite COMMAND/contract does not. + # staging_runner_exec_host_side -> True (default = production mode) wraps the docker-exec + # in `ssh USER@HOST + # 'DOCKER-EXEC-COMMAND'` (mirror self_deploy / + # image_freshness host-side docker). False -> + # the prior in-container `docker exec` (valid + # ONLY where a docker CLI is baked into the + # image; the current prod image has none). env + # ORCH_STAGING_RUNNER_EXEC_HOST_SIDE. Rollback: + # set False -> the prior in-container call 1:1. 
staging_runner_enabled: bool = True staging_runner_repos: str = "" staging_runner_timeout_s: int = 600 staging_runner_infra_max_retries: int = 2 staging_runner_infra_retry_delay_s: int = 30 + staging_runner_exec_host_side: bool = True # ORCH-098 (FND/F2): machine lessons-journal — additive `lessons` table + leaf # src/lessons.py (never-raise observer, by образцу serial_gate/coverage_gate/ diff --git a/src/staging_runner.py b/src/staging_runner.py index 296bc75..2b9f011 100644 --- a/src/staging_runner.py +++ b/src/staging_runner.py @@ -41,6 +41,8 @@ Two-level outcome (D5 — the key safety decision, anti ORCH-110): """ import logging +import shlex +import subprocess import time from .config import settings @@ -53,6 +55,25 @@ logger = logging.getLogger("orchestrator.staging_runner") # NOT a host hardcode (test_no_host_hardcodes forbids host IP/home/hostname only). STAGING_SERVICE = "orchestrator-staging" +# ORCH-123 (D3): deterministic stderr markers that prove a PERMANENT environment +# defect (no docker / container down / daemon unreachable) — NOT a transient infra +# hiccup and NEVER a code-fail. Scanned lower-cased; mirror of merge_gate's +# scope-guard marker approach. Recognised suite exit-codes WITHOUT any of these +# markers are always trusted as `suite-ran` (anti-over-tolerance, BR-3). +_ENV_MARKERS = ( + "command not found", + "executable file not found", + "no such container", + "is not running", + "cannot connect to the docker daemon", + "is the docker daemon running", +) + +# ORCH-123 (D5): last prod-like preflight verdict of the host-side staging channel +# (observed at startup; surfaced in snapshot()). `ok=None` -> not probed / n/a. +_PREFLIGHT_STATE: dict = {"ok": None, "reason": "not-probed"} +_PREFLIGHT_TIMEOUT_S = 20 + # Default wall-clock budget for the docker-exec staging suite (D9). 
Kept <= the LLM # staging window it replaces so Σ(work on the deploy-staging edge) does not grow and # the cross-cutting reaper invariant (ORCH-065/109/110) holds WITHOUT touching @@ -67,12 +88,18 @@ _GIT_TIMEOUT = 60 _INFRA_RETRY_MARKER = "staging-runner infra-retry" # In-process observability counters (mirror merge_gate._MERGE_GATE_COUNTERS, ORCH-110). +# ORCH-123 (D8): `failed` now counts ONLY a real code-fail (suite ran, exit != 0); +# `permanent_env` counts infra-HOLD events (a permanent environment defect, OR a +# transient-infra DEFER budget exhausted) so /queue distinguishes the THREE +# non-success classes: code-fail (`failed`) vs transient-infra (`deferred`) vs +# permanent-env / infra-HOLD (`permanent_env`). _STAGING_RUNNER_COUNTERS: dict = { "runs": 0, # run_staging_gate entered "success": 0, # suite ran, exit 0 -> SUCCESS - "failed": 0, # suite ran non-zero, OR infra budget exhausted -> FAILED + "failed": 0, # suite ran non-zero -> FAILED (a real code-fail) "tool_error": 0, # suite did NOT execute (spawn-error / timeout / None) - "deferred": 0, # bounded infra DEFER (re-queued) + "deferred": 0, # bounded transient-infra DEFER (re-queued) + "permanent_env": 0, # infra-HOLD: permanent env defect OR transient budget exhausted }