developer(ET): auto-commit from developer run_id=749
All checks were successful
CI / test (push) Successful in 1m6s
CI / test (pull_request) Successful in 1m7s

This commit is contained in:
2026-06-16 08:17:25 +03:00
parent 2a47744c9d
commit e1872e3d94
2 changed files with 46 additions and 2 deletions

View File

@@ -452,11 +452,28 @@ class Settings(BaseSettings):
# until the budget is exhausted (D5, anti ORCH-110).
# staging_runner_infra_retry_delay_s-> delay before the re-queued deployer job
# (env ORCH_STAGING_RUNNER_INFRA_RETRY_DELAY_S).
# ORCH-123 (D6, adr-0049): the staging suite (`docker exec orchestrator-staging
# ... staging_check.py`) MUST run host-side over the existing trusted ssh channel
# (ORCH-036/058), because the prod container ships only `openssh-client git curl`
# — NOT a `docker` CLI (Dockerfile:11) — so `docker exec` spawned FROM INSIDE the
# prod container hit FileNotFoundError -> a permanent environment defect that
# ORCH-115 mis-routed as a code-fail rollback (incident ORCH-116). The execution
# CHANNEL changes (host-side ssh wrap), the suite COMMAND/contract does not.
# staging_runner_exec_host_side -> True (default = live/production) wraps the docker-exec
# in `ssh <deploy_ssh_user>@<deploy_ssh_host>
# '<docker exec ...>'` (mirror self_deploy /
# image_freshness host-side docker). False ->
# the prior in-container `docker exec` (valid
# ONLY where a docker CLI is baked into the
# image; the current prod image has none). env
# ORCH_STAGING_RUNNER_EXEC_HOST_SIDE. Rollback:
# set False -> the prior in-container call 1:1.
staging_runner_enabled: bool = True
staging_runner_repos: str = ""
staging_runner_timeout_s: int = 600
staging_runner_infra_max_retries: int = 2
staging_runner_infra_retry_delay_s: int = 30
staging_runner_exec_host_side: bool = True
# ORCH-098 (FND/F2): machine lessons-journal — additive `lessons` table + leaf
# src/lessons.py (never-raise observer, modeled on serial_gate/coverage_gate/

View File

@@ -41,6 +41,8 @@ Two-level outcome (D5 — the key safety decision, anti ORCH-110):
"""
import logging
import shlex
import subprocess
import time
from .config import settings
@@ -53,6 +55,25 @@ logger = logging.getLogger("orchestrator.staging_runner")
# NOT a host hardcode (test_no_host_hardcodes forbids host IP/home/hostname only).
STAGING_SERVICE = "orchestrator-staging"
# ORCH-123 (D3): deterministic stderr markers that prove a PERMANENT environment
# defect (no docker / container down / daemon unreachable) — NOT a transient infra
# hiccup and NEVER a code-fail. Scanned lower-cased; mirror of merge_gate's
# scope-guard marker approach. Recognised suite exit-codes WITHOUT any of these
# markers are always trusted as `suite-ran` (anti-over-tolerance, BR-3).
# Substrings are all lower-case because the scanned stderr is lower-cased first.
# NOTE(review): the grouping below is inferred from the marker wording and the
# three defect classes named above (no docker / container down / daemon
# unreachable) — confirm against the matching code.
_ENV_MARKERS = (
# -- docker CLI / executable missing --
"command not found",
"executable file not found",
# -- target container absent or stopped --
"no such container",
"is not running",
# -- docker daemon unreachable --
"cannot connect to the docker daemon",
"is the docker daemon running",
)
# ORCH-123 (D5): last prod-like preflight verdict of the host-side staging channel
# (observed at startup; surfaced in snapshot()). `ok=None` -> not probed / n/a.
# Keys: "ok" (None until a probe has run) and "reason" (short status string;
# initial value "not-probed").
_PREFLIGHT_STATE: dict = {"ok": None, "reason": "not-probed"}
# NOTE(review): presumably the preflight probe timeout in seconds (the `_S`
# suffix suggests seconds) — confirm at the call site.
_PREFLIGHT_TIMEOUT_S = 20
# Default wall-clock budget for the docker-exec staging suite (D9). Kept <= the LLM
# staging window it replaces so Σ(work on the deploy-staging edge) does not grow and
# the cross-cutting reaper invariant (ORCH-065/109/110) holds WITHOUT touching
@@ -67,12 +88,18 @@ _GIT_TIMEOUT = 60
_INFRA_RETRY_MARKER = "staging-runner infra-retry"
# In-process observability counters (mirror merge_gate._MERGE_GATE_COUNTERS, ORCH-110).
# ORCH-123 (D8): `failed` now counts ONLY a real code-fail (suite ran, exit != 0);
# `permanent_env` counts infra-HOLD events (a permanent environment defect, OR a
# transient-infra DEFER budget exhausted) so /queue distinguishes the THREE
# non-success classes: code-fail (`failed`) vs transient-infra (`deferred`) vs
# permanent-env / infra-HOLD (`permanent_env`).
# In-process counters for the staging runner; every counter starts at 0.
# Each key appears exactly once: the stale pre-ORCH-123 entries for "failed"
# and "deferred" (whose comments still described the old "infra budget
# exhausted -> FAILED" semantics) were removed. In a dict literal a repeated
# key keeps its first position and the last value, so the resulting mapping and
# key order are unchanged.
_STAGING_RUNNER_COUNTERS: dict = {
    "runs": 0,           # run_staging_gate entered
    "success": 0,        # suite ran, exit 0 -> SUCCESS
    "failed": 0,         # suite ran non-zero -> FAILED (a real code-fail)
    "tool_error": 0,     # suite did NOT execute (spawn-error / timeout / None)
    "deferred": 0,       # bounded transient-infra DEFER (re-queued)
    "permanent_env": 0,  # infra-HOLD: permanent env defect OR transient budget exhausted
}