developer(ET): auto-commit from developer run_id=749
This commit is contained in:
@@ -452,11 +452,28 @@ class Settings(BaseSettings):
|
||||
# until the budget is exhausted (D5, anti ORCH-110).
|
||||
# staging_runner_infra_retry_delay_s -> delay before the re-queued deployer job
|
||||
# (env ORCH_STAGING_RUNNER_INFRA_RETRY_DELAY_S).
|
||||
# ORCH-123 (D6, adr-0049): the staging suite (`docker exec orchestrator-staging
|
||||
# ... staging_check.py`) MUST run host-side over the existing trusted ssh channel
|
||||
# (ORCH-036/058), because the prod container ships only `openssh-client git curl`
|
||||
# — NOT a `docker` CLI (Dockerfile:11) — so `docker exec` spawned FROM INSIDE the
|
||||
# prod container hit FileNotFoundError -> a permanent environment defect that
|
||||
# ORCH-115 mis-routed as a code-fail rollback (incident ORCH-116). The execution
|
||||
# CHANNEL changes (host-side ssh wrap), the suite COMMAND/contract does not.
|
||||
# staging_runner_exec_host_side -> True (default = production) wraps the docker-exec
|
||||
# in `ssh <deploy_ssh_user>@<deploy_ssh_host>
|
||||
# '<docker exec ...>'` (mirror self_deploy /
|
||||
# image_freshness host-side docker). False ->
|
||||
# the prior in-container `docker exec` (valid
|
||||
# ONLY where a docker CLI is baked into the
|
||||
# image; the current prod image has none). env
|
||||
# ORCH_STAGING_RUNNER_EXEC_HOST_SIDE. Rollback:
|
||||
# set False -> the prior in-container call 1:1.
|
||||
staging_runner_enabled: bool = True
|
||||
staging_runner_repos: str = ""
|
||||
staging_runner_timeout_s: int = 600
|
||||
staging_runner_infra_max_retries: int = 2
|
||||
staging_runner_infra_retry_delay_s: int = 30
|
||||
staging_runner_exec_host_side: bool = True
|
||||
|
||||
# ORCH-098 (FND/F2): machine lessons-journal — additive `lessons` table + leaf
|
||||
# src/lessons.py (never-raise observer, following the pattern of serial_gate/coverage_gate/
|
||||
|
||||
@@ -41,6 +41,8 @@ Two-level outcome (D5 — the key safety decision, anti ORCH-110):
|
||||
"""
|
||||
|
||||
import logging
|
||||
import shlex
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from .config import settings
|
||||
@@ -53,6 +55,25 @@ logger = logging.getLogger("orchestrator.staging_runner")
|
||||
# NOT a host hardcode (test_no_host_hardcodes forbids host IP/home/hostname only).
|
||||
STAGING_SERVICE = "orchestrator-staging"
|
||||
|
||||
# ORCH-123 (D3): deterministic stderr markers that prove a PERMANENT environment
|
||||
# defect (no docker / container down / daemon unreachable) — NOT a transient infra
|
||||
# hiccup and NEVER a code-fail. Scanned lower-cased; mirror of merge_gate's
|
||||
# scope-guard marker approach. Recognised suite exit-codes WITHOUT any of these
|
||||
# markers are always trusted as `suite-ran` (anti-over-tolerance, BR-3).
|
||||
# Substrings whose presence in the captured stderr classifies a run as a permanent
# environment defect. Keep every entry lower-case: per the comment above, the
# text is scanned lower-cased, so a mixed-case entry could never match.
_ENV_MARKERS = (
|
||||
"command not found",
|
||||
"executable file not found",
|
||||
"no such container",
|
||||
"is not running",
|
||||
"cannot connect to the docker daemon",
|
||||
"is the docker daemon running",
|
||||
)
|
||||
|
||||
# ORCH-123 (D5): last prod-like preflight verdict of the host-side staging channel
|
||||
# (observed at startup; surfaced in snapshot()). `ok=None` -> not probed / n/a.
|
||||
_PREFLIGHT_STATE: dict = {"ok": None, "reason": "not-probed"}
|
||||
_PREFLIGHT_TIMEOUT_S = 20
|
||||
|
||||
# Default wall-clock budget for the docker-exec staging suite (D9). Kept <= the LLM
|
||||
# staging window it replaces so Σ(work on the deploy-staging edge) does not grow and
|
||||
# the cross-cutting reaper invariant (ORCH-065/109/110) holds WITHOUT touching
|
||||
@@ -67,12 +88,18 @@ _GIT_TIMEOUT = 60
|
||||
_INFRA_RETRY_MARKER = "staging-runner infra-retry"
|
||||
|
||||
# In-process observability counters (mirror merge_gate._MERGE_GATE_COUNTERS, ORCH-110).
|
||||
# ORCH-123 (D8): `failed` now counts ONLY a real code-fail (suite ran, exit != 0);
|
||||
# `permanent_env` counts infra-HOLD events (a permanent environment defect, OR a
|
||||
# transient-infra DEFER budget exhausted) so /queue distinguishes the THREE
|
||||
# non-success classes: code-fail (`failed`) vs transient-infra (`deferred`) vs
|
||||
# permanent-env / infra-HOLD (`permanent_env`).
|
||||
_STAGING_RUNNER_COUNTERS: dict = {
|
||||
"runs": 0, # run_staging_gate entered
|
||||
"success": 0, # suite ran, exit 0 -> SUCCESS
|
||||
"failed": 0, # suite ran non-zero, OR infra budget exhausted -> FAILED
|
||||
"failed": 0, # suite ran non-zero -> FAILED (a real code-fail)
|
||||
"tool_error": 0, # suite did NOT execute (spawn-error / timeout / None)
|
||||
"deferred": 0, # bounded infra DEFER (re-queued)
|
||||
"deferred": 0, # bounded transient-infra DEFER (re-queued)
|
||||
"permanent_env": 0, # infra-HOLD: permanent env defect OR transient budget exhausted
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user