feat(pipeline): add deploy-staging gate before prod deploy (ORCH-35)

2026-06-05 10:06:06 +03:00
parent e405a55f9d
commit e0b6e92b09
6 changed files with 413 additions and 4 deletions
--- a/src/qg/checks.py
+++ b/src/qg/checks.py
@@ -440,6 +440,100 @@ def check_deploy_status(repo: str, work_item_id: str, branch: str | None = None)
    return False, "Deploy log not found (14-deploy-log.md)"


+
+def _parse_staging_status(content: str) -> tuple[bool, str]:
+    """Map the `staging_status:` frontmatter field of a staging log to a gate verdict.
+
+    Only the machine-readable YAML frontmatter (the text between the leading
+    `---` and the next `---`) is consulted; the prose body is never inspected.
+
+    Args:
+        content: Full text of a 15-staging-log.md file.
+
+    Returns:
+        (True,  "Staging status: SUCCESS") when staging_status is SUCCESS
+        (False, "Staging status: FAILED")  when staging_status is FAILED
+        (False, <reason>) when the frontmatter is missing, is not valid YAML,
+        is not a YAML mapping, or carries no recognised staging_status value.
+    """
+    import yaml
+
+    status = None
+    if content.startswith("---"):
+        parts = content.split("---", 2)
+        if len(parts) >= 3:
+            try:
+                fm = yaml.safe_load(parts[1]) or {}
+            except yaml.YAMLError as e:
+                return False, f"Invalid YAML frontmatter in staging log: {e}"
+            # safe_load returns a list or scalar when the frontmatter is not a
+            # mapping; calling .get() on those would raise AttributeError and
+            # crash the gate instead of producing a verdict.
+            if not isinstance(fm, dict):
+                return False, (
+                    "Staging log frontmatter is not a YAML mapping "
+                    f"(got: {type(fm).__name__})"
+                )
+            # The comparison ignores case and surrounding whitespace.
+            status = str(fm.get("staging_status", "")).upper().strip()
+    if status == "SUCCESS":
+        return True, "Staging status: SUCCESS"
+    if status == "FAILED":
+        return False, "Staging status: FAILED"
+    # status is None when no frontmatter was found, '' when the field is absent.
+    return False, f"No machine-readable staging_status in frontmatter (got: {status!r})"
+
+
+def _staging_log_from_main(repo: str, work_item_id: str) -> str | None:
+    """Best-effort read of 15-staging-log.md from origin/main on the shared clone.
+
+    The deployer writes 15-staging-log.md and merges the staging artifacts into main
+    via a separate PR (mirroring the deploy-log pattern), so the file lands in
+    origin/main, NOT in the feature branch worktree the gate normally reads.
+    This recovers it from main.
+
+    Args:
+        repo: Directory name of the clone under settings.repos_dir.
+        work_item_id: Work item whose docs/work-items/<id>/ folder is read.
+
+    Returns:
+        The file's text as stored in origin/main, or None when there is no
+        clone, a git command cannot be run or times out, or the file is absent
+        in main. Those conditions never raise, so the caller can fall back to
+        the plain "not found" verdict.
+    """
+    repo_clone = os.path.join(settings.repos_dir, repo)
+    if not os.path.isdir(os.path.join(repo_clone, ".git")):
+        return None
+    rel = f"docs/work-items/{work_item_id}/15-staging-log.md"
+    try:
+        # Refresh origin/main so we see freshly-merged staging artifacts.
+        fetch = subprocess.run(
+            ["git", "-C", repo_clone, "fetch", "origin", "main"],
+            check=False, capture_output=True, timeout=30,
+        )
+        if fetch.returncode != 0:
+            # Still try `git show`: an earlier fetch may already hold the file.
+            logger.warning(
+                "staging-log fetch of origin/main failed for %s/%s (exit %s); "
+                "reading the possibly stale ref",
+                repo, work_item_id, fetch.returncode,
+            )
+        # Decode explicitly as UTF-8 with replacement. text=True alone decodes
+        # with the locale encoding in strict mode, so a stray byte in the log
+        # would raise UnicodeDecodeError, which the except clause below does
+        # not cover.
+        show = subprocess.run(
+            ["git", "-C", repo_clone, "show", f"origin/main:{rel}"],
+            check=False, capture_output=True, timeout=15,
+            encoding="utf-8", errors="replace",
+        )
+    except (subprocess.SubprocessError, OSError) as e:
+        logger.warning("staging-log origin/main lookup failed for %s/%s: %s", repo, work_item_id, e)
+        return None
+    if show.returncode != 0:
+        # Non-zero exit from `git show`: treated as "file not available in main".
+        return None
+    return show.stdout
+
+
+def check_staging_status(repo: str, work_item_id: str, branch: str | None = None) -> tuple[bool, str]:
+    """
+    Gate the deploy-staging -> deploy transition on the deployer's machine-readable
+    verdict in 15-staging-log.md frontmatter (staging_status: SUCCESS|FAILED).
+
+    Mirrors check_deploy_status (BUG 8 / "БАГ 8"): reads ONLY the machine-readable
+    YAML field, never the body prose. The deployer runs the staging test suite
+    against localhost:8501 and writes the verdict into 15-staging-log.md.
+
+    Lookup order: worktree -> origin/main -> not found. A log that exists in the
+    worktree is authoritative: origin/main is consulted only when the worktree
+    has no such file.
+
+    Args:
+        repo: Repository name, resolved to a path via _repo_path.
+        work_item_id: Work item whose docs/work-items/<id>/ folder holds the log.
+        branch: Optional branch passed through to _repo_path.
+
+    Returns:
+      (True, ...)  -> staging_status: SUCCESS
+      (False, ...) -> staging_status: FAILED, missing field, no frontmatter,
+                      unreadable log, or log not found
+    """
+    repo_path = _repo_path(repo, branch)
+    log_path = os.path.join(repo_path, f"docs/work-items/{work_item_id}/15-staging-log.md")
+
+    if os.path.isfile(log_path):
+        try:
+            # Explicit UTF-8 so the result does not depend on the host locale.
+            with open(log_path, "r", encoding="utf-8") as f:
+                content = f.read()
+        except (OSError, UnicodeDecodeError) as e:
+            # UnicodeDecodeError is a ValueError, not an OSError; without it a
+            # badly encoded log would crash the gate instead of failing it.
+            return False, f"Error reading staging log: {e}"
+        return _parse_staging_status(content)
+
+    # Not in the feature worktree — the deployer may have merged it into main.
+    main_content = _staging_log_from_main(repo, work_item_id)
+    if main_content is not None:
+        return _parse_staging_status(main_content)
+
+    return False, "Staging log not found (15-staging-log.md)"
+
+
 # Registry for dynamic lookup by name
 QG_CHECKS = {
    "check_analysis_approved": check_analysis_approved,
@@ -451,4 +545,5 @@ QG_CHECKS = {
    "check_reviewer_verdict": check_reviewer_verdict,
    "check_tests_local": check_tests_local,
    "check_deploy_status": check_deploy_status,
+    "check_staging_status": check_staging_status,
 }
--- a/src/stage_engine.py
+++ b/src/stage_engine.py
@@ -517,6 +517,32 @@ def _handle_qg_failure_rollbacks(
                f"(job_id={new_job})"
            )

+    # ORCH-35: deployer staging verdict FAILED -> roll deploy-staging back to development.
+    # A staging failure means the code is bad; roll back to development, following
+    # the BUG 8 ("БАГ 8") pattern (deploy -> development).
+    # Does NOT touch the check_deploy_status branch below.
+    if agent == "deployer" and qg_name == "check_staging_status":
+        update_task_stage(task_id, "development")
+        notify_stage_change(task_id, current_stage, "development")
+        plane_notify_stage(work_item_id, current_stage, "development")
+        result.rolled_back_to = "development"
+        set_issue_blocked(work_item_id)
+        notify_qg_failure(task_id, "deploy-staging", "check_staging_status", reason)
+        plane_add_comment(
+            work_item_id,
+            f"\u274c Staging gate FAILED ({reason}). Rolled back to development. "
+            f"Developer \u043d\u0443\u0436\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430.",
+            author="deployer",
+        )
+        send_telegram(
+            f"\U0001f6a8 {work_item_id}: Staging FAILED ({reason}). "
+            f"Rolled back to development. Needs fix."
+        )
+        result.alerted = True
+        logger.error(
+            f"Task {task_id}: deployer staging verdict FAILED, rolled back deploy-staging -> "
+            f"development ({reason})"
+        )
+
    # БАГ 8: deployer verdict FAILED -> roll deploy back to development.
    # The launcher's exit_code-based guard (launcher.py:475) never fires because
    # the LLM process exit code is always 0; this gate fires on the machine-readable
--- a/src/stages.py
+++ b/src/stages.py
@@ -1,7 +1,7 @@
 """Stage machine for orchestrator pipeline.

 Stages:
-  created → analysis → architecture → development → review → testing → deploy → done
+  created → analysis → architecture → development → review → testing → deploy-staging → deploy → done

 Each stage defines:
  - next: the stage to advance to
@@ -15,8 +15,9 @@ STAGE_TRANSITIONS = {
    "architecture": {"next": "development", "agent": "developer", "qg": "check_architecture_done"},
    "development": {"next": "review", "agent": "reviewer", "qg": "check_ci_green"},
    "review": {"next": "testing", "agent": "tester", "qg": "check_reviewer_verdict"},
-    "testing": {"next": "deploy", "agent": "deployer", "qg": "check_tests_passed"},
-    "deploy": {"next": "done", "agent": None, "qg": "check_deploy_status"},
+    "testing":        {"next": "deploy-staging", "agent": "deployer",  "qg": "check_tests_passed"},
+    "deploy-staging": {"next": "deploy",         "agent": "deployer",  "qg": "check_staging_status"},
+    "deploy":         {"next": "done",            "agent": None,        "qg": "check_deploy_status"},
    "done": {"next": None, "agent": None, "qg": None},
 }