feat(qg): ORCH-045 — poll check_ci_green with retry to fix CI race (pending->success)
All checks were successful
CI / test (push) Successful in 12s
CI / test (pull_request) Successful in 12s

This commit is contained in:
stream
2026-06-05 19:59:06 +00:00
parent b9c61fc1f1
commit 0eff781d13
6 changed files with 169 additions and 26 deletions

View File

@@ -121,6 +121,15 @@ class Settings(BaseSettings):
log_keep_max: int = 500
# ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea
# combined commit status up to ci_poll_max_attempts times, sleeping
# ci_poll_interval_s between attempts, to ride out a transient pending
# state right after the developer push (race fix, see ORCH-017).
# ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS)
# ci_poll_interval_s -> seconds between polls (env ORCH_CI_POLL_INTERVAL_S)
ci_poll_max_attempts: int = 12
ci_poll_interval_s: int = 10
# Telegram notifications
telegram_bot_token: str = ""
telegram_chat_id: str = ""

View File

@@ -1,6 +1,7 @@
"""Quality Gate checks — real implementations using Gitea/Plane API and filesystem."""
import os
import time
import logging
import subprocess
import httpx
@@ -82,23 +83,65 @@ def check_ci_green(repo: str, branch: str) -> tuple[bool, str]:
"""
Check if CI status is green for branch via Gitea API.
GET /repos/{owner}/{repo}/commits/{branch}/status
ORCH-045: polling with retry to fix a race condition. The gate used to do a
single status read right after the developer push; if CI was still ``pending``
for the first 1-3s (real case ORCH-017: polled 17:58:54 -> pending, CI went
green 17:58:55) the gate returned False once and the task stalled silently.
Behaviour now:
* ``success`` -> (True, "CI green") immediately.
* ``failure`` / ``error`` -> (False, "CI state: <state>") immediately
(CI is red, retrying is pointless).
* ``pending`` / unknown -> sleep ``ci_poll_interval_s`` and poll again,
up to ``ci_poll_max_attempts`` times.
* still pending after all attempts -> (False, "CI still pending after <T>s").
* 404 -> (False, "Branch not found or no status").
* transient httpx errors -> logged and retried within the attempt budget;
if every attempt errors -> (False, "API error: <e>").
"""
owner = settings.gitea_owner
url = f"{GITEA_BASE}/repos/{owner}/{repo}/commits/{branch}/status"
try:
resp = httpx.get(url, headers=GITEA_HEADERS, timeout=10)
if resp.status_code == 404:
return False, f"Branch '{branch}' not found or no status"
resp.raise_for_status()
data = resp.json()
state = data.get("state", "unknown")
if state == "success":
return True, "CI green"
return False, f"CI state: {state}"
except httpx.HTTPError as e:
logger.error(f"Gitea API error checking CI: {e}")
return False, f"API error: {e}"
attempts = settings.ci_poll_max_attempts
interval = settings.ci_poll_interval_s
last_state = "unknown"
last_error: Exception | None = None
for i in range(1, attempts + 1):
try:
resp = httpx.get(url, headers=GITEA_HEADERS, timeout=10)
if resp.status_code == 404:
return False, f"Branch '{branch}' not found or no status"
resp.raise_for_status()
data = resp.json()
last_state = data.get("state", "unknown")
last_error = None
if last_state == "success":
return True, "CI green"
if last_state in ("failure", "error"):
return False, f"CI state: {last_state}"
# non-terminal (pending / unknown / other) -> retry below
except httpx.HTTPError as e:
last_error = e
logger.error(f"check_ci_green: attempt {i}/{attempts} API error: {e}")
if i < attempts:
if last_error is not None:
logger.info(
f"check_ci_green: attempt {i}/{attempts}, error, retrying in {interval}s"
)
else:
logger.info(
f"check_ci_green: attempt {i}/{attempts}, state={last_state}, "
f"retrying in {interval}s"
)
time.sleep(interval)
if last_error is not None:
return False, f"API error: {last_error}"
return False, f"CI still pending after {attempts * interval}s"
def check_review_approved(repo: str, pr_number: int) -> tuple[bool, str]: