integ: merge ORCH-068 reconciler livelock fix

# Conflicts:
#	docs/architecture/README.md
#	src/reconciler.py
This commit is contained in:
stream
2026-06-08 06:36:29 +00:00
20 changed files with 1379 additions and 17 deletions

View File

@@ -0,0 +1,180 @@
"""ORCH-068 (TR-4): tests for the Plane states cache TTL self-heal.
The per-project ``_STATES_CACHE`` used to live for the whole process lifetime,
so a status added to Plane after start was never seen without a restart
("stale set -> no pipeline action"). ORCH-068 adds a TTL: an entry is
re-fetched once it is older than ``plane_states_ttl_s`` (default 300s); ``0``
disables the TTL (strictly the previous lifetime cache).
All tests are offline: the Plane API (httpx) and the monotonic clock are mocked.
"""
import os
import tempfile
from unittest.mock import MagicMock, patch
import pytest
# Environment defaults are put in place BEFORE ``src.plane_sync`` is imported
# (hence the E402 suppression on the import below).
# NOTE(review): presumably the project settings read these at import time — confirm.
os.environ.setdefault("ORCH_PLANE_API_URL", "http://plane.local")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_WORKSPACE_SLUG", "test-ws")
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
# The DB path is assigned unconditionally (not ``setdefault``), so it overrides
# any ORCH_DB_PATH already present in the environment.
_test_db = os.path.join(tempfile.gettempdir(), "test_plane_states_cache.db")
os.environ["ORCH_DB_PATH"] = _test_db
import src.plane_sync as ps # noqa: E402
# Project id used by most tests in this module.
_PROJECT = "proj-ttl"
# Project id used by the TC-12 "enduro" compatibility test.
_ET_PROJECT = "7a79f0a9-5278-49cd-9007-9a338f238f9c"
def _resp(data: dict, status: int = 200):
m = MagicMock()
m.status_code = status
m.json.return_value = data
if status >= 400:
from httpx import HTTPStatusError
m.raise_for_status.side_effect = HTTPStatusError(
"error", request=MagicMock(), response=MagicMock()
)
else:
m.raise_for_status.return_value = None
return m
def _states_response(in_progress_uuid: str) -> dict:
"""A minimal /states/ payload; In Progress carries the given UUID."""
return {
"results": [
{"id": in_progress_uuid, "name": "In Progress", "group": "started"},
{"id": "uuid-done", "name": "Done", "group": "completed"},
]
}
@pytest.fixture(autouse=True)
def reset_cache():
    """Call ``ps.reload_project_states()`` before and after every test."""
    # Setup: runs before the test body.
    ps.reload_project_states()

    yield

    # Teardown: runs after the test body.
    ps.reload_project_states()
# ---------------------------------------------------------------------------
# TC-11 (AC-12): a stale cache entry self-heals after the TTL — no restart.
# ---------------------------------------------------------------------------
def test_tc11_stale_cache_refreshes_after_ttl(monkeypatch):
    """With a 300s TTL, the states are re-fetched once the mocked clock passes it."""
    now = [1000.0]  # mutable cell read by the patched monotonic clock
    old_set = _resp(_states_response("uuid-A"))  # first fetch: old set
    new_set = _resp(_states_response("uuid-B"))  # second fetch: new status appeared
    # An iterable side_effect hands out one response per call, in order.
    mock_get = MagicMock(side_effect=[old_set, new_set])

    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300)
    monkeypatch.setattr(ps.time, "monotonic", lambda: now[0])
    monkeypatch.setattr(ps.httpx, "get", mock_get)

    # (clock value, expected In Progress UUID, expected cumulative fetches)
    timeline = (
        (1000.0, "uuid-A", 1),  # first call -> fetch set A
        (1100.0, "uuid-A", 1),  # within TTL -> served from cache, no new fetch
        (1400.0, "uuid-B", 2),  # TTL elapsed -> re-fetch -> fresh set B
    )
    for tick, expected_uuid, expected_fetches in timeline:
        now[0] = tick
        states = ps.get_project_states(_PROJECT)
        assert states["in_progress"] == expected_uuid
        assert mock_get.call_count == expected_fetches
def test_tc11_ttl_zero_keeps_lifetime_cache(monkeypatch):
    """With ``plane_states_ttl_s=0`` the first fetched set is served forever."""
    now = [1000.0]  # mutable cell read by the patched monotonic clock
    mock_get = MagicMock(
        side_effect=[
            _resp(_states_response("uuid-A")),
            _resp(_states_response("uuid-B")),
        ]
    )

    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 0)
    monkeypatch.setattr(ps.time, "monotonic", lambda: now[0])
    monkeypatch.setattr(ps.httpx, "get", mock_get)

    first = ps.get_project_states(_PROJECT)
    assert first["in_progress"] == "uuid-A"

    now[0] = 1_000_000.0  # far in the future

    # TTL disabled -> still the cached A, never re-fetched.
    later = ps.get_project_states(_PROJECT)
    assert later["in_progress"] == "uuid-A"
    assert mock_get.call_count == 1
def test_tc11_groups_exposed_via_accessor(monkeypatch):
    """After a fetch, ``get_project_state_groups`` maps each state UUID to its group."""

    def fake_get(*args, **kwargs):
        return _resp(_states_response("uuid-A"))

    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300)
    monkeypatch.setattr(ps.httpx, "get", fake_get)

    ps.get_project_states(_PROJECT)  # fetch first so a record exists
    groups = ps.get_project_state_groups(_PROJECT)

    expected = {"uuid-A": "started", "uuid-done": "completed"}
    for state_id, group in expected.items():
        assert groups[state_id] == group
def test_tc11_groups_empty_when_uncached(monkeypatch):
    """A project that was never fetched yields an empty group mapping."""
    groups = ps.get_project_state_groups("never-fetched")
    assert groups == {}
# ---------------------------------------------------------------------------
# TC-12 (AC-13): default-config compatibility — enduro UUIDs + API-error fallback.
# ---------------------------------------------------------------------------
def test_tc12_enduro_uuids_unchanged(monkeypatch):
    """The enduro project resolves the UUID from the API; absent keys use defaults."""
    in_progress_id = "b873d9eb-993c-48cd-97ac-99a9b1623967"
    payload = {
        "results": [
            {"id": in_progress_id, "name": "In Progress", "group": "started"},
        ]
    }

    def fake_get(*args, **kwargs):
        return _resp(payload)

    monkeypatch.setattr(ps.httpx, "get", fake_get)

    resolved = ps.get_project_states(_ET_PROJECT)
    assert resolved["in_progress"] == in_progress_id
    # "done" is not in the payload, so it must come from _DEFAULT_STATES.
    assert resolved["done"] == ps._DEFAULT_STATES["done"]
def test_tc12_api_error_falls_back_to_defaults(monkeypatch):
    """When the HTTP call raises and nothing is cached, ``_DEFAULT_STATES`` is returned."""
    failing_get = MagicMock(side_effect=Exception("network error"))
    monkeypatch.setattr(ps.httpx, "get", failing_get)

    # Identity, not equality: the very same defaults object is expected.
    assert ps.get_project_states(_PROJECT) is ps._DEFAULT_STATES
def test_tc12_stale_served_when_refresh_fails(monkeypatch):
    """If the post-TTL refresh raises, the previously fetched set is still served."""
    now = [1000.0]  # mutable cell read by the patched monotonic clock
    attempts = []

    def flaky_get(*args, **kwargs):
        # First call succeeds; every later call fails.
        attempts.append(1)
        if len(attempts) > 1:
            raise Exception("transient outage")
        return _resp(_states_response("uuid-A"))

    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300)
    monkeypatch.setattr(ps.time, "monotonic", lambda: now[0])
    monkeypatch.setattr(ps.httpx, "get", flaky_get)

    warm = ps.get_project_states(_PROJECT)
    assert warm["in_progress"] == "uuid-A"

    now[0] = 2000.0  # past TTL -> refresh attempt fails
    after_outage = ps.get_project_states(_PROJECT)
    assert after_outage["in_progress"] == "uuid-A"  # stale-but-correct, not defaults
    assert after_outage is not ps._DEFAULT_STATES

View File

@@ -341,3 +341,342 @@ def test_tc17_polls_all_projects_resolves_states_per_project(monkeypatch):
# state uuids are resolved per-project (not hardcoded): each call carries them.
for _pid, states in issues_calls:
assert set(states) == {_IN_PROGRESS, _APPROVED, _REJECTED}
# ===========================================================================
# ORCH-068: livelock-fix — terminal exclusion (D1) + confirmed-change unblock
# (D2) + dedup (TR-3). The old code spammed `_note_unblock` every ~120s for a
# fully synchronized Done task (incident: ET-002, 191+ Telegram messages/night).
# ===========================================================================
# State UUIDs for the two terminal states used by the ORCH-068 tests:
# ``_DONE`` is mapped to the "completed" group, ``_CANCELLED`` to "cancelled".
_DONE = "uuid-done"
_CANCELLED = "uuid-cancelled"
def _patch_states_with_terminals(monkeypatch, *, alias_done_to_approved=False):
    """Patch the reconciler's state and group lookups to include terminal states.

    Args:
        monkeypatch: pytest's monkeypatch fixture.
        alias_done_to_approved: When true, the "done" key reuses the approved
            UUID, modelling the regression trigger (ORCH-066) in which a
            project collapses Done onto Approved. The UUID is then mapped to
            the ``completed`` group, so only the group tells the two apart.

    Returns:
        The ``(states, groups)`` dictionaries the patched lookups return.
    """
    if alias_done_to_approved:
        done_uuid = _APPROVED
    else:
        done_uuid = _DONE

    states = dict(
        in_progress=_IN_PROGRESS,
        approved=_APPROVED,
        rejected=_REJECTED,
        done=done_uuid,
        cancelled=_CANCELLED,
    )

    # Non-terminal states first; the terminal entries are assigned afterwards
    # so that an aliased Done UUID overrides the "started" group with "completed".
    groups = dict.fromkeys((_IN_PROGRESS, _APPROVED, _REJECTED), "started")
    groups[done_uuid] = "completed"
    groups[_CANCELLED] = "cancelled"

    def fake_states(pid):
        return states

    def fake_groups(pid):
        return groups

    monkeypatch.setattr(reconciler_mod, "get_project_states", fake_states)
    monkeypatch.setattr(reconciler_mod, "get_project_state_groups", fake_groups)
    return states, groups
def _spy_telegram(monkeypatch):
    """Replace ``reconciler_mod.send_telegram`` with a recorder; return its list."""
    captured = []

    def record(msg):
        captured.append(msg)

    monkeypatch.setattr(reconciler_mod, "send_telegram", record)
    return captured
def _job_count():
    """Return the number of rows in the ``jobs`` table.

    The connection is closed in ``finally`` so that a failing query does not
    leave a database handle open for the rest of the test run.
    """
    conn = get_db()
    try:
        return conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
    finally:
        conn.close()
# ---------------------------------------------------------------------------
# TC-01 (AC-1, AC-7): synchronized Done task -> total silence, 0 jobs.
# ---------------------------------------------------------------------------
def test_tc01_synced_done_is_silent(monkeypatch, single_project):
    """A task at stage "done" whose issue is in Done: no handler, message, or job."""
    start_handler, verdict_handler = _patch_handlers(monkeypatch)
    _patch_states_with_terminals(monkeypatch)
    telegram_log = _spy_telegram(monkeypatch)
    _make_task("iss-done", stage="done", wi="ET-002")
    done_issue = {"id": "iss-done", "state": {"id": _DONE}, "updated_at": _OLD_TS}
    _patch_issues(monkeypatch, [done_issue])

    reconciler = Reconciler()
    reconciler.reconcile_plane_once()

    # No handler was replayed and nothing was announced.
    start_handler.assert_not_called()
    verdict_handler.assert_not_called()
    assert telegram_log == []
    # Counters: skipped as terminal, never counted as an unblock.
    assert reconciler.unblocked_total == 0
    assert reconciler.skipped_terminal_total == 1
    assert _job_count() == 0
# ---------------------------------------------------------------------------
# TC-02 (AC-2): Done UUID aliased onto approved -> still excluded by GROUP.
# ---------------------------------------------------------------------------
def test_tc02_terminal_aliased_to_approved_excluded(monkeypatch, single_project):
    """A Done issue whose state UUID equals the approved UUID is still skipped."""
    start_handler, verdict_handler = _patch_handlers(monkeypatch)
    _patch_states_with_terminals(monkeypatch, alias_done_to_approved=True)
    telegram_log = _spy_telegram(monkeypatch)

    # Task is Done; its Plane state UUID equals the approved UUID (aliasing).
    _make_task("iss-alias", stage="done", wi="ET-002")
    aliased_issue = {
        "id": "iss-alias",
        "state": {"id": _APPROVED},
        "updated_at": _OLD_TS,
    }
    _patch_issues(monkeypatch, [aliased_issue])

    reconciler = Reconciler()
    reconciler.reconcile_plane_once()

    # Without the group check this would enter the approved branch and notify.
    start_handler.assert_not_called()
    verdict_handler.assert_not_called()
    assert telegram_log == []
    assert reconciler.unblocked_total == 0
    assert reconciler.skipped_terminal_total == 1
# ---------------------------------------------------------------------------
# TC-03 (AC-2): Cancelled terminal is also excluded.
# ---------------------------------------------------------------------------
def test_tc03_cancelled_excluded(monkeypatch, single_project):
    """An issue in the Cancelled state is skipped as terminal, with no side effects."""
    start_handler, verdict_handler = _patch_handlers(monkeypatch)
    _patch_states_with_terminals(monkeypatch)
    telegram_log = _spy_telegram(monkeypatch)
    _make_task("iss-cancel", stage="done", wi="ET-003")
    cancelled_issue = {
        "id": "iss-cancel",
        "state": {"id": _CANCELLED},
        "updated_at": _OLD_TS,
    }
    _patch_issues(monkeypatch, [cancelled_issue])

    reconciler = Reconciler()
    reconciler.reconcile_plane_once()

    start_handler.assert_not_called()
    verdict_handler.assert_not_called()
    assert telegram_log == []
    assert reconciler.unblocked_total == 0
    assert reconciler.skipped_terminal_total == 1
# ---------------------------------------------------------------------------
# TC-04 (AC-3): no-op dispatch (stage unchanged) -> no notification.
# ---------------------------------------------------------------------------
def test_tc04_noop_dispatch_no_unblock(monkeypatch, single_project):
    """A replayed verdict that leaves the stage unchanged produces no notification."""
    # handle_verdict is a no-op AsyncMock -> the task stage never moves.
    _start_handler, verdict_handler = _patch_handlers(monkeypatch)
    telegram_log = _spy_telegram(monkeypatch)
    _make_task("iss-noop", stage="review")
    approved_issue = {
        "id": "iss-noop",
        "state": {"id": _APPROVED},
        "updated_at": _OLD_TS,
    }
    _patch_issues(monkeypatch, [approved_issue])

    reconciler = Reconciler()
    reconciler.reconcile_plane_once()

    # The handler was replayed (idempotent), but nothing changed -> silence.
    assert verdict_handler.call_count == 1
    assert telegram_log == []
    assert reconciler.unblocked_total == 0
# ---------------------------------------------------------------------------
# TC-05 (AC-4): two consecutive ticks on a synced task -> 0 repeat unblocks;
# plus a direct check of the in-memory dedup guard.
# ---------------------------------------------------------------------------
def test_tc05_dedup_no_repeat_notification(monkeypatch, single_project):
    """Two ticks on a synced Done task stay silent; ``_note_unblock`` dedups repeats."""
    _patch_handlers(monkeypatch)
    _patch_states_with_terminals(monkeypatch)
    telegram_log = _spy_telegram(monkeypatch)
    _make_task("iss-dedup", stage="done", wi="ET-004")
    done_issue = {"id": "iss-dedup", "state": {"id": _DONE}, "updated_at": _OLD_TS}
    _patch_issues(monkeypatch, [done_issue])

    reconciler = Reconciler()
    for _tick in range(2):
        reconciler.reconcile_plane_once()

    assert telegram_log == []
    assert reconciler.unblocked_total == 0

    # Direct dedup-guard exercise: the same issue+state notifies at most once.
    for _attempt in range(2):
        reconciler._note_unblock("ET-004", "review", "state-x")
    assert reconciler.unblocked_total == 1
    assert reconciler.deduped_total == 1
# ---------------------------------------------------------------------------
# TC-06 (AC-5): legit lost Approved webhook -> replayed, advanced, ONE unblock.
# ---------------------------------------------------------------------------
def test_tc06_legit_approved_unblock_once(monkeypatch, single_project):
    """A replayed Approved verdict that advances the stage notifies exactly once.

    The fake verdict handler moves the task from ``review`` to ``testing``, so
    the reconciler sees a real stage change and must emit one unblock message
    that mentions the work item.
    """
    _patch_states_with_terminals(monkeypatch)  # non-terminal approved -> actionable
    sent = _spy_telegram(monkeypatch)
    _make_task("iss-appr", stage="review", wi="ET-005")

    async def fake_verdict(issue_data, project_id, approved=True):
        # Simulate the real handler advancing the stage (review -> testing).
        conn = get_db()
        try:
            conn.execute(
                "UPDATE tasks SET stage='testing' WHERE plane_id=?",
                (issue_data["id"],),
            )
            conn.commit()
        finally:
            # Close even if the UPDATE/commit raises, so a failure here does
            # not leak a connection into later tests.
            conn.close()

    monkeypatch.setattr(reconciler_mod, "handle_verdict", fake_verdict)
    monkeypatch.setattr(reconciler_mod, "handle_status_start", AsyncMock())
    _patch_issues(monkeypatch, [
        {"id": "iss-appr", "state": {"id": _APPROVED}, "updated_at": _OLD_TS},
    ])

    recon = Reconciler()
    recon.reconcile_plane_once()

    assert recon.unblocked_total == 1
    assert len(sent) == 1
    assert "ET-005" in sent[0]
# ---------------------------------------------------------------------------
# TC-07 (AC-6): lost In Progress start (task appears) and lost Rejected
# rollback (stage moves) each fire exactly one unblock.
# ---------------------------------------------------------------------------
def test_tc07_in_progress_start_and_rejected_each_one_unblock(
    monkeypatch, single_project
):
    """A lost In Progress start and a lost Rejected rollback each notify once.

    The fake start handler creates the missing task; the fake verdict handler
    moves the existing task to ``development``. Both are real changes, so two
    unblocks and two messages are expected from a single tick.
    """
    _patch_states_with_terminals(monkeypatch)
    sent = _spy_telegram(monkeypatch)

    async def fake_start(issue_data, project_id):
        # Simulate the real start handler creating the task.
        _make_task(issue_data["id"], stage="analysis", wi="ET-006")

    async def fake_verdict(issue_data, project_id, approved=True):
        conn = get_db()
        try:
            conn.execute(
                "UPDATE tasks SET stage='development' WHERE plane_id=?",
                (issue_data["id"],),
            )
            conn.commit()
        finally:
            # Close even if the UPDATE/commit raises, so a failure here does
            # not leak a connection into later tests.
            conn.close()

    monkeypatch.setattr(reconciler_mod, "handle_status_start", fake_start)
    monkeypatch.setattr(reconciler_mod, "handle_verdict", fake_verdict)

    # Rejected task already exists at review; In Progress one has no task yet.
    _make_task("iss-rej", stage="review", wi="ET-007")
    _patch_issues(monkeypatch, [
        {"id": "iss-start", "state": {"id": _IN_PROGRESS}, "updated_at": _OLD_TS},
        {"id": "iss-rej", "state": {"id": _REJECTED}, "updated_at": _OLD_TS},
    ])

    recon = Reconciler()
    recon.reconcile_plane_once()

    assert recon.unblocked_total == 2
    assert len(sent) == 2
# ---------------------------------------------------------------------------
# TC-08 (AC-8): never-raise — a failing dependency isolates to its unit of work.
# ---------------------------------------------------------------------------
def test_tc08_never_raise_isolation(monkeypatch, single_project):
    """``reconcile_plane_once`` does not raise when dispatch or listing raises."""
    _patch_states_with_terminals(monkeypatch)

    def discard(msg):
        return None

    monkeypatch.setattr(reconciler_mod, "send_telegram", discard)

    # _dispatch blows up for one issue -> isolated; the tick must not crash.
    def boom_dispatch(*args, **kwargs):
        raise RuntimeError("handler exploded")

    monkeypatch.setattr(Reconciler, "_dispatch", staticmethod(boom_dispatch))
    _make_task("iss-boom", stage="review", wi="ET-008")
    failing_issue = {
        "id": "iss-boom",
        "state": {"id": _APPROVED},
        "updated_at": _OLD_TS,
    }
    _patch_issues(monkeypatch, [failing_issue])

    reconciler = Reconciler()
    reconciler.reconcile_plane_once()  # must NOT raise
    assert reconciler.unblocked_total == 0

    # list_issues_by_state raising -> per-project isolation, still no crash.
    def boom_list(pid, states):
        raise RuntimeError("plane down")

    monkeypatch.setattr(reconciler_mod, "list_issues_by_state", boom_list)
    reconciler.reconcile_plane_once()  # must NOT raise
# ---------------------------------------------------------------------------
# TC-09 (AC-9): kill-switches mute F-2.
# ---------------------------------------------------------------------------
def test_tc09_kill_switches(monkeypatch, single_project):
    """With either kill-switch off, ``list_issues_by_state`` is never called."""
    _patch_handlers(monkeypatch)
    _patch_states_with_terminals(monkeypatch)
    polls = []

    def counting_list(pid, states):
        polls.append(pid)
        return [{"id": "iss-x", "state": {"id": _APPROVED}, "updated_at": _OLD_TS}]

    monkeypatch.setattr(reconciler_mod, "list_issues_by_state", counting_list)
    cfg = reconciler_mod.settings

    # Global switch off -> F-2 never runs.
    monkeypatch.setattr(cfg, "reconcile_enabled", False)
    Reconciler().reconcile_plane_once()
    assert len(polls) == 0

    # Global switch back on, F-2 switch off -> still no poll.
    monkeypatch.setattr(cfg, "reconcile_enabled", True)
    monkeypatch.setattr(cfg, "reconcile_plane_enabled", False)
    Reconciler().reconcile_plane_once()
    assert len(polls) == 0
# ---------------------------------------------------------------------------
# TC-10 (AC-1, AC-2): end-to-end on BOTH registry projects (enduro AND
# orchestrator): a Done task on each -> 0 notifications / 0 jobs, regardless
# of per-project status aliasing. The headline regression test.
# ---------------------------------------------------------------------------
def test_tc10_done_silent_on_all_projects(monkeypatch):
    """A Done issue on every registry project: no messages, jobs, or handler calls."""
    from src import projects as projects_mod

    projects_mod.reload_projects()
    distinct_project_ids = {p.plane_project_id for p in projects_mod.PROJECTS}
    assert len(distinct_project_ids) >= 2

    start_handler, verdict_handler = _patch_handlers(monkeypatch)
    telegram_log = _spy_telegram(monkeypatch)

    states = dict(
        in_progress=_IN_PROGRESS,
        approved=_APPROVED,
        rejected=_REJECTED,
        done=_DONE,
        cancelled=_CANCELLED,
    )
    groups = {_DONE: "completed", _CANCELLED: "cancelled"}

    def fake_states(pid):
        return states

    def fake_groups(pid):
        return groups

    def fake_list(pid, st):
        # Each project returns a Done issue (unique id per project).
        return [{"id": f"done-{pid}", "state": {"id": _DONE}, "updated_at": _OLD_TS}]

    monkeypatch.setattr(reconciler_mod, "get_project_states", fake_states)
    monkeypatch.setattr(reconciler_mod, "get_project_state_groups", fake_groups)
    monkeypatch.setattr(reconciler_mod, "list_issues_by_state", fake_list)

    reconciler = Reconciler()
    reconciler.reconcile_plane_once()

    start_handler.assert_not_called()
    verdict_handler.assert_not_called()
    assert telegram_log == []
    assert reconciler.unblocked_total == 0
    assert reconciler.skipped_terminal_total >= 2  # one per project
    assert _job_count() == 0