Reconciler F-2 spammed Telegram "<wi> разблокирована" every ~120s for a
fully-synchronized Done task (incident ET-002, 191+ msgs/night) after the
ORCH-066 Plane status model merge. Two stacked defects (defense in depth):
- D1 (selection): actionable states were told apart by bare UUID, so a Done
issue aliased onto the approved UUID entered the approved branch. Now
terminal states are excluded by Plane state GROUP (completed/cancelled),
a project-independent discriminator robust to UUID aliasing; per-issue
check with a logical-key fallback when the group is unavailable.
get_project_states caches {uuid -> group} from the same /states/ fetch;
new sibling accessor get_project_state_groups.
- D2 (notification): _note_unblock fired unconditionally after _dispatch.
Now it only fires on a confirmed state change (stage before/after _dispatch;
task-appears for the start case) — handlers' contracts untouched.
- TR-3: in-memory dedup guard {issue_id -> last unblocked state} as a backstop.
- TR-4: _STATES_CACHE lived for the whole process lifetime, so a new Plane
status was invisible without a restart. Added TTL ORCH_PLANE_STATES_TTL_S
(default 300s; 0 = previous lifetime cache) reusing reload_project_states();
a failed refresh serves the stale-but-correct set, not enduro defaults.
STAGE_TRANSITIONS / QG_CHECKS / DB schema / handle_* contracts / F-1 / F-3
unchanged; never-raise preserved; self-hosting tick never restarts prod.
Observability: skipped_terminal_total / deduped_total in /queue reconcile block.
Tests: tests/test_reconciler_plane.py (TC-01..TC-10),
tests/test_plane_states_cache.py (TC-11/TC-12).
Refs: ORCH-068
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
181 lines
6.7 KiB
Python
181 lines
6.7 KiB
Python
"""ORCH-068 (TR-4): tests for the Plane states cache TTL self-heal.
|
|
|
|
The per-project ``_STATES_CACHE`` used to live for the whole process lifetime,
|
|
so a status added to Plane after start was never seen without a restart
|
|
("stale set -> no pipeline action"). ORCH-068 adds a TTL: an entry is
|
|
re-fetched once it is older than ``plane_states_ttl_s`` (default 300s); ``0``
|
|
disables the TTL (strictly the previous lifetime cache).
|
|
|
|
All tests are offline: the Plane API (httpx) and the monotonic clock are mocked.
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
# Placeholder Plane/Gitea settings for the offline test run. ``setdefault``
# keeps any value already present in the environment.
# NOTE(review): presumably these are read when ``src.plane_sync`` is imported,
# which is why they are set before that import — confirm against the settings
# module.
os.environ.setdefault("ORCH_PLANE_API_URL", "http://plane.local")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_WORKSPACE_SLUG", "test-ws")
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")

# Unlike the defaults above, the DB path is overwritten unconditionally so
# this module always points at its own file in the system temp directory.
_test_db = os.path.join(tempfile.gettempdir(), "test_plane_states_cache.db")
os.environ["ORCH_DB_PATH"] = _test_db

# Imported after the environment is prepared (hence the E402 suppression).
import src.plane_sync as ps  # noqa: E402

# Arbitrary project id used by the TTL / fallback tests.
_PROJECT = "proj-ttl"
# Project id used by the enduro-compatibility test (TC-12).
_ET_PROJECT = "7a79f0a9-5278-49cd-9007-9a338f238f9c"
|
|
|
|
|
|
def _resp(data: dict, status: int = 200):
|
|
m = MagicMock()
|
|
m.status_code = status
|
|
m.json.return_value = data
|
|
if status >= 400:
|
|
from httpx import HTTPStatusError
|
|
m.raise_for_status.side_effect = HTTPStatusError(
|
|
"error", request=MagicMock(), response=MagicMock()
|
|
)
|
|
else:
|
|
m.raise_for_status.return_value = None
|
|
return m
|
|
|
|
|
|
def _states_response(in_progress_uuid: str) -> dict:
|
|
"""A minimal /states/ payload; In Progress carries the given UUID."""
|
|
return {
|
|
"results": [
|
|
{"id": in_progress_uuid, "name": "In Progress", "group": "started"},
|
|
{"id": "uuid-done", "name": "Done", "group": "completed"},
|
|
]
|
|
}
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def reset_cache():
    """Call ``ps.reload_project_states()`` before and after every test.

    Autouse, so each test in this module is bracketed by the reload without
    requesting the fixture explicitly.
    """
    ps.reload_project_states()
    yield
    ps.reload_project_states()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-11 (AC-12): a stale cache entry self-heals after the TTL — no restart.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc11_stale_cache_refreshes_after_ttl(monkeypatch):
    """With a 300s TTL, a cached entry is re-fetched only after it expires.

    The monotonic clock and ``httpx.get`` are patched, so the test controls
    both the elapsed time and what each fetch returns.
    """
    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300)
    clock = {"t": 1000.0}
    monkeypatch.setattr(ps.time, "monotonic", lambda: clock["t"])

    # A list side_effect hands out one response per call, in order.
    mock_get = MagicMock(side_effect=[
        _resp(_states_response("uuid-A")),  # first fetch: old set
        _resp(_states_response("uuid-B")),  # second fetch: new status appeared
    ])
    monkeypatch.setattr(ps.httpx, "get", mock_get)

    # t=1000: first call -> fetch set A.
    s1 = ps.get_project_states(_PROJECT)
    assert s1["in_progress"] == "uuid-A"
    assert mock_get.call_count == 1

    # t=1100: within TTL -> served from cache, no new fetch.
    clock["t"] = 1100.0
    s2 = ps.get_project_states(_PROJECT)
    assert s2["in_progress"] == "uuid-A"
    assert mock_get.call_count == 1

    # t=1400: TTL (300s) elapsed -> re-fetch -> fresh set B (self-heal).
    clock["t"] = 1400.0
    s3 = ps.get_project_states(_PROJECT)
    assert s3["in_progress"] == "uuid-B"
    assert mock_get.call_count == 2
|
|
|
|
|
|
def test_tc11_ttl_zero_keeps_lifetime_cache(monkeypatch):
    """plane_states_ttl_s=0 -> strictly the previous lifetime cache (back-compat).

    A second response ("uuid-B") is queued but must never be consumed: the
    entry fetched first stays in use however far the patched clock advances.
    """
    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 0)
    clock = {"t": 1000.0}
    monkeypatch.setattr(ps.time, "monotonic", lambda: clock["t"])

    # A list side_effect hands out one response per call, in order.
    mock_get = MagicMock(side_effect=[
        _resp(_states_response("uuid-A")),
        _resp(_states_response("uuid-B")),
    ])
    monkeypatch.setattr(ps.httpx, "get", mock_get)

    assert ps.get_project_states(_PROJECT)["in_progress"] == "uuid-A"
    clock["t"] = 1_000_000.0  # far in the future
    # TTL disabled -> still the cached A, never re-fetched.
    assert ps.get_project_states(_PROJECT)["in_progress"] == "uuid-A"
    assert mock_get.call_count == 1
|
|
|
|
|
|
def test_tc11_groups_exposed_via_accessor(monkeypatch):
    """get_project_state_groups returns {uuid -> group} from the same record.

    The states are fetched once through ``get_project_states``; the accessor
    is then expected to report the group of each state id in that payload.
    """
    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300)
    monkeypatch.setattr(
        ps.httpx, "get", lambda *a, **k: _resp(_states_response("uuid-A"))
    )

    ps.get_project_states(_PROJECT)
    groups = ps.get_project_state_groups(_PROJECT)
    assert groups["uuid-A"] == "started"
    assert groups["uuid-done"] == "completed"
|
|
|
|
|
|
def test_tc11_groups_empty_when_uncached(monkeypatch):
    """No cache record (e.g. API fell back to defaults) -> groups == {}.

    Nothing is fetched here: the accessor is queried for a project id that
    no test has requested states for.
    """
    assert ps.get_project_state_groups("never-fetched") == {}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-12 (AC-13): default-config compatibility — enduro UUIDs + API-error fallback.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc12_enduro_uuids_unchanged(monkeypatch):
    """enduro project still resolves its own UUIDs (return shape unchanged).

    The mocked payload contains only "In Progress", so the test also checks
    that the absent ``done`` key comes from ``_DEFAULT_STATES``.
    """
    body = {
        "results": [
            {
                "id": "b873d9eb-993c-48cd-97ac-99a9b1623967",
                "name": "In Progress",
                "group": "started",
            },
        ]
    }
    monkeypatch.setattr(ps.httpx, "get", lambda *a, **k: _resp(body))
    states = ps.get_project_states(_ET_PROJECT)
    assert states["in_progress"] == "b873d9eb-993c-48cd-97ac-99a9b1623967"
    # Missing keys are still backfilled from _DEFAULT_STATES (complete mapping).
    assert states["done"] == ps._DEFAULT_STATES["done"]
|
|
|
|
|
|
def test_tc12_api_error_falls_back_to_defaults(monkeypatch):
    """API failure with nothing cached -> _DEFAULT_STATES (fallback preserved).

    ``httpx.get`` raises on every call; the result is expected to be the
    ``_DEFAULT_STATES`` object itself (identity, not just equality).
    """
    monkeypatch.setattr(
        ps.httpx, "get", MagicMock(side_effect=Exception("network error"))
    )
    states = ps.get_project_states(_PROJECT)
    assert states is ps._DEFAULT_STATES
|
|
|
|
|
|
def test_tc12_stale_served_when_refresh_fails(monkeypatch):
    """TTL expiry + transient API failure -> serve the stale (project-correct)
    set rather than reverting to enduro defaults.

    The first ``httpx.get`` call succeeds and every later call raises, so the
    refresh attempted after the TTL has elapsed is guaranteed to fail.
    """
    monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300)
    clock = {"t": 1000.0}
    monkeypatch.setattr(ps.time, "monotonic", lambda: clock["t"])

    calls = {"n": 0}

    def flaky_get(*a, **k):
        # Succeed exactly once, then fail on every subsequent call.
        calls["n"] += 1
        if calls["n"] == 1:
            return _resp(_states_response("uuid-A"))
        raise Exception("transient outage")

    monkeypatch.setattr(ps.httpx, "get", flaky_get)

    assert ps.get_project_states(_PROJECT)["in_progress"] == "uuid-A"
    clock["t"] = 2000.0  # past TTL -> refresh attempt fails
    states = ps.get_project_states(_PROJECT)
    assert states["in_progress"] == "uuid-A"  # stale-but-correct, not defaults
    assert states is not ps._DEFAULT_STATES
|