Фундамент тиража 10-common (эпик ORCH-10): платформа разворачивается на
новой инфре без правки кода — только env/конфиг. Каждый дефолт = боевому
значению (пустой .env => поведение 1:1, kill-switch-природа, NFR-2);
STAGE_TRANSITIONS/QG_CHECKS/check_*/machine-verdict/схема БД не тронуты.
- config: agent_home_dir / agent_git_name / git_email_domain / staging_port
(ADR-001 D2/D4); код-блокеры A1-A4 закрыты: plane_sync ссылки из
gitea_public_url+gitea_owner, launcher - единый agent_git_env() (x2 места),
self_deploy/post_deploy - HOME+домен из Settings (имена системных акторов -
платформенные литералы)
- image_freshness: staging_port из конфига + fail-closed guard
staging_port == прод-порт -> отказ ДО ssh/build (инвариант ORCH-058 AC-9
стал исполняемым); REPO= передаётся хуку явно обоими инвокерами (D7)
- SELF_HOSTING_REPO - нормативная платформенная константа (D3, пин-тест)
- compose: полная ${VAR:-default}-интерполяция (реестр B, карта D6); группа
ORCH-040 uid/gid/HOME/маунты двигается согласованно (build.args APP_*);
group_add "МИНА 1" сохранён x3; оба app-сервиса с явным command:
- Dockerfile: ARG APP_UID/APP_GID/APP_USER/APP_HOME (CMD exec-form 8500
сознательно не тронут - D5); deploy-hook: REPO="${REPO:-...}" (D1 реестра)
- секреты: stdlib scripts/gen_secrets.py (token_hex(32); печать по умолчанию;
--write никогда не перезаписывает существующий .env молча, exit=2;
перезапись только --force); .env.example дополнен до полноты ключей старта
- доки: новый docs/operations/REPLICATION.md (карта env, чек-лист секретов,
smoke-процедура с PASS/FAIL, границы 10-common/Lite/Bundled), INFRA.md,
README, CLAUDE.md, CHANGELOG
- анти-регресс: tests/test_no_host_hardcodes.py (tokenize-сканер запрещённых
литералов, config-модули - структурное исключение, allowlist пуст,
негативная самопроверка) + test_host_config_keys / test_infra_parametrization
/ test_secrets_gen / test_replication_smoke; согласованные структурные
правки test_orch040_compose (судит резолв дефолтов) и
test_deploy_hook_rollback_sim (REPO через env-override = контракт D7)
Полный регресс: 1764 passed.
Refs: ORCH-101
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
425 lines
19 KiB
Python
425 lines
19 KiB
Python
"""Executable self-deploy primitives (ORCH-036).
|
||
|
||
The ``deploy`` stage for the self-hosting ``orchestrator`` repo is a REAL prod
|
||
restart, not a paper LLM verdict. Because the prod container (8500) runs the
|
||
worker/agent itself, the restart must be performed by an EXTERNAL host process
|
||
that survives the container dying (BR-2). The orchestration is split into three
|
||
deterministic phases (ADR-001), wired in ``stage_engine``:
|
||
|
||
* Phase A — request approve on the ``deploy-staging -> deploy`` edge.
|
||
* Phase B — a human Plane ``Approved`` initiates the detached host deploy.
|
||
* Phase C — a deterministic finalizer maps the hook exit-code -> deploy_status.
|
||
|
||
This module is a **leaf**: it imports only config / git_worktree (and lazily
|
||
``qg.checks.is_self_hosting_repo``), never ``stage_engine`` / ``launcher`` — the
|
||
orchestration that needs those lives in ``stage_engine``. Every public helper
|
||
honours a **never-raise** contract so a deploy-state hiccup can never crash the
|
||
stage engine.
|
||
|
||
Restart-safe state lives in sentinel files under
|
||
``<repos_dir>/.deploy-state-<repo>/<work_item_id>/`` (mirrors the merge-lease
|
||
pattern, ТЗ §4 — no DB migration), on the shared mount visible to BOTH the
|
||
container (reads markers) and the host (writes ``result``):
|
||
* ``approve-requested`` — Phase A done;
|
||
* ``initiated`` — Phase B started (idempotency-guard);
|
||
* ``result`` — the hook exit-code, written by the host WRAPPER
|
||
(``echo $? > result``), NOT by the hook itself.
|
||
"""
|
||
|
||
import logging
|
||
import os
|
||
import shlex
|
||
import shutil
|
||
import subprocess
|
||
|
||
from .config import settings
|
||
|
||
# Module-level logger; the name is fixed rather than derived from __name__.
logger = logging.getLogger("orchestrator.self_deploy")

# Sentinel marker filenames stored in the per-work-item deploy-state directory
# (see the module docstring for the meaning of each one).
APPROVE_REQUESTED = "approve-requested"  # Phase A done
INITIATED = "initiated"  # Phase B started (idempotency guard)
RESULT = "result"  # hook exit-code, written by the host wrapper

# Subprocess timeouts, in seconds. The ssh launch is detached and returns
# immediately, so its timeout only bounds a hung ssh handshake and keeps it
# from wedging the caller.
_SSH_TIMEOUT = 30
_GIT_TIMEOUT = 60
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Conditionality
|
||
# ---------------------------------------------------------------------------
|
||
def self_deploy_applies(repo: str) -> bool:
    """Decide whether executable self-deploy (Phase A/B/C) is live for ``repo``.

    Resolution order (same conditional rollout as ORCH-35 / ORCH-43):

    1. ``settings.self_deploy_enabled`` is falsy -> ``False`` (global
       kill-switch; the legacy synchronous deployer path runs for everyone).
    2. ``settings.self_deploy_repos`` (CSV) is non-empty -> ``True`` only when
       ``repo`` is listed (compared trimmed and case-insensitively).
    3. Empty CSV -> whatever ``qg.checks.is_self_hosting_repo(repo)`` answers.

    Any exception is logged as a warning and reported as ``False``; this
    function never raises.
    """
    try:
        if not settings.self_deploy_enabled:
            return False

        csv = (settings.self_deploy_repos or "").strip()
        if not csv:
            # Imported lazily so that loading this module never pulls in qg
            # (keeps the module a leaf).
            from .qg.checks import is_self_hosting_repo

            return is_self_hosting_repo(repo)

        listed = set()
        for entry in csv.split(","):
            entry = entry.strip()
            if entry:
                listed.add(entry.lower())
        wanted = (repo or "").strip().lower()
        return wanted in listed
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("self_deploy_applies error for %s: %s", repo, e)
        return False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# exit-code -> deploy_status mapping (pure, unit-tested: TC-01/02/03)
|
||
# ---------------------------------------------------------------------------
|
||
def map_exit_code_to_status(exit_code) -> str:
    """Map a deploy-hook exit-code to a machine verdict (deterministic, pure).

    Contract (AC-1 / AC-3, hook exit-code contract 0/1/2):

    * ``0`` -> ``"SUCCESS"`` (health-ok proven by the hook).
    * ``1`` (rolled back), ``2`` (rollback also failed), anything else, or a
      non-int/None -> ``"FAILED"`` (fail-closed; never advances on doubt).

    Values that ``int()`` can convert exactly (ints, digit strings such as
    ``"0"``, integral floats) are accepted. Non-integral floats, ``nan`` and
    infinities are rejected as ``"FAILED"``. Never raises for these inputs.
    """
    # int() truncates toward zero, so 0.5 would become 0 and be reported as
    # SUCCESS. nan/inf also fail is_integer(), so int() never sees them.
    if isinstance(exit_code, float) and not exit_code.is_integer():
        return "FAILED"
    try:
        code = int(exit_code)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers non-float numeric types whose __int__ rejects
        # infinities (e.g. Decimal("Infinity")).
        return "FAILED"
    return "SUCCESS" if code == 0 else "FAILED"
|
||
|
||
|
||
def build_deploy_log(work_item_id: str, exit_code, status: str) -> str:
    """Render the text of ``14-deploy-log.md`` for a finalized deploy.

    The result is a YAML frontmatter block followed by a short human-readable
    note. Only the frontmatter is machine-read: its ``deploy_status:`` line is
    the verdict that ``check_deploy_status`` / ``_parse_deploy_status`` consume
    (contract unchanged, AC-10). The body is informational only.

    Args:
        work_item_id: identifier written to the ``work_item:`` field.
        exit_code: hook exit-code, interpolated as-is into the frontmatter and body.
        status: verdict written to the ``deploy_status:`` field.

    Returns:
        The complete file content as a single string.
    """
    front_matter = "\n".join(
        [
            "---",
            f"deploy_status: {status}",
            f"work_item: {work_item_id}",
            f"hook_exit_code: {exit_code}",
            "deployed_by: deploy-finalizer",
            "---",
        ]
    )
    narrative = (
        "# Deploy log — ORCH-036 executable self-deploy\n\n"
        f"Прод-деплой завершён хост-хуком с exit-code `{exit_code}` -> "
        f"`deploy_status: {status}`.\n\n"
        "Вердикт зафиксирован детерминированным finalizer'ом (Фаза C), не LLM.\n"
    )
    # One blank line separates the closing delimiter from the body.
    return front_matter + "\n\n" + narrative
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Sentinel state (restart-safe, no DB migration — ТЗ §4)
|
||
# ---------------------------------------------------------------------------
|
||
def _state_dir(base: str, repo: str, work_item_id: str | None) -> str:
|
||
return os.path.join(base, f".deploy-state-{repo}", (work_item_id or "_"))
|
||
|
||
|
||
def container_state_dir(repo: str, work_item_id: str | None) -> str:
    """Return the deploy-state directory as seen FROM THE CONTAINER.

    The path is rooted at ``settings.repos_dir`` (the container-side mount).
    """
    container_root = settings.repos_dir
    return _state_dir(container_root, repo, work_item_id)
|
||
|
||
|
||
def host_state_dir(repo: str, work_item_id: str | None) -> str:
    """Return the deploy-state directory as seen FROM THE HOST.

    Rooted at ``settings.host_repos_dir``. Via the shared mount this is the
    same physical directory as ``container_state_dir``. The host path is the
    one embedded in the ssh command, so that the host wrapper writes the
    ``result`` sentinel where the container can read it.
    """
    host_root = settings.host_repos_dir
    return _state_dir(host_root, repo, work_item_id)
|
||
|
||
|
||
def marker_path(repo: str, work_item_id: str | None, name: str) -> str:
    """Return the container-side path of the sentinel file ``name``."""
    state_dir = container_state_dir(repo, work_item_id)
    return os.path.join(state_dir, name)
|
||
|
||
|
||
def has_marker(repo: str, work_item_id: str | None, name: str) -> bool:
    """Report whether the sentinel ``name`` exists as a regular file.

    Any exception is logged as a warning and reported as ``False``; this
    function never raises.
    """
    try:
        target = marker_path(repo, work_item_id, name)
        found = os.path.isfile(target)
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("has_marker error for %s/%s/%s: %s", repo, work_item_id, name, e)
        found = False
    return found
|
||
|
||
|
||
def write_marker(repo: str, work_item_id: str | None, name: str, content: str = "") -> bool:
    """Create or overwrite the sentinel ``name`` (best-effort).

    The state directory is created when missing and ``str(content)`` is written
    as UTF-8. An ``OSError`` is logged as a warning and reported as ``False``.

    Returns:
        ``True`` when the file was written, ``False`` on an ``OSError``.
    """
    try:
        state_dir = container_state_dir(repo, work_item_id)
        os.makedirs(state_dir, exist_ok=True)
        target = os.path.join(state_dir, name)
        with open(target, "w", encoding="utf-8") as fh:
            fh.write(str(content))
    except OSError as e:
        logger.warning("write_marker error for %s/%s/%s: %s", repo, work_item_id, name, e)
        return False
    return True
|
||
|
||
|
||
def clear_state(repo: str, work_item_id: str | None) -> bool:
    """Delete the whole deploy-state directory of this work item (best-effort).

    Why the whole directory: sentinels are keyed by ``work_item_id``, which is
    stable for the task's lifetime, so a FAILED prod deploy leaves
    ``approve-requested`` / ``initiated`` / ``result`` behind. After the БАГ-8
    rollback (deploy -> development) and a fix, the task would reach ``deploy``
    again and hit Phase B's idempotency guard: the stale ``initiated`` turns it
    into a no-op, the detached hook is never re-launched and the task is stuck
    on ``deploy`` (re-deploy-after-rollback contract broken; AC-4/AC-10). A
    stale ``result`` would likewise be mis-read by the next finalizer run.

    Idempotent: a missing directory counts as success.

    Returns:
        ``True`` when the directory is gone (removed or never existed),
        ``False`` when removal failed with another ``OSError`` (logged).
    """
    state_dir = container_state_dir(repo, work_item_id)
    try:
        shutil.rmtree(state_dir)
    except FileNotFoundError:
        # Nothing to clean up: already a clean slate.
        return True
    except OSError as e:  # noqa: BLE001 - never-raise contract
        logger.warning("clear_state error for %s/%s: %s", repo, work_item_id, e)
        return False
    else:
        logger.info("clear_state: removed deploy-state dir %s", state_dir)
        return True
|
||
|
||
|
||
def read_result(repo: str, work_item_id: str | None) -> tuple[bool, int | None]:
    """Read the ``result`` sentinel (hook exit-code written by the host wrapper).

    Returns ``(present, exit_code)``:

    * ``(False, None)`` -> not written yet, unreadable (``OSError``) or still
      empty (finalizer should DEFER);
    * ``(True, <int>)`` -> verdict ready;
    * ``(True, 1)`` -> present but corrupt (not an integer, or not valid
      UTF-8) -> treated as a failure code (fail-closed) so we never advance
      on garbage.

    Never raises.
    """
    p = marker_path(repo, work_item_id, RESULT)
    try:
        with open(p, "r", encoding="utf-8") as f:
            raw = f.read().strip()
    except FileNotFoundError:
        return False, None
    except OSError as e:
        logger.warning("read_result error for %s/%s: %s", repo, work_item_id, e)
        return False, None
    except UnicodeDecodeError as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so it would
        # otherwise escape. Undecodable bytes mean the file exists but holds
        # garbage: same fail-closed verdict as an unparseable number.
        logger.warning("read_result: undecodable result for %s/%s: %s", repo, work_item_id, e)
        return True, 1

    if raw == "":
        # An empty file is reported the same way as a missing one: not ready.
        return False, None
    try:
        return True, int(raw)
    except ValueError:
        logger.warning("read_result: corrupt result %r for %s/%s", raw, repo, work_item_id)
        return True, 1
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Detached host deploy: ssh + setsid (Phase B)
|
||
# ---------------------------------------------------------------------------
|
||
def build_deploy_command(repo: str, work_item_id: str | None, branch: str) -> list[str]:
    """Assemble the ssh argv that starts the DETACHED prod deploy on the host.

    The remote command has this shape:

        setsid bash -c 'cd <repo> && <prod env...> bash <hook> --deploy; \
                        echo $? > <result>' >> <hook.log> 2>&1 </dev/null &

    * ``setsid`` + detached stdin/stdout + ``&`` let the process SURVIVE the
      prod container restart (BR-2).
    * The WRAPPER (``echo $? > <result>``), not the hook, writes the exit-code
      to the ``result`` sentinel. It runs after ``;`` so it records a failure
      of the ``cd`` as well.
    * Build-once (BR-6): ``SOURCE_IMAGE=<staging-image>`` makes the hook retag
      the staging-validated image to the prod tag instead of rebuilding.
    * Provenance guard (ORCH-058, Strategy B): when
      ``image_freshness.expected_revision`` returns a non-empty SHA it is
      passed as ``EXPECTED_REVISION=<sha>`` so the hook can fail closed before
      ``docker tag`` on a mismatch. When it returns ``""`` (feature inactive)
      the variable is omitted and the hook keeps its backward-compatible
      "no provenance check" behaviour (AC-5 / AC-7).

    The hook's exit-code contract (0/1/2, ORCH-036) is untouched.

    Returns:
        ``["ssh", "-o", "StrictHostKeyChecking=no", <target>, <remote command>]``.
    """
    from . import image_freshness

    state_dir = host_state_dir(repo, work_item_id)
    result_file = os.path.join(state_dir, RESULT)
    log_file = os.path.join(state_dir, "hook.log")

    quoted_repo_path = shlex.quote(settings.deploy_host_repo_path)

    # ORCH-101 (D7): REPO is passed EXPLICITLY, from the same setting the `cd`
    # uses, so the hook's env-override works on a parametrised host. The hook's
    # own default only serves manual operator runs.
    env_parts = [
        f"REPO={quoted_repo_path}",
        f"SOURCE_IMAGE={shlex.quote(settings.deploy_prod_source_image)}",
        f"TARGET_SERVICE={shlex.quote(settings.deploy_prod_target_service)}",
        f"TARGET_PORT={int(settings.deploy_prod_target_port)}",
        f"TARGET_IMAGE={shlex.quote(settings.deploy_prod_target_image)}",
        f"COMPOSE_PROFILE={shlex.quote(settings.deploy_prod_compose_profile)}",
        f"PREV_IMAGE_FILE={shlex.quote(settings.deploy_prod_prev_image_file)}",
    ]
    revision = image_freshness.expected_revision(repo, branch)
    if revision:
        env_parts.append(f"EXPECTED_REVISION={shlex.quote(revision)}")
    env_prefix = " ".join(env_parts)

    # The script handed to `bash -c`; quoted as a single word further down.
    hook_call = "bash " + shlex.quote(settings.deploy_hook_script) + " --deploy"
    script = (
        "cd " + quoted_repo_path + " && "
        + env_prefix + " "
        + hook_call + "; "
        + "echo $? > " + shlex.quote(result_file)
    )
    remote_command = (
        "setsid bash -c " + shlex.quote(script)
        + " >> " + shlex.quote(log_file)
        + " 2>&1 </dev/null &"
    )

    ssh_user = (settings.deploy_ssh_user or "").strip()
    ssh_host = (settings.deploy_ssh_host or "").strip()
    destination = ssh_host if not ssh_user else f"{ssh_user}@{ssh_host}"
    return ["ssh", "-o", "StrictHostKeyChecking=no", destination, remote_command]
|
||
|
||
|
||
def initiate_deploy(repo: str, work_item_id: str | None, branch: str) -> tuple[bool, str]:
    """Launch the detached prod deploy on the host (Phase B). Never raises.

    The ssh call returns immediately (the remote process is detached via
    setsid + ``&``).

    Returns:
        ``(True, msg)`` when ssh dispatched the detached process, or
        ``(False, reason)`` so the caller can alert and let the human
        re-approve. ``reason`` covers: the command could not be built, the ssh
        launch timed out, ssh could not be started, or ssh exited non-zero.
    """
    # Ensure the shared state dir exists so the host wrapper can write `result`.
    # A failure here is only logged: the ssh launch is still attempted.
    try:
        os.makedirs(container_state_dir(repo, work_item_id), exist_ok=True)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("initiate_deploy: state dir error for %s/%s: %s", repo, work_item_id, e)

    # Building the command reads several settings and calls image_freshness;
    # any failure there (e.g. a non-numeric port) must surface as a
    # (False, reason) result rather than an exception.
    try:
        cmd = build_deploy_command(repo, work_item_id, branch)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning(
            "initiate_deploy: cannot build deploy command for %s/%s: %s", repo, work_item_id, e
        )
        return False, f"deploy command build error: {e}"

    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=_SSH_TIMEOUT)
    except subprocess.TimeoutExpired:
        return False, "ssh launch timeout"
    except (subprocess.SubprocessError, OSError) as e:
        return False, f"ssh launch error: {e}"
    if r.returncode != 0:
        # Keep the reason short: it is meant for an alert, not a full log.
        detail = ((r.stderr or "") + (r.stdout or "")).strip()[:200]
        return False, f"ssh launch failed (rc={r.returncode}): {detail}"
    logger.info("initiate_deploy: detached prod deploy dispatched for %s/%s", repo, work_item_id)
    return True, "deploy initiated (detached host process)"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Deploy log write + best-effort merge (Phase C)
|
||
# ---------------------------------------------------------------------------
|
||
def _git_output_tail(raw) -> str:
    """Return up to 200 characters of captured git output for a log line."""
    if isinstance(raw, bytes):
        # Decode leniently: a decoding error must never break the best-effort path.
        raw = raw.decode("utf-8", "replace")
    return (raw or "").strip()[:200]


def write_deploy_log(repo: str, work_item_id: str, branch: str, exit_code, status: str) -> bool:
    """Write 14-deploy-log.md into the task worktree and best-effort commit+push it.

    The file is written to ``docs/work-items/<work_item_id>/14-deploy-log.md``
    inside the worktree so that ``check_deploy_status`` can read it. The
    subsequent ``git add`` / ``commit`` / ``push`` are best-effort: their
    failures are logged but do not change the return value.

    Returns:
        ``True`` iff the file was written; ``False`` when the worktree could
        not be resolved or the write failed. Never raises.
    """
    from .git_worktree import get_worktree_path

    rel = f"docs/work-items/{work_item_id}/14-deploy-log.md"
    try:
        wt = get_worktree_path(repo, branch)
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.error("write_deploy_log: worktree error for %s/%s: %s", repo, branch, e)
        return False
    if not wt:
        # get_worktree_path's return contract is not visible here. A None path
        # would make os.path.join raise, and an empty one would write the log
        # relative to the current working directory, so treat both as failure.
        logger.error("write_deploy_log: no worktree path for %s/%s", repo, branch)
        return False

    path = os.path.join(wt, rel)
    content = build_deploy_log(work_item_id, exit_code, status)
    try:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)
    except OSError as e:
        logger.error("write_deploy_log: write error at %s: %s", path, e)
        return False

    # Best-effort commit + push (the gate also falls back to origin/main).
    # ORCH-101 (A3): HOME + email domain come from Settings; the actor NAME
    # stays the platform literal `deploy-finalizer` (D2 - distinguishable
    # system-actor commits, not host-specific).
    _email = f"deploy-finalizer@{settings.git_email_domain}"
    git_env = {
        **os.environ,
        "HOME": settings.agent_home_dir,
        "GIT_AUTHOR_NAME": "deploy-finalizer",
        "GIT_AUTHOR_EMAIL": _email,
        "GIT_COMMITTER_NAME": "deploy-finalizer",
        "GIT_COMMITTER_EMAIL": _email,
    }
    try:
        add = subprocess.run(["git", "-C", wt, "add", rel],
                             capture_output=True, timeout=_GIT_TIMEOUT, env=git_env)
        if add.returncode != 0:
            logger.warning("write_deploy_log: git add failed (rc=%s): %s",
                           add.returncode, _git_output_tail(add.stderr))
        commit = subprocess.run(
            ["git", "-C", wt, "commit", "-m",
             f"deploy(ORCH-036): finalize {status} for {work_item_id}"],
            capture_output=True, text=True, timeout=_GIT_TIMEOUT, env=git_env,
        )
        if commit.returncode == 0:
            push = subprocess.run(["git", "-C", wt, "push", "origin", branch],
                                  capture_output=True, timeout=_GIT_TIMEOUT, env=git_env)
            if push.returncode != 0:
                logger.warning("write_deploy_log: git push failed (rc=%s): %s",
                               push.returncode, _git_output_tail(push.stderr))
        else:
            # A non-zero commit also happens when there is nothing new to
            # commit, so this is informational rather than a warning.
            logger.info("write_deploy_log: git commit skipped/failed (rc=%s): %s",
                        commit.returncode,
                        _git_output_tail(commit.stderr or commit.stdout))
    except (subprocess.SubprocessError, OSError) as e:
        logger.warning("write_deploy_log: git commit/push best-effort failed: %s", e)
    return True
|
||
|
||
|
||
def _set_merged_to_main(content: str, value: str) -> str | None:
    """Return ``content`` with ``merged_to_main: <value>`` set in its frontmatter.

    An existing ``merged_to_main:`` line is replaced; otherwise the line is
    appended as the last frontmatter entry. Everything after the closing
    delimiter is preserved verbatim. Returns ``None`` when ``content`` has no
    opening ``---`` or no closing ``---`` at the start of a later line.
    """
    if not content.startswith("---"):
        return None
    # The closing delimiter must START a line. Looking for a bare "---"
    # anywhere would cut the block short at a value such as `note: a---b`
    # and corrupt the file.
    close = content.find("\n---", 3)
    if close == -1:
        return None
    front = content[3:close]
    rest = content[close + 4:]

    stamped = f"merged_to_main: {value}"
    lines = []
    replaced = False
    for ln in front.splitlines():
        if ln.strip().lower().startswith("merged_to_main:"):
            lines.append(stamped)
            replaced = True
        else:
            lines.append(ln)
    if not replaced:
        lines.append(stamped)
    # strip("\n") drops the empty first entry left by the newline that follows
    # the opening delimiter, plus any blank lines at either end of the block.
    return "---\n" + "\n".join(lines).strip("\n") + "\n---" + rest


def record_merged_to_main(repo: str, work_item_id: str, branch: str, merged: bool) -> bool:
    """Stamp ``merged_to_main: true|false`` into 14-deploy-log.md frontmatter (ORCH-071).

    Machine-readable observability for the merge-verify under-gate. ONLY the
    ``merged_to_main:`` line is added/updated inside the YAML frontmatter
    block; the ``deploy_status:`` field is left untouched, so the
    ``check_deploy_status`` / ``_parse_deploy_status`` parsing contract is
    unchanged (TRZ §6 / AC §5).

    Best-effort and idempotent: a missing log, a log without a complete
    frontmatter block, or any read/write error is logged and swallowed.

    Returns:
        ``True`` when the file was rewritten with the stamp, ``False``
        otherwise. Never raises.
    """
    from .git_worktree import get_worktree_path

    rel = f"docs/work-items/{work_item_id}/14-deploy-log.md"
    try:
        wt = get_worktree_path(repo, branch)
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("record_merged_to_main: worktree error for %s/%s: %s", repo, branch, e)
        return False
    path = os.path.join(wt, rel)
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        logger.info("record_merged_to_main: no deploy log at %s (skip)", path)
        return False
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError; catching it keeps
        # the never-raise contract for a log that is not valid UTF-8.
        logger.warning("record_merged_to_main: read error at %s: %s", path, e)
        return False

    if not content.startswith("---"):
        # No frontmatter to amend - do not fabricate one (keep the contract minimal).
        logger.info("record_merged_to_main: no frontmatter in %s (skip)", path)
        return False
    new_content = _set_merged_to_main(content, "true" if merged else "false")
    if new_content is None:
        # Opening delimiter without a closing one: leave the file alone.
        return False
    try:
        with open(path, "w", encoding="utf-8") as f:
            f.write(new_content)
    except OSError as e:
        logger.warning("record_merged_to_main: write error at %s: %s", path, e)
        return False
    return True
|