Improve deploy failure analysis

2026-04-22 22:05:15 +02:00 · 2026-04-22 22:05:15 +02:00 · 72290cd461
parent 185a866b51
commit 72290cd461
2 changed files with 57 additions and 3 deletions
--- a/codex_context.yaml
+++ b/codex_context.yaml
@@ -78,6 +78,12 @@ SESSION_STATE:
    D18: "Git commit created on 2026-04-22: 0abe9cb 'Improve deploy agent safety checks'."
    D19: "Updated ./deploy_agent.py to use local LLM for one bounded deployment-failure retry: capture service/error/status, request corrected YAML only, replace docker-compose.yml, retry once, then return final error plus last status if still failing."
    D20: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for one-shot LLM-assisted deployment failure recovery."
+    D21: "Git commit created on 2026-04-22: 185a866 'Add LLM-assisted deploy retry'."
+    D22: "Updated ./deploy_agent.py failure analysis to collect 'docker compose ps -q' container IDs, fetch per-container 'docker logs --tail=50', cap combined logs at 2000 chars, and include logs in the single-retry LLM correction prompt."
+    D23: "Fixed malformed duplicate function header introduced during D22 patch; deploy_agent.py function structure restored."
+    D24: "Updated deploy_agent.py status validation: deployment success now requires status containing 'Up' and not containing 'unhealthy' case-insensitively."
+    D25: "User reiterated file-only output expectation after status-validation request; no code change beyond D24."
+    D26: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for log-analysis and status-validation updates."
  todos:
    T1: "For all future meaningful changes/decisions, update and overwrite ./codex_context.yaml."
    T2: "DONE: Commit current changes."
@@ -94,6 +100,9 @@ SESSION_STATE:
    T13: "DONE: Add deployment status verification and basic port-80 safety check."
    T14: "DONE: Commit deploy agent safety/status updates."
    T15: "DONE: Add one-shot LLM-assisted deployment failure recovery."
+    T16: "DONE: Commit LLM-assisted deploy retry changes."
+    T17: "DONE: Add bounded container log analysis to deploy failure recovery."
+    T18: "DONE: Tighten deploy status validation against unhealthy containers."
  issues:
    I1: "Tailscale DNS health warning: configured DNS servers unreachable."
    I2: "Preferred gateway path unavailable: 100.108.208.3:8080 connection failed."
--- a/deploy_agent.py
+++ b/deploy_agent.py
@@ -50,13 +50,57 @@ def _is_valid_compose(compose: str) -> bool:
        return False
    return isinstance(parsed, dict) and "services" in parsed

+def _get_compose_logs(path: Path) -> str:
+    try:
+        ids_result = subprocess.run(
+            ["docker", "compose", "ps", "-q"],
+            cwd=path,
+            check=False,
+            capture_output=True,
+            text=True,
+        )
+    except Exception as exc:
+        return str(exc)

-def _fix_compose(service: str, error: str, status: str) -> str:
+    container_ids = [line.strip() for line in ids_result.stdout.splitlines() if line.strip()]
+    if not container_ids:
+        return ""
+
+    parts = []
+    total = 0
+    limit = 2000
+    for container_id in container_ids:
+        try:
+            logs_result = subprocess.run(
+                ["docker", "logs", container_id, "--tail=50"],
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+        except Exception as exc:
+            chunk = f"{container_id}:\n{exc}\n"
+        else:
+            chunk_body = logs_result.stdout or logs_result.stderr
+            chunk = f"{container_id}:\n{chunk_body}\n"
+        remaining = limit - total
+        if remaining <= 0:
+            break
+        if len(chunk) > remaining:
+            chunk = chunk[:remaining]
+        parts.append(chunk)
+        total += len(chunk)
+        if total >= limit:
+            break
+    return "".join(parts).strip()
+
+
+def _fix_compose(service: str, error: str, status: str, logs: str) -> str:
    prompt = (
        "Deployment failed.\n\n"
        f"Service: {service}\n\n"
        f"Error: {error}\n\n"
        f"Status: {status}\n\n"
+        f"Logs: {logs}\n\n"
        "Fix the docker-compose YAML. Return ONLY corrected YAML."
    )
    response = ask(prompt)
@@ -135,7 +179,8 @@ def deploy_service(service: str) -> str:
    ok, error = _run_compose_up(target_dir)
    if not ok:
        status = get_service_status(target_dir)
-        fixed_compose = _fix_compose(service, error, status)
+        logs = _get_compose_logs(target_dir)
+        fixed_compose = _fix_compose(service, error, status, logs)
        if fixed_compose.startswith("ERROR:"):
            if status and not status.startswith("ERROR:"):
                return f"ERROR: {error}\n{status}"
@@ -151,7 +196,7 @@ def deploy_service(service: str) -> str:
    status = get_service_status(target_dir)
    if status.startswith("ERROR:"):
        return status
-    if "Up" not in status:
+    if "Up" not in status or "unhealthy" in status.lower():
        return f"ERROR: no running services\n{status}"

    return f"DEPLOYED: {target_dir.name}\n{status}"