From 72290cd4610816250e447f32ab92efe86e770d08 Mon Sep 17 00:00:00 2001 From: Oskar Kapala Date: Wed, 22 Apr 2026 22:05:15 +0200 Subject: [PATCH] Improve deploy failure analysis --- codex_context.yaml | 9 ++++++++ deploy_agent.py | 51 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/codex_context.yaml b/codex_context.yaml index c4a0a09..a4ba37d 100644 --- a/codex_context.yaml +++ b/codex_context.yaml @@ -78,6 +78,12 @@ SESSION_STATE: D18: "Git commit created on 2026-04-22: 0abe9cb 'Improve deploy agent safety checks'." D19: "Updated ./deploy_agent.py to use local LLM for one bounded deployment-failure retry: capture service/error/status, request corrected YAML only, replace docker-compose.yml, retry once, then return final error plus last status if still failing." D20: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for one-shot LLM-assisted deployment failure recovery." + D21: "Git commit created on 2026-04-22: 185a866 'Add LLM-assisted deploy retry'." + D22: "Updated ./deploy_agent.py failure analysis to collect 'docker compose ps -q' container IDs, fetch per-container 'docker logs --tail=50', cap combined logs at 2000 chars, and include logs in the single-retry LLM correction prompt." + D23: "Fixed malformed duplicate function header introduced during D22 patch; deploy_agent.py function structure restored." + D24: "Updated deploy_agent.py status validation: deployment success now requires status containing 'Up' and not containing 'unhealthy' case-insensitively." + D25: "User reiterated file-only output expectation after status-validation request; no code change beyond D24." + D26: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for log-analysis and status-validation updates." todos: T1: "For all future meaningful changes/decisions, update and overwrite ./codex_context.yaml." T2: "DONE: Commit current changes." @@ -94,6 +100,9 @@ SESSION_STATE: T13: "DONE: Add deployment status verification and basic port-80 safety check." T14: "DONE: Commit deploy agent safety/status updates." T15: "DONE: Add one-shot LLM-assisted deployment failure recovery." + T16: "DONE: Commit LLM-assisted deploy retry changes." + T17: "DONE: Add bounded container log analysis to deploy failure recovery." + T18: "DONE: Tighten deploy status validation against unhealthy containers." issues: I1: "Tailscale DNS health warning: configured DNS servers unreachable." I2: "Preferred gateway path unavailable: 100.108.208.3:8080 connection failed." diff --git a/deploy_agent.py b/deploy_agent.py index 02f7001..17eb28f 100644 --- a/deploy_agent.py +++ b/deploy_agent.py @@ -50,13 +50,57 @@ def _is_valid_compose(compose: str) -> bool: return False return isinstance(parsed, dict) and "services" in parsed +def _get_compose_logs(path: Path) -> str: + try: + ids_result = subprocess.run( + ["docker", "compose", "ps", "-q"], + cwd=path, + check=False, + capture_output=True, + text=True, + ) + except Exception as exc: + return str(exc) -def _fix_compose(service: str, error: str, status: str) -> str: + container_ids = [line.strip() for line in ids_result.stdout.splitlines() if line.strip()] + if not container_ids: + return "" + + parts = [] + total = 0 + limit = 2000 + for container_id in container_ids: + try: + logs_result = subprocess.run( + ["docker", "logs", container_id, "--tail=50"], + check=False, + capture_output=True, + text=True, + ) + except Exception as exc: + chunk = f"{container_id}:\n{exc}\n" + else: + chunk_body = logs_result.stdout or logs_result.stderr + chunk = f"{container_id}:\n{chunk_body}\n" + remaining = limit - total + if remaining <= 0: + break + if len(chunk) > remaining: + chunk = chunk[:remaining] + parts.append(chunk) + total += len(chunk) + if total >= limit: + break + return "".join(parts).strip() + + +def _fix_compose(service: str, error: str, status: str, logs: str) -> str: prompt = ( "Deployment failed.\n\n" f"Service: {service}\n\n" f"Error: {error}\n\n" f"Status: {status}\n\n" + f"Logs: {logs}\n\n" "Fix the docker-compose YAML. Return ONLY corrected YAML." ) response = ask(prompt) @@ -135,7 +179,8 @@ def deploy_service(service: str) -> str: ok, error = _run_compose_up(target_dir) if not ok: status = get_service_status(target_dir) - fixed_compose = _fix_compose(service, error, status) + logs = _get_compose_logs(target_dir) + fixed_compose = _fix_compose(service, error, status, logs) if fixed_compose.startswith("ERROR:"): if status and not status.startswith("ERROR:"): return f"ERROR: {error}\n{status}" @@ -151,7 +196,7 @@ def deploy_service(service: str) -> str: status = get_service_status(target_dir) if status.startswith("ERROR:"): return status - if "Up" not in status: + if "Up" not in status or "unhealthy" in status.lower(): return f"ERROR: no running services\n{status}" return f"DEPLOYED: {target_dir.name}\n{status}"