Improve deploy failure analysis

This commit is contained in:
Oskar Kapala 2026-04-22 22:05:15 +02:00
parent 185a866b51
commit 72290cd461
2 changed files with 57 additions and 3 deletions

View file

@ -78,6 +78,12 @@ SESSION_STATE:
D18: "Git commit created on 2026-04-22: 0abe9cb 'Improve deploy agent safety checks'."
D19: "Updated ./deploy_agent.py to use local LLM for one bounded deployment-failure retry: capture service/error/status, request corrected YAML only, replace docker-compose.yml, retry once, then return final error plus last status if still failing."
D20: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for one-shot LLM-assisted deployment failure recovery."
D21: "Git commit created on 2026-04-22: 185a866 'Add LLM-assisted deploy retry'."
D22: "Updated ./deploy_agent.py failure analysis to collect 'docker compose ps -q' container IDs, fetch per-container 'docker logs --tail=50', cap combined logs at 2000 chars, and include logs in the single-retry LLM correction prompt."
D23: "Fixed malformed duplicate function header introduced during D22 patch; deploy_agent.py function structure restored."
D24: "Updated deploy_agent.py status validation: deployment success now requires status containing 'Up' and not containing 'unhealthy' case-insensitively."
D25: "User reiterated file-only output expectation after status-validation request; no code change beyond D24."
D26: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for log-analysis and status-validation updates."
todos:
T1: "For all future meaningful changes/decisions, update and overwrite ./codex_context.yaml."
T2: "DONE: Commit current changes."
@ -94,6 +100,9 @@ SESSION_STATE:
T13: "DONE: Add deployment status verification and basic port-80 safety check."
T14: "DONE: Commit deploy agent safety/status updates."
T15: "DONE: Add one-shot LLM-assisted deployment failure recovery."
T16: "DONE: Commit LLM-assisted deploy retry changes."
T17: "DONE: Add bounded container log analysis to deploy failure recovery."
T18: "DONE: Tighten deploy status validation against unhealthy containers."
issues:
I1: "Tailscale DNS health warning: configured DNS servers unreachable."
I2: "Preferred gateway path unavailable: 100.108.208.3:8080 connection failed."

View file

@ -50,13 +50,57 @@ def _is_valid_compose(compose: str) -> bool:
return False
return isinstance(parsed, dict) and "services" in parsed
def _get_compose_logs(path: Path) -> str:
try:
ids_result = subprocess.run(
["docker", "compose", "ps", "-q"],
cwd=path,
check=False,
capture_output=True,
text=True,
)
except Exception as exc:
return str(exc)
def _fix_compose(service: str, error: str, status: str) -> str:
container_ids = [line.strip() for line in ids_result.stdout.splitlines() if line.strip()]
if not container_ids:
return ""
parts = []
total = 0
limit = 2000
for container_id in container_ids:
try:
logs_result = subprocess.run(
["docker", "logs", container_id, "--tail=50"],
check=False,
capture_output=True,
text=True,
)
except Exception as exc:
chunk = f"{container_id}:\n{exc}\n"
else:
chunk_body = logs_result.stdout or logs_result.stderr
chunk = f"{container_id}:\n{chunk_body}\n"
remaining = limit - total
if remaining <= 0:
break
if len(chunk) > remaining:
chunk = chunk[:remaining]
parts.append(chunk)
total += len(chunk)
if total >= limit:
break
return "".join(parts).strip()
def _fix_compose(service: str, error: str, status: str, logs: str) -> str:
prompt = (
"Deployment failed.\n\n"
f"Service: {service}\n\n"
f"Error: {error}\n\n"
f"Status: {status}\n\n"
f"Logs: {logs}\n\n"
"Fix the docker-compose YAML. Return ONLY corrected YAML."
)
response = ask(prompt)
@ -135,7 +179,8 @@ def deploy_service(service: str) -> str:
ok, error = _run_compose_up(target_dir)
if not ok:
status = get_service_status(target_dir)
fixed_compose = _fix_compose(service, error, status)
logs = _get_compose_logs(target_dir)
fixed_compose = _fix_compose(service, error, status, logs)
if fixed_compose.startswith("ERROR:"):
if status and not status.startswith("ERROR:"):
return f"ERROR: {error}\n{status}"
@ -151,7 +196,7 @@ def deploy_service(service: str) -> str:
status = get_service_status(target_dir)
if status.startswith("ERROR:"):
return status
if "Up" not in status:
if "Up" not in status or "unhealthy" in status.lower():
return f"ERROR: no running services\n{status}"
return f"DEPLOYED: {target_dir.name}\n{status}"