Improve deploy failure analysis
This commit is contained in:
parent
185a866b51
commit
72290cd461
|
|
@ -78,6 +78,12 @@ SESSION_STATE:
|
|||
D18: "Git commit created on 2026-04-22: 0abe9cb 'Improve deploy agent safety checks'."
|
||||
D19: "Updated ./deploy_agent.py to use local LLM for one bounded deployment-failure retry: capture service/error/status, request corrected YAML only, replace docker-compose.yml, retry once, then return final error plus last status if still failing."
|
||||
D20: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for one-shot LLM-assisted deployment failure recovery."
|
||||
D21: "Git commit created on 2026-04-22: 185a866 'Add LLM-assisted deploy retry'."
|
||||
D22: "Updated ./deploy_agent.py failure analysis to collect 'docker compose ps -q' container IDs, fetch per-container 'docker logs --tail=50', cap combined logs at 2000 chars, and include logs in the single-retry LLM correction prompt."
|
||||
D23: "Fixed malformed duplicate function header introduced during D22 patch; deploy_agent.py function structure restored."
|
||||
D24: "Updated deploy_agent.py status validation: deployment success now requires status containing 'Up' and not containing 'unhealthy' case-insensitively."
|
||||
D25: "User reiterated file-only output expectation after status-validation request; no code change beyond D24."
|
||||
D26: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for log-analysis and status-validation updates."
|
||||
todos:
|
||||
T1: "For all future meaningful changes/decisions, update and overwrite ./codex_context.yaml."
|
||||
T2: "DONE: Commit current changes."
|
||||
|
|
@ -94,6 +100,9 @@ SESSION_STATE:
|
|||
T13: "DONE: Add deployment status verification and basic port-80 safety check."
|
||||
T14: "DONE: Commit deploy agent safety/status updates."
|
||||
T15: "DONE: Add one-shot LLM-assisted deployment failure recovery."
|
||||
T16: "DONE: Commit LLM-assisted deploy retry changes."
|
||||
T17: "DONE: Add bounded container log analysis to deploy failure recovery."
|
||||
T18: "DONE: Tighten deploy status validation against unhealthy containers."
|
||||
issues:
|
||||
I1: "Tailscale DNS health warning: configured DNS servers unreachable."
|
||||
I2: "Preferred gateway path unavailable: 100.108.208.3:8080 connection failed."
|
||||
|
|
|
|||
|
|
@ -50,13 +50,57 @@ def _is_valid_compose(compose: str) -> bool:
|
|||
return False
|
||||
return isinstance(parsed, dict) and "services" in parsed
|
||||
|
||||
def _get_compose_logs(path: Path) -> str:
|
||||
try:
|
||||
ids_result = subprocess.run(
|
||||
["docker", "compose", "ps", "-q"],
|
||||
cwd=path,
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
except Exception as exc:
|
||||
return str(exc)
|
||||
|
||||
def _fix_compose(service: str, error: str, status: str) -> str:
|
||||
container_ids = [line.strip() for line in ids_result.stdout.splitlines() if line.strip()]
|
||||
if not container_ids:
|
||||
return ""
|
||||
|
||||
parts = []
|
||||
total = 0
|
||||
limit = 2000
|
||||
for container_id in container_ids:
|
||||
try:
|
||||
logs_result = subprocess.run(
|
||||
["docker", "logs", container_id, "--tail=50"],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
except Exception as exc:
|
||||
chunk = f"{container_id}:\n{exc}\n"
|
||||
else:
|
||||
chunk_body = logs_result.stdout or logs_result.stderr
|
||||
chunk = f"{container_id}:\n{chunk_body}\n"
|
||||
remaining = limit - total
|
||||
if remaining <= 0:
|
||||
break
|
||||
if len(chunk) > remaining:
|
||||
chunk = chunk[:remaining]
|
||||
parts.append(chunk)
|
||||
total += len(chunk)
|
||||
if total >= limit:
|
||||
break
|
||||
return "".join(parts).strip()
|
||||
|
||||
|
||||
def _fix_compose(service: str, error: str, status: str, logs: str) -> str:
|
||||
prompt = (
|
||||
"Deployment failed.\n\n"
|
||||
f"Service: {service}\n\n"
|
||||
f"Error: {error}\n\n"
|
||||
f"Status: {status}\n\n"
|
||||
f"Logs: {logs}\n\n"
|
||||
"Fix the docker-compose YAML. Return ONLY corrected YAML."
|
||||
)
|
||||
response = ask(prompt)
|
||||
|
|
@ -135,7 +179,8 @@ def deploy_service(service: str) -> str:
|
|||
ok, error = _run_compose_up(target_dir)
|
||||
if not ok:
|
||||
status = get_service_status(target_dir)
|
||||
fixed_compose = _fix_compose(service, error, status)
|
||||
logs = _get_compose_logs(target_dir)
|
||||
fixed_compose = _fix_compose(service, error, status, logs)
|
||||
if fixed_compose.startswith("ERROR:"):
|
||||
if status and not status.startswith("ERROR:"):
|
||||
return f"ERROR: {error}\n{status}"
|
||||
|
|
@ -151,7 +196,7 @@ def deploy_service(service: str) -> str:
|
|||
status = get_service_status(target_dir)
|
||||
if status.startswith("ERROR:"):
|
||||
return status
|
||||
if "Up" not in status:
|
||||
if "Up" not in status or "unhealthy" in status.lower():
|
||||
return f"ERROR: no running services\n{status}"
|
||||
|
||||
return f"DEPLOYED: {target_dir.name}\n{status}"
|
||||
|
|
|
|||
Loading…
Reference in a new issue