Improve deploy failure analysis
This commit is contained in:
parent
185a866b51
commit
72290cd461
|
|
@ -78,6 +78,12 @@ SESSION_STATE:
|
||||||
D18: "Git commit created on 2026-04-22: 0abe9cb 'Improve deploy agent safety checks'."
|
D18: "Git commit created on 2026-04-22: 0abe9cb 'Improve deploy agent safety checks'."
|
||||||
D19: "Updated ./deploy_agent.py to use local LLM for one bounded deployment-failure retry: capture service/error/status, request corrected YAML only, replace docker-compose.yml, retry once, then return final error plus last status if still failing."
|
D19: "Updated ./deploy_agent.py to use local LLM for one bounded deployment-failure retry: capture service/error/status, request corrected YAML only, replace docker-compose.yml, retry once, then return final error plus last status if still failing."
|
||||||
D20: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for one-shot LLM-assisted deployment failure recovery."
|
D20: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for one-shot LLM-assisted deployment failure recovery."
|
||||||
|
D21: "Git commit created on 2026-04-22: 185a866 'Add LLM-assisted deploy retry'."
|
||||||
|
D22: "Updated ./deploy_agent.py failure analysis to collect 'docker compose ps -q' container IDs, fetch per-container 'docker logs --tail=50', cap combined logs at 2000 chars, and include logs in the single-retry LLM correction prompt."
|
||||||
|
D23: "Fixed malformed duplicate function header introduced during D22 patch; deploy_agent.py function structure restored."
|
||||||
|
D24: "Updated deploy_agent.py status validation: deployment success now requires status containing 'Up' and not containing 'unhealthy' case-insensitively."
|
||||||
|
D25: "User reiterated file-only output expectation after status-validation request; no code change beyond D24."
|
||||||
|
D26: "User requested git commit on 2026-04-22; commit scope includes ./deploy_agent.py and ./codex_context.yaml for log-analysis and status-validation updates."
|
||||||
todos:
|
todos:
|
||||||
T1: "For all future meaningful changes/decisions, update and overwrite ./codex_context.yaml."
|
T1: "For all future meaningful changes/decisions, update and overwrite ./codex_context.yaml."
|
||||||
T2: "DONE: Commit current changes."
|
T2: "DONE: Commit current changes."
|
||||||
|
|
@ -94,6 +100,9 @@ SESSION_STATE:
|
||||||
T13: "DONE: Add deployment status verification and basic port-80 safety check."
|
T13: "DONE: Add deployment status verification and basic port-80 safety check."
|
||||||
T14: "DONE: Commit deploy agent safety/status updates."
|
T14: "DONE: Commit deploy agent safety/status updates."
|
||||||
T15: "DONE: Add one-shot LLM-assisted deployment failure recovery."
|
T15: "DONE: Add one-shot LLM-assisted deployment failure recovery."
|
||||||
|
T16: "DONE: Commit LLM-assisted deploy retry changes."
|
||||||
|
T17: "DONE: Add bounded container log analysis to deploy failure recovery."
|
||||||
|
T18: "DONE: Tighten deploy status validation against unhealthy containers."
|
||||||
issues:
|
issues:
|
||||||
I1: "Tailscale DNS health warning: configured DNS servers unreachable."
|
I1: "Tailscale DNS health warning: configured DNS servers unreachable."
|
||||||
I2: "Preferred gateway path unavailable: 100.108.208.3:8080 connection failed."
|
I2: "Preferred gateway path unavailable: 100.108.208.3:8080 connection failed."
|
||||||
|
|
|
||||||
|
|
@ -50,13 +50,57 @@ def _is_valid_compose(compose: str) -> bool:
|
||||||
return False
|
return False
|
||||||
return isinstance(parsed, dict) and "services" in parsed
|
return isinstance(parsed, dict) and "services" in parsed
|
||||||
|
|
||||||
|
def _get_compose_logs(path: Path) -> str:
|
||||||
|
try:
|
||||||
|
ids_result = subprocess.run(
|
||||||
|
["docker", "compose", "ps", "-q"],
|
||||||
|
cwd=path,
|
||||||
|
check=False,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
return str(exc)
|
||||||
|
|
||||||
def _fix_compose(service: str, error: str, status: str) -> str:
|
container_ids = [line.strip() for line in ids_result.stdout.splitlines() if line.strip()]
|
||||||
|
if not container_ids:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
total = 0
|
||||||
|
limit = 2000
|
||||||
|
for container_id in container_ids:
|
||||||
|
try:
|
||||||
|
logs_result = subprocess.run(
|
||||||
|
["docker", "logs", container_id, "--tail=50"],
|
||||||
|
check=False,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
chunk = f"{container_id}:\n{exc}\n"
|
||||||
|
else:
|
||||||
|
chunk_body = logs_result.stdout or logs_result.stderr
|
||||||
|
chunk = f"{container_id}:\n{chunk_body}\n"
|
||||||
|
remaining = limit - total
|
||||||
|
if remaining <= 0:
|
||||||
|
break
|
||||||
|
if len(chunk) > remaining:
|
||||||
|
chunk = chunk[:remaining]
|
||||||
|
parts.append(chunk)
|
||||||
|
total += len(chunk)
|
||||||
|
if total >= limit:
|
||||||
|
break
|
||||||
|
return "".join(parts).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_compose(service: str, error: str, status: str, logs: str) -> str:
|
||||||
prompt = (
|
prompt = (
|
||||||
"Deployment failed.\n\n"
|
"Deployment failed.\n\n"
|
||||||
f"Service: {service}\n\n"
|
f"Service: {service}\n\n"
|
||||||
f"Error: {error}\n\n"
|
f"Error: {error}\n\n"
|
||||||
f"Status: {status}\n\n"
|
f"Status: {status}\n\n"
|
||||||
|
f"Logs: {logs}\n\n"
|
||||||
"Fix the docker-compose YAML. Return ONLY corrected YAML."
|
"Fix the docker-compose YAML. Return ONLY corrected YAML."
|
||||||
)
|
)
|
||||||
response = ask(prompt)
|
response = ask(prompt)
|
||||||
|
|
@ -135,7 +179,8 @@ def deploy_service(service: str) -> str:
|
||||||
ok, error = _run_compose_up(target_dir)
|
ok, error = _run_compose_up(target_dir)
|
||||||
if not ok:
|
if not ok:
|
||||||
status = get_service_status(target_dir)
|
status = get_service_status(target_dir)
|
||||||
fixed_compose = _fix_compose(service, error, status)
|
logs = _get_compose_logs(target_dir)
|
||||||
|
fixed_compose = _fix_compose(service, error, status, logs)
|
||||||
if fixed_compose.startswith("ERROR:"):
|
if fixed_compose.startswith("ERROR:"):
|
||||||
if status and not status.startswith("ERROR:"):
|
if status and not status.startswith("ERROR:"):
|
||||||
return f"ERROR: {error}\n{status}"
|
return f"ERROR: {error}\n{status}"
|
||||||
|
|
@ -151,7 +196,7 @@ def deploy_service(service: str) -> str:
|
||||||
status = get_service_status(target_dir)
|
status = get_service_status(target_dir)
|
||||||
if status.startswith("ERROR:"):
|
if status.startswith("ERROR:"):
|
||||||
return status
|
return status
|
||||||
if "Up" not in status:
|
if "Up" not in status or "unhealthy" in status.lower():
|
||||||
return f"ERROR: no running services\n{status}"
|
return f"ERROR: no running services\n{status}"
|
||||||
|
|
||||||
return f"DEPLOYED: {target_dir.name}\n{status}"
|
return f"DEPLOYED: {target_dir.name}\n{status}"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue