supervisor: auto-cancel pending actions when drift is resolved
When a service becomes healthy (node-agent emits service_healthy → observer updates services.json), any previously queued redeploy/container_restart action is stale. Without cleanup, the queue accumulates old actions that require manual rejection. _cancel_resolved_pending_actions() runs after each reconcile cycle: - Reads all pending/*.json with type=redeploy or container_restart - If the service is now healthy in actual_state, moves action to cancelled/ with reason=drift_resolved_auto - Only pending actions are touched; approved/running are left to the operator Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2f1965733f
commit
fb7828b52b
|
|
@ -224,6 +224,14 @@ class Supervisor:
|
|||
if node_info.get("disk_pressure") == "high":
|
||||
self._generate_disk_cleanup_recommendation(node_name)
|
||||
|
||||
# 4. Cancel pending actions whose drift has been resolved.
|
||||
# When a service becomes healthy again (because node-agent emits
|
||||
# service_healthy and the observer updates services.json), any
|
||||
# previously queued redeploy/container_restart action for that
|
||||
# service is no longer needed. Move it to "cancelled/" so the
|
||||
# operator can see it was auto-resolved rather than silently dropped.
|
||||
self._cancel_resolved_pending_actions()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Recommendation generation
|
||||
# ------------------------------------------------------------------
|
||||
|
|
@ -357,6 +365,63 @@ class Supervisor:
|
|||
except Exception as e:
|
||||
logger.error(f"Failed to save disk cleanup recommendation {action_id}: {e}")
|
||||
|
||||
def _cancel_resolved_pending_actions(self):
|
||||
"""
|
||||
Auto-cancel pending service actions (redeploy / container_restart) whose
|
||||
target service is now healthy in the actual state.
|
||||
|
||||
This keeps the action queue clean: when node-agent starts reporting
|
||||
service_healthy for a container that previously had no world-state entry,
|
||||
the pending 'missing_service' redeploy action that was generated before
|
||||
the first health confirmation should be removed automatically rather than
|
||||
sitting in the queue until an operator manually rejects it.
|
||||
|
||||
Only pending actions are considered — approved/running actions have already
|
||||
been committed to by the operator and must not be cancelled automatically.
|
||||
"""
|
||||
cancelled_dir = ACTIONS_DIR / "cancelled"
|
||||
cancelled_dir.mkdir(parents=True, exist_ok=True)
|
||||
pending_dir = ACTIONS_DIR / "pending"
|
||||
if not pending_dir.exists():
|
||||
return
|
||||
|
||||
for action_file in list(pending_dir.glob("*.json")):
|
||||
try:
|
||||
with open(action_file, "r") as f:
|
||||
action = json.load(f)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to read action {action_file.name}: {e}")
|
||||
continue
|
||||
|
||||
action_type = action.get("type")
|
||||
node = action.get("node")
|
||||
service = action.get("service")
|
||||
|
||||
# Only auto-cancel service-level actions (not disk_cleanup)
|
||||
if action_type not in ("redeploy", "container_restart"):
|
||||
continue
|
||||
if not node or not service:
|
||||
continue
|
||||
|
||||
svc_key = f"{node}/{service}"
|
||||
actual_info = self.actual_state["services"].get(svc_key)
|
||||
if actual_info and actual_info.get("status") == "healthy":
|
||||
# Drift resolved — move to cancelled/
|
||||
dest = cancelled_dir / action_file.name
|
||||
try:
|
||||
action["status"] = "cancelled"
|
||||
action["cancelled_reason"] = "drift_resolved_auto"
|
||||
action["cancelled_at"] = time.time()
|
||||
with open(dest, "w") as f:
|
||||
json.dump(action, f, indent=2)
|
||||
action_file.unlink()
|
||||
logger.info(
|
||||
f"Auto-cancelled {action_file.name}: "
|
||||
f"{svc_key} is now healthy"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to cancel action {action_file.name}: {e}")
|
||||
|
||||
def loop(self, interval=30):
|
||||
logger.info("Starting supervisor loop")
|
||||
while True:
|
||||
|
|
|
|||
Loading…
Reference in a new issue