supervisor: auto-cancel pending actions when drift is resolved
When a service becomes healthy (node-agent emits service_healthy → observer updates services.json), any previously queued redeploy/container_restart action is stale. Without cleanup, the queue accumulates old actions that require manual rejection. _cancel_resolved_pending_actions() runs after each reconcile cycle: - Reads all pending/*.json with type=redeploy or container_restart - If the service is now healthy in actual_state, moves action to cancelled/ with reason=drift_resolved_auto - Only pending actions are touched; approved/running are left to the operator Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2f1965733f
commit
fb7828b52b
|
|
@ -224,6 +224,14 @@ class Supervisor:
|
||||||
if node_info.get("disk_pressure") == "high":
|
if node_info.get("disk_pressure") == "high":
|
||||||
self._generate_disk_cleanup_recommendation(node_name)
|
self._generate_disk_cleanup_recommendation(node_name)
|
||||||
|
|
||||||
|
# 4. Cancel pending actions whose drift has been resolved.
|
||||||
|
# When a service becomes healthy again (because node-agent emits
|
||||||
|
# service_healthy and the observer updates services.json), any
|
||||||
|
# previously queued redeploy/container_restart action for that
|
||||||
|
# service is no longer needed. Move it to "cancelled/" so the
|
||||||
|
# operator can see it was auto-resolved rather than silently dropped.
|
||||||
|
self._cancel_resolved_pending_actions()
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Recommendation generation
|
# Recommendation generation
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
@ -357,6 +365,63 @@ class Supervisor:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to save disk cleanup recommendation {action_id}: {e}")
|
logger.error(f"Failed to save disk cleanup recommendation {action_id}: {e}")
|
||||||
|
|
||||||
|
def _cancel_resolved_pending_actions(self):
|
||||||
|
"""
|
||||||
|
Auto-cancel pending service actions (redeploy / container_restart) whose
|
||||||
|
target service is now healthy in the actual state.
|
||||||
|
|
||||||
|
This keeps the action queue clean: when node-agent starts reporting
|
||||||
|
service_healthy for a container that previously had no world-state entry,
|
||||||
|
the pending 'missing_service' redeploy action that was generated before
|
||||||
|
the first health confirmation should be removed automatically rather than
|
||||||
|
sitting in the queue until an operator manually rejects it.
|
||||||
|
|
||||||
|
Only pending actions are considered — approved/running actions have already
|
||||||
|
been committed to by the operator and must not be cancelled automatically.
|
||||||
|
"""
|
||||||
|
cancelled_dir = ACTIONS_DIR / "cancelled"
|
||||||
|
cancelled_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
pending_dir = ACTIONS_DIR / "pending"
|
||||||
|
if not pending_dir.exists():
|
||||||
|
return
|
||||||
|
|
||||||
|
for action_file in list(pending_dir.glob("*.json")):
|
||||||
|
try:
|
||||||
|
with open(action_file, "r") as f:
|
||||||
|
action = json.load(f)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to read action {action_file.name}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
action_type = action.get("type")
|
||||||
|
node = action.get("node")
|
||||||
|
service = action.get("service")
|
||||||
|
|
||||||
|
# Only auto-cancel service-level actions (not disk_cleanup)
|
||||||
|
if action_type not in ("redeploy", "container_restart"):
|
||||||
|
continue
|
||||||
|
if not node or not service:
|
||||||
|
continue
|
||||||
|
|
||||||
|
svc_key = f"{node}/{service}"
|
||||||
|
actual_info = self.actual_state["services"].get(svc_key)
|
||||||
|
if actual_info and actual_info.get("status") == "healthy":
|
||||||
|
# Drift resolved — move to cancelled/
|
||||||
|
dest = cancelled_dir / action_file.name
|
||||||
|
try:
|
||||||
|
action["status"] = "cancelled"
|
||||||
|
action["cancelled_reason"] = "drift_resolved_auto"
|
||||||
|
action["cancelled_at"] = time.time()
|
||||||
|
with open(dest, "w") as f:
|
||||||
|
json.dump(action, f, indent=2)
|
||||||
|
action_file.unlink()
|
||||||
|
logger.info(
|
||||||
|
f"Auto-cancelled {action_file.name}: "
|
||||||
|
f"{svc_key} is now healthy"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to cancel action {action_file.name}: {e}")
|
||||||
|
|
||||||
def loop(self, interval=30):
|
def loop(self, interval=30):
|
||||||
logger.info("Starting supervisor loop")
|
logger.info("Starting supervisor loop")
|
||||||
while True:
|
while True:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue