diff --git a/agent-node/agent_node/config.py b/agent-node/agent_node/config.py index ba29271..ad9eeba 100644 --- a/agent-node/agent_node/config.py +++ b/agent-node/agent_node/config.py @@ -14,7 +14,7 @@ "auth_token": os.getenv("AGENT_AUTH_TOKEN", "cortex-secret-shared-key"), "sync_root": "/tmp/cortex-sync", "tls": True, - "max_skill_workers": 5, + "max_skill_workers": 10, "health_report_interval": 10, } diff --git a/agent-node/agent_node/node.py b/agent-node/agent_node/node.py index f061dd0..90d08fc 100644 --- a/agent-node/agent_node/node.py +++ b/agent-node/agent_node/node.py @@ -549,6 +549,14 @@ success, reason = self.skills.submit(task, self.sandbox, self._on_finish, self._on_event) if not success: print(f"[!] Execution Rejected: {reason}", flush=True) + self._send_response( + task.task_id, + agent_pb2.TaskResponse( + task_id=task.task_id, + status=agent_pb2.TaskResponse.ERROR, + stderr=f"[NODE] Execution Rejected: {reason}", + ) + ) def _on_event(self, event): """Live Event Tunneler: Routes browser/skill events into the main stream.""" diff --git a/ai-hub/app/core/services/sub_agent.py b/ai-hub/app/core/services/sub_agent.py index 1abbf43..8343e2f 100644 --- a/ai-hub/app/core/services/sub_agent.py +++ b/ai-hub/app/core/services/sub_agent.py @@ -35,12 +35,14 @@ if isinstance(self.result, dict) and self.result.get("error"): err_msg = str(self.result.get("error")).lower() # Only retry on potentially transient network/node issues - if any(x in err_msg for x in ["timeout", "offline", "disconnected"]): + if any(x in err_msg for x in ["timeout", "offline", "disconnected", "capacity", "rejected"]): if attempt < self.retries: - self.status = f"RETRYING ({attempt+1}/{self.retries})" - logger.info(f"[SubAgent] {self.name} retrying due to: {err_msg}") - await asyncio.sleep(2) + backoff = (attempt + 1) * 3 # Incremental backoff: 3s, 6s + self.status = f"RETRYING ({attempt+1}/{self.retries}) - {err_msg}" + logger.info(f"[SubAgent] {self.name} retrying due to: {err_msg}. Backoff={backoff}s") + await asyncio.sleep(backoff) continue + self.status = "COMPLETED" break