diff --git a/agent-node/src/agent_node/utils/service_manager.py b/agent-node/src/agent_node/utils/service_manager.py index 457337c..45aa88a 100644 --- a/agent-node/src/agent_node/utils/service_manager.py +++ b/agent-node/src/agent_node/utils/service_manager.py @@ -165,10 +165,29 @@ TASK_NAME = "CortexAgent" def install(self, python_path, script_path, working_dir) -> bool: + # Create an infinite loop wrapper to ensure resilient restarts on Windows + loop_bat_path = os.path.join(working_dir, "run_loop.bat") + log_path = os.path.join(working_dir, "agent.log") + loop_content = f"""@echo off +:loop +echo [%date% %time%] Starting Cortex Agent... >> "{log_path}" +"{python_path}" -u "{script_path}" >> "{log_path}" 2>&1 +echo [%date% %time%] Agent process exited. Restarting in 5 seconds... >> "{log_path}" +timeout /t 5 /nobreak >nul +goto loop +""" + try: + with open(loop_bat_path, "w", encoding="utf-8") as f: + f.write(loop_content) + except Exception as e: + print(f"Failed to write run_loop.bat: {e}") + return False + ps_cmd = f""" - $Action = New-ScheduledTaskAction -Execute '{python_path}' -Argument '"{script_path}"' -WorkingDirectory '{working_dir}' + $Action = New-ScheduledTaskAction -Execute "cmd.exe" -Argument "/c `"{loop_bat_path}`"" -WorkingDirectory '{working_dir}' $Trigger = New-ScheduledTaskTrigger -AtStartup - Register-ScheduledTask -TaskName '{self.TASK_NAME}' -Action $Action -Trigger $Trigger -User "SYSTEM" -RunLevel Highest -Force + $Settings = New-ScheduledTaskSettingsSet -AllowStartIfOnBatteries -DontStopIfGoingOnBatteries -StartWhenAvailable + Register-ScheduledTask -TaskName '{self.TASK_NAME}' -Action $Action -Trigger $Trigger -Settings $Settings -User "SYSTEM" -RunLevel Highest -Force """ try: subprocess.run(["powershell", "-Command", ps_cmd], check=True, capture_output=True) diff --git a/mesh-sdk/mesh_core/engines/node.py b/mesh-sdk/mesh_core/engines/node.py index b4e5c4f..36b4a7e 100644 --- a/mesh-sdk/mesh_core/engines/node.py +++ b/mesh-sdk/mesh_core/engines/node.py @@ -33,9 +33,13 @@ logger.info(f"[MeshCore] Starting Node {self.node_id}...") # 1. Perform Handshake - if not self.transport.handshake(): + policy = self.transport.handshake() + if not policy: logger.error(f"[MeshCore] Handshake failed. Node {self.node_id} cannot start.") return False + + if self.on_policy and hasattr(policy, "mode"): + self.on_policy(policy) # 2. Connect TaskStream self.transport.connect() diff --git a/mesh-sdk/mesh_core/transport/grpc.py b/mesh-sdk/mesh_core/transport/grpc.py index 0f227c6..73f3a36 100644 --- a/mesh-sdk/mesh_core/transport/grpc.py +++ b/mesh-sdk/mesh_core/transport/grpc.py @@ -36,7 +36,7 @@ self.last_activity = 0 self._send_counter = itertools.count() # thread-safe atomic counter; avoids protobuf comparison in heapq - def handshake(self) -> bool: + def handshake(self) -> Any: self._refresh_stub() try: req = agent_pb2.RegistrationRequest( @@ -47,14 +47,14 @@ res = self.stub.SyncConfiguration(req) if res.success: logger.info(f"[Mesh] Handshake successful for {self.node_id}") - # Optional: Handle policy res.policy - return True + return res.policy else: logger.error(f"[Mesh] Handshake REJECTED for {self.node_id}: {res.error_message}") if self.hub_http_url and self.secret_key: logger.info(f"[Mesh] Attempting token self-recovery for {self.node_id}...") if self._try_token_recovery(): - return True + # We need to retry handshake to get the policy + return self.handshake() return False except Exception as e: logger.error(f"[Mesh] Handshake FAILED for {self.node_id}: {e}") diff --git a/reinstall_windows_agent.ps1 b/reinstall_windows_agent.ps1 index 66da338..ab17c70 100644 --- a/reinstall_windows_agent.ps1 +++ b/reinstall_windows_agent.ps1 @@ -46,15 +46,27 @@ } } -Write-Host "[5/6] Re-registering Scheduled Task with auto-restart..." -$action = New-ScheduledTaskAction -Execute "cmd.exe" -Argument "/c `"$pythonPath $mainPath >> $logPath 2>&1`"" +Write-Host "[5/6] Re-registering Scheduled Task with infinite auto-restart wrapper..." + +# Create an infinite loop wrapper to bypass the 3-restart limit of Scheduled Tasks +$loopBat = "$agentRoot\run_loop.bat" +$loopContent = @" +@echo off +:loop +echo [%date% %time%] Starting Cortex Agent... >> "$logPath" +"$pythonPath" -u "$mainPath" >> "$logPath" 2>&1 +echo [%date% %time%] Agent process exited. Restarting in 5 seconds... >> "$logPath" +timeout /t 5 /nobreak >nul +goto loop +"@ +Set-Content -Path $loopBat -Value $loopContent -Encoding ASCII + +$action = New-ScheduledTaskAction -Execute "cmd.exe" -Argument "/c `"$loopBat`"" $principal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -LogonType ServiceAccount -RunLevel Highest $settings = New-ScheduledTaskSettingsSet ` -AllowStartIfOnBatteries ` -DontStopIfGoingOnBatteries ` - -StartWhenAvailable ` - -RestartCount 3 ` - -RestartInterval (New-TimeSpan -Minutes 1) + -StartWhenAvailable Unregister-ScheduledTask -TaskName $taskName -Confirm:$false -ErrorAction SilentlyContinue Register-ScheduledTask -TaskName $taskName -Action $action -Principal $principal -Settings $settings