diff --git a/agent-node/src/agent_node/node.py b/agent-node/src/agent_node/node.py
index dbf8cd7..ddada51 100644
--- a/agent-node/src/agent_node/node.py
+++ b/agent-node/src/agent_node/node.py
@@ -742,6 +742,18 @@
 
         if not os.path.exists(abs_path):
             print(f" [📁❓] Requested file {rel_path} not found on node")
+            if task_id:
+                # Immediately notify the Hub so it doesn't wait the full timeout
+                self.task_queue.put(agent_pb2.ClientTaskMessage(
+                    file_sync=agent_pb2.FileSyncMessage(
+                        session_id=session_id,
+                        task_id=task_id,
+                        status=agent_pb2.SyncStatus(
+                            code=agent_pb2.SyncStatus.ERROR,
+                            message=f"File not found: {rel_path}"
+                        )
+                    )
+                ))
             return
 
         # Optimization: 4MB Incremental Hashing + Zero Throttling
diff --git a/ai-hub/app/core/grpc/services/assistant.py b/ai-hub/app/core/grpc/services/assistant.py
index 9a6f94d..5c9e8f9 100644
--- a/ai-hub/app/core/grpc/services/assistant.py
+++ b/ai-hub/app/core/grpc/services/assistant.py
@@ -365,6 +365,11 @@
                     return {"content": content, "path": path}
                 except Exception as e:
                     logger.error(f"[📁📄] Local cat error for {session_id}/{path}: {e}")
+            else:
+                # File is not in the mirror — it was either never written or already deleted.
+                # Don't round-trip to the node: return instantly so verify-after-delete
+                # doesn't stall for the full 15s journal timeout.
+                return {"error": "File not found"}
         node = self.registry.get_node(node_id)
         if not node:
             return {"error": "Offline"}
diff --git a/ai-hub/integration_tests/test_file_sync.py b/ai-hub/integration_tests/test_file_sync.py
index 4d8c3a1..077e959 100644
--- a/ai-hub/integration_tests/test_file_sync.py
+++ b/ai-hub/integration_tests/test_file_sync.py
@@ -378,6 +378,70 @@
         )
         print(f"[Case 4] ✅ {NODE_1} no longer has the file.")
 
+    # ── Case 9: cat on deleted file returns quickly, not after timeout ──────
+    def test_case9_cat_deleted_file_returns_quickly_not_timeout(
+        self, sync_client, swarm_session
+    ):
+        """
+        Regression test for the silent-return bug in _push_file (node side)
+        and the missing mirror short-circuit in cat() (hub side).
+
+        Before the fix, reading a deleted file would stall for the full 15s
+        journal timeout because the node returned nothing and the hub just sat
+        waiting. After the fix:
+          - hub: cat() checks the mirror first; file absent → instant "File not found"
+          - node: _push_file sends an ERROR SyncStatus immediately when file missing
+
+        This test enforces that a cat call on a deleted file resolves in under
+        MAX_LATENCY_S seconds on BOTH nodes.
+        """
+        MAX_LATENCY_S = 3.0  # well below the 15s journal timeout
+        filename = _unique("case9_latency")
+        content = f"Case 9 — delete latency probe — {uuid.uuid4()}"
+        workspace = swarm_session
+
+        # Setup: write the file and wait for full propagation
+        r = _touch(sync_client, NODE_1, filename, content, workspace)
+        assert r.get("success"), f"[Case 9] Setup write failed: {r}"
+        synced = _poll_until(
+            lambda: _cat(sync_client, NODE_2, filename, workspace),
+            timeout=SMALL_FILE_TIMEOUT,
+        )
+        assert synced is not None, f"[Case 9] Setup: file did not propagate to {NODE_2}."
+
+        # Delete from server
+        print(f"\n[Case 9] Deleting {filename!r}, then timing cat() on both nodes")
+        _rm(sync_client, NODE_1, filename, workspace)
+
+        # Give delete broadcast a moment to reach nodes (but not the full poll timeout)
+        time.sleep(1.5)
+
+        # Measure cat latency on node-1 (hub mirror path — should be instant)
+        t0 = time.monotonic()  # monotonic: immune to wall-clock (NTP/manual) adjustments
+        res1 = _cat(sync_client, NODE_1, filename, workspace)
+        latency_node1 = time.monotonic() - t0
+        assert res1 is None, (
+            f"[Case 9] {NODE_1} still returned content after delete: {res1!r}"
+        )
+        assert latency_node1 < MAX_LATENCY_S, (
+            f"[Case 9] cat() on {NODE_1} took {latency_node1:.1f}s — expected < {MAX_LATENCY_S}s. "
+            f"Hub mirror short-circuit may be broken."
+        )
+        print(f"[Case 9] ✅ {NODE_1} cat returned in {latency_node1:.2f}s (file absent, fast-fail).")
+
+        # Measure cat latency on node-2 (hub mirror path — should also be instant)
+        t0 = time.monotonic()  # monotonic: immune to wall-clock (NTP/manual) adjustments
+        res2 = _cat(sync_client, NODE_2, filename, workspace)
+        latency_node2 = time.monotonic() - t0
+        assert res2 is None, (
+            f"[Case 9] {NODE_2} still returned content after delete: {res2!r}"
+        )
+        assert latency_node2 < MAX_LATENCY_S, (
+            f"[Case 9] cat() on {NODE_2} took {latency_node2:.1f}s — expected < {MAX_LATENCY_S}s. "
+            f"Node _push_file may not be sending error status on missing file."
+        )
+        print(f"[Case 9] ✅ {NODE_2} cat returned in {latency_node2:.2f}s (file absent, fast-fail).")
+
 
 # ══════════════════════════════════════════════════════════════════════════════
 # LARGE FILE TESTS (20 MB, multi-chunk)