diff --git a/ai-hub/integration_tests/test_browser_llm.py b/ai-hub/integration_tests/test_browser_llm.py
index 608a36c..56c920b 100644
--- a/ai-hub/integration_tests/test_browser_llm.py
+++ b/ai-hub/integration_tests/test_browser_llm.py
@@ -31,10 +31,10 @@
     session_id = r_sess.json()["id"]
 
-    # Step 2: Ask a question that requires real-time knowledge (weather)
-    # We explicitly ask it to use its search/browser tools to ensure it invokes them.
+    # Step 2: Ask a question that requires the browser to take a snapshot/screenshot
+    # We explicitly ask it to navigate and snapshot to guarantee image output.
     chat_payload = {
-        "prompt": "Use your browser or search tools to find the current weather in Tokyo, Japan. Briefly summarize it in 1 sentence.",
+        "prompt": "Use your browser_automation_agent tool to navigate to https://example.com, take a screenshot of the page, and tell me the heading you see on the page.",
         "provider_name": "gemini"
     }
@@ -57,8 +57,8 @@
             if event_type == "content":
                 full_response += event.get("content", "")
-            elif event_type == "status":
-                if "Dispatching" in event.get("content", ""):
+            elif event_type == "tool_start":
+                if event.get("name") == "browser_automation_agent":
                     tool_invoked = True
             elif event_type == "error":
                 pytest.fail(f"LLM backend emitted an error: {event.get('content')}")
@@ -67,7 +67,44 @@
         except json.JSONDecodeError:
             pass
 
-    full_response = full_response.strip()
+    full_response = full_response.strip().lower()
     assert len(full_response) > 0, "LLM returned an entirely silent response."
-    assert tool_invoked, "The LLM didn't attempt to invoke any web/browser tools as instructed."
-    # If it invoked the tool and formulated a response, the end-to-end framework succeeded!
+    assert tool_invoked, "The LLM didn't attempt to invoke the browser tool as instructed."
+
+    # The prompt asked for example.com heading ("Example Domain").
+    assert "example domain" in full_response, f"LLM did not identify the correct heading.\nResponse: {full_response}"
+
+    # Step 3: Verify the browser agent physically saved the screenshot and metadata to the file sync system.
+    # Since the session_id is either used directly or via sync_workspace_id wrapper, we search the mirrors directory.
+    import subprocess
+    cmd = [
+        "docker", "exec", "ai_hub_service", "bash", "-c",
+        "find /app/data/mirrors/ -name '.browser_data' -type d"
+    ]
+    res = subprocess.run(cmd, capture_output=True, text=True)
+    assert res.returncode == 0, f"Failed to search for .browser_data inside Docker: {res.stderr}"
+
+    # We expect at least one .browser_data directory corresponding to our session
+    found_dirs = res.stdout.strip().split('\n')
+    # match session id or sync_workspace_id (e.g. 'session-21-' or '21')
+    matched_dir = next((d for d in found_dirs if str(session_id) in d), None)
+    assert matched_dir, f"Could not find .browser_data directory for session {session_id} in mirrors. Found: {res.stdout}"
+
+    # Now verify there's a screenshot inside it
+    verify_cmd = [
+        "docker", "exec", "ai_hub_service", "bash", "-c",
+        f"find {matched_dir} -name '*.png' -type f | wc -l"
+    ]
+    res_png = subprocess.run(verify_cmd, capture_output=True, text=True)
+    png_count = int(res_png.stdout.strip() or "0")
+    assert png_count > 0, "No screenshot .png files were saved within the browser agent's output folder."
+
+    # Verify metadata / A11y summary saved
+    verify_meta_cmd = [
+        "docker", "exec", "ai_hub_service", "bash", "-c",
+        f"find {matched_dir}/.metadata -name '*.txt' -o -name '*.json' | wc -l"
+    ]
+    res_meta = subprocess.run(verify_meta_cmd, capture_output=True, text=True)
+    meta_count = int(res_meta.stdout.strip() or "0")
+    assert meta_count > 0, "No extraction metadata or A11y text files found in the .browser_data/.metadata folder."