diff --git a/ai-hub/app/core/grpc/services/grpc_server.py b/ai-hub/app/core/grpc/services/grpc_server.py index 0599063..a61476f 100644 --- a/ai-hub/app/core/grpc/services/grpc_server.py +++ b/ai-hub/app/core/grpc/services/grpc_server.py @@ -259,9 +259,23 @@ if tr.HasField("browser_result"): br = tr.browser_result res_obj["browser"] = { - "url": br.url, "title": br.title, "has_snapshot": len(br.snapshot) > 0, - "eval": br.eval_result + "url": br.url, + "title": br.title, + "has_snapshot": len(br.snapshot) > 0, + "eval": br.eval_result, + # OpenClaw-inspired: include the full aria role tree for AI reasoning + "aria_snapshot": br.a11y_tree if br.a11y_tree else None, } + # Flatten key fields to stdout for easy AI consumption + if br.a11y_tree: + res_obj["stdout"] = ( + f"[Browser] URL: {br.url}\n" + f"[Browser] Title: {br.title}\n" + f"[Browser] Page Snapshot ({br.eval_result}):\n\n" + f"{br.a11y_tree}" + ) + elif br.eval_result: + res_obj["stdout"] = f"[Browser] {br.url} | {br.title}\nResult: {br.eval_result}" self.journal.fulfill(tr.task_id, res_obj) # M6: Emit to EventBus for UI streaming diff --git a/ai-hub/app/core/services/tool.py b/ai-hub/app/core/services/tool.py index a2672d0..d2de517 100644 --- a/ai-hub/app/core/services/tool.py +++ b/ai-hub/app/core/services/tool.py @@ -199,13 +199,18 @@ # ... existing logic ... 
from app.protos import agent_pb2 action_str = args.get("action", "navigate").upper() + # .upper() above already maps get_a11y -> GET_A11Y; also tolerate padding and dashes (e.g. "get-a11y") + action_str = action_str.strip().replace("-", "_") action_type = getattr(agent_pb2.BrowserAction, action_str, agent_pb2.BrowserAction.NAVIGATE) - session_id = args.get("session_id") browser_action = agent_pb2.BrowserAction( action=action_type, url=args.get("url", ""), - session_id=resolved_sid or "" + selector=args.get("selector", ""), + text=args.get("text", ""), + x=int(args.get("x") or 0), + y=int(args.get("y") or 0), + session_id=resolved_sid or "", ) task_fn = assistant.dispatch_browser task_args = {"node_id": node_id, "action": browser_action, "session_id": resolved_sid} diff --git a/skills/browser-automation-agent/SKILL.md b/skills/browser-automation-agent/SKILL.md index 8f18b86..bcb8f96 100644 --- a/skills/browser-automation-agent/SKILL.md +++ b/skills/browser-automation-agent/SKILL.md @@ -1,8 +1,10 @@ --- name: browser_automation_agent emoji: "🌐" -description: Perform web browsing, form filling, and UI testing on remote agent nodes - using Playwright. +description: > + Perform web browsing, data extraction, form filling, and UI automation on remote + agent nodes using Playwright. Supports persistent browser sessions with stateful + element refs (e1, e2, ...) for reliable multi-step interaction. skill_type: remote_grpc is_enabled: true features: @@ -13,14 +15,12 @@ method: Navigate capabilities: - browser - - screenshot - - click parameters: type: object properties: url: type: string - description: The URL to navigate to. + description: The URL to navigate to (required for 'navigate' action). action: type: string enum: @@ -28,15 +28,44 @@ - click - type - screenshot - description: The browser action to perform. + - get_dom + - hover + - scroll + - eval + - get_a11y + - close + description: | + The browser action to perform: + - navigate: Go to a URL. 
Auto-returns an aria snapshot for immediate context. + - get_a11y: Get a semantic role tree of the page with [ref=eN] labels. Use this to understand the page and get selectors for interactive elements. + - click: Click a selector or ref (e.g. 'e3'). + - type: Type text into a selector or ref. + - screenshot: Capture a PNG screenshot. + - eval: Execute JavaScript on the page and return the result. + - get_dom: Get the full HTML source. + - scroll: Scroll vertically by 'y' pixels. + - hover: Hover over a selector or ref. + - close: Close the browser session. + selector: + type: string + description: > + CSS/XPath selector OR a ref from the last snapshot (e.g. 'e3'). + Refs are more reliable than CSS selectors — always prefer refs after get_a11y. + text: + type: string + description: Text to type (for 'type' action) or JavaScript to execute (for 'eval' action). + y: + type: integer + description: Pixels to scroll vertically (for 'scroll' action, default 400). node_id: type: string description: The target node ID. session_id: type: string - description: Optional session ID to persist browser state (cookies, login). + description: > + Session ID for persistent browser state. Use a consistent ID across multiple + actions to maintain cookies, login state, and element refs. required: - - url - action - node_id is_system: true @@ -44,4 +73,33 @@ # Browser Automation Agent -You are an AI browsing assistant. Use the Playwright tool to navigate pages, extract information, and interact with web elements. Always provide reasoning for your actions. +You are an AI browsing and data extraction assistant using Playwright on a remote agent node. + +## Recommended Workflow (ALWAYS follow this pattern) + +### Step 1: Navigate +Use `navigate` to go to a URL. This automatically returns an accessibility snapshot. + +### Step 2: Understand the page with `get_a11y` +Run `get_a11y` to get a **semantic role tree** of the page. 
Each interactive element, and each named content element, gets a `[ref=eN]` label (valid until the next `navigate` or `get_a11y` replaces the ref map): +``` +- heading "Top Stories" [ref=e1] +- link "OpenAI releases new model" [ref=e2] +- searchbox "Search" [ref=e3] +- button "Submit" [ref=e4] +``` + +### Step 3: Interact using refs +Use the refs directly as a `selector` value for `click`, `type`, or `hover`: +- To click "Submit": `{ "action": "click", "selector": "e4" }` +- To type a query: `{ "action": "type", "selector": "e3", "text": "AI news" }` + +## Extracting Information +- Use `eval` with JavaScript for targeted data extraction: + - `document.title` + - `[...document.querySelectorAll('h2')].map(e=>e.innerText).join('\n')` + - `document.body.innerText` (for clean text without HTML) +- Use `get_a11y` for structured listings of links, headings, buttons. + +## Session Persistence +Always use the same `session_id` across steps to preserve cookies, login state, and element refs. diff --git a/skills/browser-automation-agent/logic.py b/skills/browser-automation-agent/logic.py index fd1dc9d..c7d7ea7 100644 --- a/skills/browser-automation-agent/logic.py +++ b/skills/browser-automation-agent/logic.py @@ -2,79 +2,225 @@ import queue import time import json +import re from playwright.sync_api import sync_playwright from agent_node.skills.base import BaseSkill from protos import agent_pb2 +# ============================================================ +# Role-Ref Registry +# Inspired by OpenClaw's pw-role-snapshot.ts +# Maps `ref=eN` shorthand -> (role, name, nth) for every +# interactive / content element on the last snapshotted page. 
+# ============================================================ + +INTERACTIVE_ROLES = { + "button", "link", "textbox", "checkbox", "radio", "combobox", + "listbox", "menuitem", "menuitemcheckbox", "menuitemradio", + "option", "searchbox", "slider", "spinbutton", "switch", "tab", "treeitem", +} +CONTENT_ROLES = { + "heading", "cell", "gridcell", "columnheader", "rowheader", + "listitem", "article", "region", "main", "navigation", +} +STRUCTURAL_ROLES = { + "generic", "group", "list", "table", "row", "rowgroup", "grid", + "treegrid", "menu", "menubar", "toolbar", "tablist", "tree", + "directory", "document", "application", "presentation", "none", +} + + +def _build_aria_snapshot(aria_text: str) -> tuple[str, dict]: + """ + Parse Playwright's ariaSnapshot() output and annotate interactive/content + elements with stable [ref=eN] labels that the AI can refer back to. + Returns (annotated_snapshot, ref_map). + """ + lines = aria_text.split("\n") + refs = {} + counter = [0] + role_counts = {} # (role, name) -> count (for nth disambiguation) + output_lines = [] + + def next_ref(): + counter[0] += 1 + return f"e{counter[0]}" + + for line in lines: + m = re.match(r'^(\s*-\s*)(\w+)(?:\s+"([^"]*)")?(.*)$', line) + if not m: + output_lines.append(line) + continue + + prefix, role_raw, name, suffix = m.group(1), m.group(2), m.group(3), m.group(4) + role = role_raw.lower() + + is_interactive = role in INTERACTIVE_ROLES + is_content_with_name = role in CONTENT_ROLES and name + + if not (is_interactive or is_content_with_name): + output_lines.append(line) + continue + + # assign ref + ref = next_ref() + key = (role, name) + nth = role_counts.get(key, 0) + role_counts[key] = nth + 1 + + refs[ref] = {"role": role, "name": name, "nth": nth if nth > 0 else None} + + enhanced = f"{prefix}{role_raw}" + if name: + enhanced += f' "{name}"' + enhanced += f" [ref={ref}]" + if nth > 0: + enhanced += f" [nth={nth}]" + if suffix: + enhanced += suffix + output_lines.append(enhanced) + + return 
"\n".join(output_lines), refs + + +def _resolve_ref(page, ref: str, role_refs: dict): + """Resolve a ref string such as 'e3' to a Playwright Locator via the session's last ref map.""" + info = role_refs.get(ref) + if not info: + raise ValueError(f"Unknown ref '{ref}'. Run get_a11y first and use a ref from that output.") + role = info["role"] + name = info.get("name") + nth = info.get("nth") or 0 + if name: + loc = page.get_by_role(role, name=name, exact=True) + else: + loc = page.get_by_role(role) + if nth: + loc = loc.nth(nth) + return loc + + class BrowserSkill(BaseSkill): - """The 'Antigravity Bridge': Persistent Browser Skill using a dedicated Actor thread.""" + """ + Persistent Browser Skill — OpenClaw-inspired role-snapshot architecture. + + Key innovation over the prior version: + - `get_a11y` action returns a compact semantic role tree with [ref=eN] labels. + - All `click`, `type`, `hover` actions accept either a CSS/XPath selector OR a + ref string like 'e3', enabling the AI to address elements without fragile selectors. + - Console output and page errors are buffered per-session (console is also streamed via on_event). 
+ """ def __init__(self, sync_mgr=None): self.task_queue = queue.Queue() - self.sessions = {} # session_id -> { "context": Context, "page": Page } + # session_id -> { "context", "page", "role_refs", "console", "errors", "download_dir" } + self.sessions = {} self.sync_mgr = sync_mgr self.lock = threading.Lock() threading.Thread(target=self._browser_actor, daemon=True, name="BrowserActor").start() - def _setup_listeners(self, sid, page, on_event): - """Tunnels browser internal events back to the Orchestrator.""" - if not on_event: return - - # Live Console Redirector - page.on("console", lambda msg: on_event(agent_pb2.BrowserEvent( - session_id=sid, console_msg=agent_pb2.ConsoleMessage( - level=msg.type, text=msg.text, timestamp_ms=int(time.time()*1000) - ) - ))) - - # Live Network Redirector - page.on("requestfinished", lambda req: on_event(agent_pb2.BrowserEvent( - session_id=sid, network_req=agent_pb2.NetworkRequest( - method=req.method, url=req.url, status=req.response().status if req.response() else 0, - resource_type=req.resource_type, latency_ms=0 - ) - ))) + # ------------------------------------------------------------------ + # Session Management + # ------------------------------------------------------------------ - # Live Download Redirector - page.on("download", lambda download: self._handle_download(sid, download)) - - def _handle_download(self, sid, download): - """Saves browser downloads directly into the synchronized session workspace.""" - import os + def _get_or_create_session(self, browser, sid, task, on_event): + """Return existing session dict or create a new one.""" with self.lock: - sess = self.sessions.get(sid) - if sess and sess.get("download_dir"): - os.makedirs(sess["download_dir"], exist_ok=True) - target = os.path.join(sess["download_dir"], download.suggested_filename) - print(f" [🌐📥] Browser Download Sync: {download.suggested_filename} -> {target}") - download.save_as(target) - + if sid in self.sessions: + return self.sessions[sid] + + 
download_dir = None + if self.sync_mgr and task.session_id: + download_dir = self.sync_mgr.get_session_dir(task.session_id) + print(f" [🌐📁] Mapping Browser Context to: {download_dir}") + + ctx = browser.new_context(accept_downloads=True) + page = ctx.new_page() + + sess = { + "context": ctx, + "page": page, + "role_refs": {}, # ref -> {role, name, nth} + "console": [], + "errors": [], + "download_dir": download_dir, + } + self.sessions[sid] = sess + + # Listeners + self._attach_listeners(sid, page, on_event, sess) + return sess + + def _attach_listeners(self, sid, page, on_event, sess): + # Console log capture + def _on_console(msg): + entry = {"level": msg.type, "text": msg.text, "ts": int(time.time() * 1000)} + sess["console"].append(entry) + if len(sess["console"]) > 200: + sess["console"].pop(0) + if on_event: + on_event(agent_pb2.BrowserEvent( + session_id=sid, + console_msg=agent_pb2.ConsoleMessage( + level=msg.type, text=msg.text, timestamp_ms=entry["ts"] + ) + )) + + def _on_page_error(err): + sess["errors"].append({"message": str(err), "ts": int(time.time() * 1000)}) + if len(sess["errors"]) > 100: + sess["errors"].pop(0) + + def _on_network(req): + resp = req.response() + if on_event: + on_event(agent_pb2.BrowserEvent( + session_id=sid, + network_req=agent_pb2.NetworkRequest( + method=req.method, url=req.url, + status=resp.status if resp else 0, + resource_type=req.resource_type, latency_ms=0 + ) + )) + + def _on_download(dl): + import os + with self.lock: + s = self.sessions.get(sid) + if s and s.get("download_dir"): + os.makedirs(s["download_dir"], exist_ok=True) + target = os.path.join(s["download_dir"], dl.suggested_filename) + print(f" [🌐📥] Download: {dl.suggested_filename} -> {target}") + dl.save_as(target) + + page.on("console", _on_console) + page.on("pageerror", _on_page_error) + page.on("requestfinished", _on_network) + page.on("download", _on_download) + + # ------------------------------------------------------------------ + # Browser Actor 
Loop + # ------------------------------------------------------------------ + def _browser_actor(self): - """Serializes all Playwright operations on a single dedicated thread.""" print("[🌐] Browser Actor Starting...", flush=True) - pw = None - browser = None + pw = browser = None try: - # 1. Start Playwright Driver try: pw = sync_playwright().start() except Exception as pe: - print(f"[!] Browser Skill: Playwright driver failed to start: {pe}", flush=True) - print("[!] This usually means Playwright is not installed or the OS is missing dependencies (e.g. GLIBC).") + print(f"[!] Playwright failed to start: {pe}", flush=True) return - # 2. Launch Browser Engine try: - # 12-Factor/Container Optimization: Standard non-sandbox arguments browser = pw.chromium.launch(headless=True, args=[ - '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu' + '--no-sandbox', '--disable-setuid-sandbox', + '--disable-dev-shm-usage', '--disable-gpu' ]) print("[🌐] Browser Engine Online.", flush=True) except Exception as be: - print(f"[!] Browser Skill: Chromium launch failed: {be}", flush=True) - print("[!] Tip: Run 'playwright install' or check for missing system libraries.") + print(f"[!] Chromium launch failed: {be}", flush=True) if pw: pw.stop() return - except Exception as e: print(f"[!] 
Browser Actor critical failure: {e}", flush=True) if pw: pw.stop() @@ -83,69 +229,45 @@ while True: try: item = self.task_queue.get() - if item is None: # Sentinel for shutdown + if item is None: print("[🌐] Browser Actor Shutting Down...", flush=True) break - + task, sandbox, on_complete, on_event = item action = task.browser_action sid = action.session_id or "default" - - with self.lock: - if sid not in self.sessions: - # Phase 4: Mount workspace for downloads/uploads - download_dir = None - if self.sync_mgr and task.session_id: - download_dir = self.sync_mgr.get_session_dir(task.session_id) - print(f" [🌐📁] Mapping Browser Context to: {download_dir}") + action_name = agent_pb2.BrowserAction.ActionType.Name(action.action) + print(f" [🌐] {action_name} | Session: {sid}", flush=True) - ctx = browser.new_context(accept_downloads=True) - pg = ctx.new_page() - self._setup_listeners(sid, pg, on_event) - self.sessions[sid] = {"context": ctx, "page": pg, "download_dir": download_dir} - - page = self.sessions[sid]["page"] - print(f" [🌐] Browser Actor Processing: {agent_pb2.BrowserAction.ActionType.Name(action.action)} | Session: {sid}", flush=True) - + sess = self._get_or_create_session(browser, sid, task, on_event) + page = sess["page"] + res_data = {} - # State-Machine Logic for Actions - if action.action == agent_pb2.BrowserAction.NAVIGATE: - page.goto(action.url, wait_until="commit") - elif action.action == agent_pb2.BrowserAction.CLICK: - page.click(action.selector) - elif action.action == agent_pb2.BrowserAction.TYPE: - page.fill(action.selector, action.text) - elif action.action == agent_pb2.BrowserAction.SCREENSHOT: - res_data["snapshot"] = page.screenshot() - elif action.action == agent_pb2.BrowserAction.GET_DOM: - res_data["dom_content"] = page.content() - elif action.action == agent_pb2.BrowserAction.HOVER: - page.hover(action.selector) - elif action.action == agent_pb2.BrowserAction.SCROLL: - page.mouse.wheel(x=0, y=action.y) - elif action.action == 
agent_pb2.BrowserAction.EVAL: - res_data["eval_result"] = str(page.evaluate(action.text)) - elif action.action == agent_pb2.BrowserAction.GET_A11Y: - res_data["a11y_tree"] = json.dumps(page.accessibility.snapshot()) - elif action.action == agent_pb2.BrowserAction.CLOSE: - with self.lock: - sess = self.sessions.pop(sid, None) - if sess: sess["context"].close() + try: + self._dispatch_action(action, page, sess, res_data) + except Exception as e: + on_complete(task.task_id, {"stderr": str(e), "status": 2}, task.trace_id) + continue - # Results Construction + # Build BrowserResponse — snapshot actions put the tree in a11y_tree and their stats in eval_result br_res = agent_pb2.BrowserResponse( - url=page.url, title=page.title(), + url=page.url, + title="" if page.is_closed() else page.title(), # after CLOSE the page is gone and title() would raise snapshot=res_data.get("snapshot", b""), dom_content=res_data.get("dom_content", ""), a11y_tree=res_data.get("a11y_tree", ""), - eval_result=res_data.get("eval_result", "") + eval_result=res_data.get("eval_result", ""), ) on_complete(task.task_id, {"status": 1, "browser_result": br_res}, task.trace_id) + except Exception as e: print(f" [!] 
Browser Actor Error: {e}", flush=True) - on_complete(task.task_id, {"stderr": str(e), "status": 2}, task.trace_id) + try: + on_complete(task.task_id, {"stderr": str(e), "status": 2}, task.trace_id) + except Exception: + pass - # Cleanup on loop exit + # Cleanup print("[🌐] Cleaning up Browser Engine...", flush=True) with self.lock: for s in self.sessions.values(): @@ -155,11 +277,93 @@ if browser: browser.close() if pw: pw.stop() + # ------------------------------------------------------------------ + # Action Dispatcher + # ------------------------------------------------------------------ + + def _dispatch_action(self, action, page, sess, res_data): + A = agent_pb2.BrowserAction + role_refs = sess["role_refs"] + + def resolve(selector_or_ref: str): + """Accept either a CSS selector or a ref like 'e3'.""" + s = (selector_or_ref or "").strip() + if re.match(r'^e\d+$', s): + return _resolve_ref(page, s, role_refs) + return page.locator(s) + + if action.action == A.NAVIGATE: + page.goto(action.url, wait_until="domcontentloaded", timeout=25000) + # Auto-snapshot after every navigation: give AI page context immediately + aria_raw = page.locator(":root").aria_snapshot() + snap, refs = _build_aria_snapshot(aria_raw) + sess["role_refs"] = refs + # Trim to 8000 chars to avoid bloating the grpc response + trimmed = snap[:8000] + ("\n\n[...snapshot truncated...]" if len(snap) > 8000 else "") + stats = f"refs={len(refs)}" + res_data["a11y_tree"] = trimmed + res_data["eval_result"] = stats + + elif action.action == A.CLICK: + target = action.selector or "" + resolve(target).click(timeout=8000) + + elif action.action == A.TYPE: + target = action.selector or "" + resolve(target).fill(action.text, timeout=8000) + + elif action.action == A.SCREENSHOT: + res_data["snapshot"] = page.screenshot(full_page=False) + + elif action.action == A.GET_DOM: + res_data["dom_content"] = page.content() + + elif action.action == A.HOVER: + target = action.selector or "" + 
resolve(target).hover(timeout=5000) + + elif action.action == A.SCROLL: + page.mouse.wheel(0, action.y or 400) # positional: Playwright names these delta_x/delta_y, not x/y + + elif action.action == A.EVAL: + result = page.evaluate(action.text) + res_data["eval_result"] = str(result) + + elif action.action == A.GET_A11Y: + # OpenClaw-style role snapshot with ref labels — the key feature! + aria_raw = page.locator(":root").aria_snapshot() + snap, refs = _build_aria_snapshot(aria_raw) + sess["role_refs"] = refs # remember refs for subsequent click/type calls + + # Trim large snapshots (news pages can be huge) + MAX = 10000 + if len(snap) > MAX: + snap = snap[:MAX] + "\n\n[...snapshot truncated - use eval/scroll to see more...]" + + stats = { + "total_refs": len(refs), + "interactive": sum(1 for r in refs.values() if r["role"] in INTERACTIVE_ROLES), + "url": page.url, + "title": page.title(), + } + res_data["a11y_tree"] = snap + res_data["eval_result"] = json.dumps(stats) + + elif action.action == A.CLOSE: + with self.lock: + s = self.sessions.pop(action.session_id or "default", None) + if s: + s["context"].close() + + # ------------------------------------------------------------------ + # Public Interface + # ------------------------------------------------------------------ + def execute(self, task, sandbox, on_complete, on_event=None): self.task_queue.put((task, sandbox, on_complete, on_event)) - def cancel(self, task_id): return False + def cancel(self, task_id): + return False def shutdown(self): - """Triggers graceful shutdown of the browser engine.""" self.task_queue.put(None)