diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000..9cd442d9 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,62 @@ +# Copilot Instructions for workflow-use + +These project-specific instructions help AI coding agents work productively in this repo. Keep responses concise, implement with tools when possible, and follow the repo’s patterns. + +## Architecture (big picture) +- Monorepo with three main parts: + - `extension/` (WXT MV3 Chrome extension): records user actions via rrweb + custom DOM events and streams to a local server. + - `workflows/` (Python package + CLI + FastAPI backend): converts recorded events into deterministic workflow steps, runs/replays steps (fallback to Browser Use/Playwright). + - `ui/` (Vite/React frontend): GUI to view and run workflows. +- Data flow: + 1) Content scripts capture events → background aggregates → posts `WORKFLOW_UPDATE` to `http://127.0.0.1:7331/event`. + 2) Backend stores/processes to `workflows/examples/*.json` and executes with Playwright. + 3) UI/CLI visualize/run. +- Key types: `extension/src/lib/types.ts` (Stored* events), `extension/src/lib/workflow-types.ts` (Step union), `workflows` Python `Workflow` model. + +## Dev workflows +- Build extension: `cd extension && npm install && npm run build`. +- Python backend setup: `cd workflows && uv sync && playwright install chromium && cp .env.example .env`. +- Record: `cd workflows && python cli.py create-workflow` (starts local server; open Chrome with built extension). +- Run workflow as tool: `python cli.py run-as-tool examples/example.workflow.json --prompt "..."`. +- Run workflow: `python cli.py run-workflow examples/example.workflow.json`. +- Launch GUI: `python cli.py launch-gui` (starts FastAPI + UI dev server). + +## Extension patterns +- Use `defineBackground` and `defineContentScript` (WXT). 
Content script always attaches listeners; background aggregates and emits `WORKFLOW_UPDATE` with a hash to avoid spam. +- Recording: + - rrweb for scroll/meta; custom `CUSTOM_CLICK_EVENT`, `CUSTOM_INPUT_EVENT`, `CUSTOM_KEY_EVENT`, etc. + - New-tab intent: content sends `PREPARE_NEW_TAB`; background correlates `tabs.onCreated` and marks `userInitiated`. + - Activated tab gating: ignore tabs never activated (reduces ad/tracker noise). + - Dedupe: merge consecutive identical steps, collapse rapid empty input bursts, consolidate navigations per tab. + - Iframes: content runs with `allFrames: true` and `matchAboutBlank: true`; events carry `frameUrl` and `frameIdPath`. Background only allows rrweb meta navigations from frames the user interacted with and filters ad/analytics hosts. + +## Backend patterns +- Python FastAPI endpoint `http://127.0.0.1:7331/event` receives: + - `RECORDING_STARTED/STOPPED`, `WORKFLOW_UPDATE` with `steps` only (hash-based dedupe). +- CLI: `workflows/cli.py` provides record/run/launch commands; Playwright is used for replay. +- Keep workflow JSON in `workflows/examples/`. Naming is free-form; version stays at `1.0.0` today. + +## Conventions +- Step schema (extension `workflow-types.ts`): navigation, click, input, key_press, scroll. Prefer merging updates over emitting new steps. +- Use XPath + enhanced CSS selectors; keep values masked for password inputs. +- Avoid sending events from tabs not in `activatedTabs` unless `userInitiated`. +- When adding new event types, extend Stored* in `types.ts`, enrich in content, and map to `Step` in background. + +## Gotchas / Tips +- Avoid noisy iframe navs (recaptcha/ads): rely on `interactedFrameUrls` filtering in background. If adjusting, prefer allow/deny logic over hard-coding hosts in multiple places. +- When changing extension logic, rebuild with `npm run build`; dev opens side panel on install/update. 
+- Screenshot capture only works for visible tabs; background uses `captureVisibleTab` best-effort. +- If tests are added, ensure they run per package (`extension`, `workflows`, `ui`) rather than at repo root. + +## Example tasks for agents +- Add a new step type (e.g., select): + 1) Extend `StoredCustomSelectEvent` in `types.ts` and emit in `content.ts`. + 2) Map to a `SelectStep` in `background.ts` (convertStoredEventsToSteps). + 3) Update backend replay to handle the new step. +- Reduce noise further: + - Tune debounce windows in content. + - Post-process duplicates in `broadcastWorkflowDataUpdate`. + - Add frame interaction checks before accepting rrweb meta navigations. + +## Security & secrets +- Do not commit real API keys. `.env.example` exists; load secrets locally. If you see a real key in `workflows/.env`, instruct maintainers to rotate and remove it. diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index 283a16f4..bf8d49ea 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -30,6 +30,37 @@ export default defineBackground(() => { // Store tab information (URL, potentially title) const tabInfo: { [tabId: number]: { url?: string; title?: string } } = {}; + // Track which tabs have been explicitly activated (brought to foreground) by the user. + // We will ignore events originating from tabs that were never activated to reduce noise + // (for example: ad / tracker tabs that load in the background). + const activatedTabs = new Set(); + + // Track user clicks that are likely to open a new tab (Ctrl/Cmd + click, target=_blank etc.). + // Content scripts will send a PREPARE_NEW_TAB signal; we keep timestamp to correlate + // shortly following chrome.tabs.onCreated events so we can mark those tabs as user initiated. 
+ const recentNewTabIntents: { [openerTabId: number]: number } = {}; + // Record iframe URLs that the user actually interacted with (via custom events) per tab + const interactedFrameUrls: Record<number, Set<string>> = {}; + // Additionally track last interaction time per frame for time-window gating + const interactedFrameTimes: Record<number, Record<string, number>> = {}; + // Hostname patterns for iframe navigation noise we want to suppress + const BLOCKED_IFRAME_HOST_PATTERNS: RegExp[] = [ + /doubleclick\.net$/i, + /googlesyndication\.com$/i, + /googleadservices\.com$/i, + /amazon-adsystem\.com$/i, + /recaptcha\.google\.com$/i, + /recaptcha\.net$/i, + /googletagmanager\.com$/i, + /indexww\.com$/i, + /adtrafficquality\.google$/i, + /2mdn\.net$/i, + /gstaticadssl\.googleapis\.com$/i, + ]; + + // Heuristic window (ms) within which a created tab following a user intent is considered relevant. + const NEW_TAB_INTENT_WINDOW_MS = 4000; + let isRecordingEnabled = true; // Default to disabled (OFF) let lastWorkflowHash: string | null = null; // Cache for the last logged workflow hash @@ -66,13 +97,72 @@ export default defineBackground(() => { // Function to broadcast workflow data updates to the console bus async function broadcastWorkflowDataUpdate(): Promise<void> { // console.log("[DEBUG] broadcastWorkflowDataUpdate: Entered function"); // Optional: Keep for debugging - const allSteps: Step[] = Object.keys(sessionLogs) + const rawSteps: Step[] = Object.keys(sessionLogs) .flatMap((tabIdStr) => { const tabId = parseInt(tabIdStr, 10); return convertStoredEventsToSteps(sessionLogs[tabId] || []); }) .sort((a, b) => a.timestamp - b.timestamp); // Sort chronologically + // Post-process to collapse consecutive duplicates that only differ by timestamp (e.g. repeated identical navigations) + const allSteps: Step[] = []; + for (const step of rawSteps) { + const last = allSteps.length ? 
allSteps[allSteps.length - 1] : null; + if (!last) { + allSteps.push(step); + continue; + } + let isDuplicate = false; + if (last.type === step.type) { + switch (step.type) { + case 'navigation': + isDuplicate = (last as NavigationStep).url === (step as NavigationStep).url && last.tabId === step.tabId; + break; + case 'input': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).frameUrl === (step as any).frameUrl && + (last as any).xpath === (step as any).xpath && + (last as any).elementTag === (step as any).elementTag && + (last as any).value === (step as any).value; + break; + case 'click': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).frameUrl === (step as any).frameUrl && + (last as any).xpath === (step as any).xpath && + (last as any).elementTag === (step as any).elementTag && + (last as any).elementText === (step as any).elementText; + break; + case 'scroll': { + const sameXY = (last as any).scrollX === (step as any).scrollX && (last as any).scrollY === (step as any).scrollY; + const sameUrl = (last as any).url === (step as any).url; + const nearTime = Math.abs(step.timestamp - last.timestamp) < 200; + isDuplicate = last.tabId === step.tabId && sameXY && sameUrl && nearTime; + break; + } + case 'key_press': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).key === (step as any).key && + (last as any).xpath === (step as any).xpath; + break; + } + } + if (isDuplicate) { + // Update timestamp (and screenshot if present) to most recent but don't add new step + last.timestamp = step.timestamp; + if ((step as any).screenshot) { + (last as any).screenshot = (step as any).screenshot; + } + } else { + allSteps.push(step); + } + } + // Create the workflowData object *after* sorting steps, but hash only steps const workflowData: Workflow = { name: "Recorded Workflow", @@ -144,6 +234,16 @@ export 
default defineBackground(() => { console.log(`Sending ${type}:`, payload); const tabId = payload.tabId; if (tabId) { + // Skip capturing events for tabs that have never been activated AND are not the original opener + // unless we have positively identified them as a recent user initiated tab (click intent -> creation). + if ( + type !== "CUSTOM_TAB_ACTIVATED" && + !activatedTabs.has(tabId) && + !(payload.openerTabId && recentNewTabIntents[payload.openerTabId] && Date.now() - recentNewTabIntents[payload.openerTabId] < NEW_TAB_INTENT_WINDOW_MS) + ) { + // Silently ignore background noise (ad/tracker tabs) until user actually focuses them. + return; + } if (!sessionLogs[tabId]) { sessionLogs[tabId] = []; } @@ -171,6 +271,12 @@ export default defineBackground(() => { url: tab.pendingUrl || tab.url, windowId: tab.windowId, index: tab.index, + userInitiated: + !!( + tab.openerTabId && + recentNewTabIntents[tab.openerTabId] && + Date.now() - recentNewTabIntents[tab.openerTabId] < NEW_TAB_INTENT_WINDOW_MS + ), }); }); @@ -188,6 +294,7 @@ export default defineBackground(() => { }); chrome.tabs.onActivated.addListener((activeInfo) => { + activatedTabs.add(activeInfo.tabId); sendTabEvent("CUSTOM_TAB_ACTIVATED", { tabId: activeInfo.tabId, windowId: activeInfo.windowId, @@ -212,72 +319,119 @@ export default defineBackground(() => { // --- Conversion Function --- + const DEFAULT_SETTINGS = { + enableIframes: true as boolean, + iframeWindow: 3000 as number, + blocklist: [ + 'doubleclick.net','googlesyndication.com','googleadservices.com', + 'amazon-adsystem.com','2mdn.net','recaptcha.google.com','recaptcha.net', + 'googletagmanager.com','indexww.com','adtrafficquality.google' + ] as string[], + allowlist: [] as string[], + }; + let settings: { enableIframes: boolean; iframeWindow: number; blocklist: string[]; allowlist: string[] } = { ...DEFAULT_SETTINGS }; + chrome.storage.sync.get(DEFAULT_SETTINGS, (s: any) => { settings = { ...settings, ...s }; }); + 
chrome.storage.onChanged.addListener((changes, area) => { + if (area !== 'sync') return; + const next = { ...settings } as any; + for (const k of Object.keys(changes)) (next as any)[k] = (changes as any)[k].newValue; + settings = next; + }); + function convertStoredEventsToSteps(events: StoredEvent[]): Step[] { const steps: Step[] = []; + const lastNavigationIndexByTab: Record<number, number> = {}; + const lastInputPerKey: Record<string, { idx: number; ts: number; value: string }> = {}; for (const event of events) { switch (event.messageType) { - case "CUSTOM_CLICK_EVENT": { - const clickEvent = event as StoredCustomClickEvent; - // Ensure required fields are present, even if optional in source type for some reason + case "CUSTOM_TAB_CREATED": + case "CUSTOM_TAB_UPDATED": + case "CUSTOM_TAB_ACTIVATED": { + const navUrl = (event as any).url || (event as any).changeInfo?.url; + if (!navUrl) break; + const tabId = (event as any).tabId; + const userInitiated = (event as any).userInitiated; + if (!activatedTabs.has(tabId) && !userInitiated) break; // suppress background noise + + const existingIdx = lastNavigationIndexByTab[tabId]; if ( - clickEvent.url && - clickEvent.frameUrl && - clickEvent.xpath && - clickEvent.elementTag + existingIdx !== undefined && + steps[existingIdx] && + steps[existingIdx].type === "navigation" ) { + // Update existing navigation (redirect / title change) + (steps[existingIdx] as NavigationStep).url = navUrl; + steps[existingIdx].timestamp = event.timestamp; + } else { + const nav: NavigationStep = { + type: "navigation", + timestamp: event.timestamp, + tabId, + url: navUrl, + }; + steps.push(nav); + lastNavigationIndexByTab[tabId] = steps.length - 1; + } + break; + } + case "CUSTOM_CLICK_EVENT": { + const click = event as StoredCustomClickEvent; + if (click.url && click.xpath && click.elementTag) { const step: ClickStep = { type: "click", - timestamp: clickEvent.timestamp, - tabId: clickEvent.tabId, - url: clickEvent.url, - frameUrl: clickEvent.frameUrl, - xpath: clickEvent.xpath, - cssSelector: 
clickEvent.cssSelector, - elementTag: clickEvent.elementTag, - elementText: clickEvent.elementText, - screenshot: clickEvent.screenshot, + timestamp: click.timestamp, + tabId: click.tabId, + url: click.url, + frameUrl: click.frameUrl, + frameIdPath: (click as any).frameIdPath, + xpath: click.xpath, + cssSelector: click.cssSelector, + elementTag: click.elementTag, + elementText: click.elementText, + screenshot: click.screenshot, }; steps.push(step); } else { - console.warn("Skipping incomplete CUSTOM_CLICK_EVENT:", clickEvent); + console.warn("Skipping incomplete CUSTOM_CLICK_EVENT", click); } break; } - case "CUSTOM_INPUT_EVENT": { const inputEvent = event as StoredCustomInputEvent; - if ( - inputEvent.url && - // inputEvent.frameUrl && // frameUrl might be null/undefined in some cases, let's allow merging if only one is present or both match - inputEvent.xpath && - inputEvent.elementTag - ) { + if (inputEvent.url && inputEvent.xpath && inputEvent.elementTag) { + const key = `${inputEvent.tabId}|${inputEvent.xpath}`; + const prior = lastInputPerKey[key]; + const nowTs = inputEvent.timestamp; + const isEmpty = (inputEvent as any).value === ""; + if (isEmpty && prior && prior.value === "" && nowTs - prior.ts < 5000) { + // collapse rapid-fire repeated empties + steps[prior.idx].timestamp = nowTs; + break; + } const lastStep = steps.length > 0 ? 
steps[steps.length - 1] : null; - - // Check if the last step was a mergeable input event if ( lastStep && lastStep.type === "input" && lastStep.tabId === inputEvent.tabId && lastStep.url === inputEvent.url && - lastStep.frameUrl === inputEvent.frameUrl && // Ensure frameUrls match if both exist + lastStep.frameUrl === inputEvent.frameUrl && lastStep.xpath === inputEvent.xpath && lastStep.cssSelector === inputEvent.cssSelector && lastStep.elementTag === inputEvent.elementTag ) { - // Update the last input step (lastStep as InputStep).value = inputEvent.value; - lastStep.timestamp = inputEvent.timestamp; // Update to latest timestamp - (lastStep as InputStep).screenshot = inputEvent.screenshot; // Update to latest screenshot + lastStep.timestamp = inputEvent.timestamp; + (lastStep as InputStep).screenshot = inputEvent.screenshot; + lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } else { - // Add a new input step const newStep: InputStep = { type: "input", timestamp: inputEvent.timestamp, tabId: inputEvent.tabId, url: inputEvent.url, frameUrl: inputEvent.frameUrl, + frameIdPath: (inputEvent as any).frameIdPath, xpath: inputEvent.xpath, cssSelector: inputEvent.cssSelector, elementTag: inputEvent.elementTag, @@ -285,6 +439,7 @@ export default defineBackground(() => { screenshot: inputEvent.screenshot, }; steps.push(newStep); + lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } } else { console.warn("Skipping incomplete CUSTOM_INPUT_EVENT:", inputEvent); @@ -303,6 +458,7 @@ export default defineBackground(() => { tabId: keyEvent.tabId, url: keyEvent.url, frameUrl: keyEvent.frameUrl, // Can be missing + frameIdPath: (keyEvent as any).frameIdPath, key: keyEvent.key, xpath: keyEvent.xpath, cssSelector: keyEvent.cssSelector, @@ -329,14 +485,21 @@ export default defineBackground(() => { y: number; }; // Type assertion for clarity const currentTabInfo = tabInfo[rrEvent.tabId]; // Get 
associated tab info for URL - + // Drop internal chrome pages like chrome://newtab/ + if (currentTabInfo?.url?.startsWith('chrome://')) { + break; + } // Check if the last step added was a mergeable scroll event const lastStep = steps.length > 0 ? steps[steps.length - 1] : null; if ( lastStep && lastStep.type === "scroll" && lastStep.tabId === rrEvent.tabId && - (lastStep as ScrollStep).targetId === scrollData.id + // Treat same XY within a short time window as duplicate, regardless of targetId + (lastStep as ScrollStep).scrollX === scrollData.x && + (lastStep as ScrollStep).scrollY === scrollData.y && + Math.abs(rrEvent.timestamp - lastStep.timestamp) < 200 && + (lastStep as any).url === currentTabInfo?.url ) { // Update the last scroll step (lastStep as ScrollStep).scrollX = scrollData.x; @@ -359,11 +522,43 @@ export default defineBackground(() => { } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { // Also handle rrweb meta events as navigation const metaData = rrEvent.data as { href: string }; + const href = metaData.href; + // Drop about:blank always + if (href === 'about:blank') { + break; + } + try { + const urlObj = new URL(href); + const host = urlObj.hostname; + // Allowlist overrides blocklist + const inAllow = settings.allowlist.some(d => host.endsWith(d)); + const inBlock = settings.blocklist.some(d => host.endsWith(d)); + if (!inAllow && inBlock) { + break; + } + if (!settings.enableIframes && !(rrEvent as any).isTopFrame) { + break; // user disabled iframe recording + } + // If top frame, allow + if ((rrEvent as any).isTopFrame) { + // allowed + } else { + const fUrl = (rrEvent as any).frameUrl as string | undefined; + if (!fUrl) break; + const times = interactedFrameTimes[rrEvent.tabId] || {}; + const lastTs = times[fUrl]; + if (!lastTs) break; + if (Date.now() - lastTs > settings.iframeWindow) break; + } + } catch { + break; + } const step: NavigationStep = { type: "navigation", timestamp: rrEvent.timestamp, tabId: rrEvent.tabId, 
url: metaData.href, + // frameIdPath could be attached if needed }; steps.push(step); } @@ -396,6 +591,8 @@ export default defineBackground(() => { "CUSTOM_INPUT_EVENT", "CUSTOM_SELECT_EVENT", "CUSTOM_KEY_EVENT", + // Synthetic event we will emit from content script just before an expected new tab open. + "PREPARE_NEW_TAB", ]; if ( message.type === "RRWEB_EVENT" || @@ -412,6 +609,13 @@ export default defineBackground(() => { const tabId = sender.tab.id; const isCustomEvent = customEventTypes.includes(message.type); + // Record intent for new tab opening to correlate with onCreated event. + if (message.type === "PREPARE_NEW_TAB") { + recentNewTabIntents[sender.tab.id] = Date.now(); + // We do not store this as a workflow step; it's only heuristic metadata. + return false; + } + // Function to store the event const storeEvent = (eventPayload: any, screenshotDataUrl?: string) => { if (!sessionLogs[tabId]) { @@ -434,6 +638,13 @@ export default defineBackground(() => { screenshot: screenshotDataUrl, }; sessionLogs[tabId].push(eventWithMeta); + // Mark frame as interacted so subsequent iframe meta navigations can be allowed + if (message.type.startsWith("CUSTOM_") && eventPayload.frameUrl) { + if (!interactedFrameUrls[tabId]) interactedFrameUrls[tabId] = new Set(); + interactedFrameUrls[tabId].add(eventPayload.frameUrl); + if (!interactedFrameTimes[tabId]) interactedFrameTimes[tabId] = {}; + interactedFrameTimes[tabId][eventPayload.frameUrl] = Date.now(); + } broadcastWorkflowDataUpdate(); // Call is async, will not block // console.log(`Stored ${message.type} from tab ${tabId}`); }; diff --git a/extension/src/entrypoints/content.ts b/extension/src/entrypoints/content.ts index 8f7f233f..a646562f 100644 --- a/extension/src/entrypoints/content.ts +++ b/extension/src/entrypoints/content.ts @@ -122,6 +122,16 @@ function startRecorder() { emit(event) { if (!isRecordingActive) return; + const frameUrl = window.location.href; + const isTopFrame = window.self === window.top; + 
const frameIdPath = (() => { + try { + let win: any = window; const parts: number[] = []; + while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i<parent.frames.length;i++){ if(parent.frames[i]===win){ idx=i; break; } } parts.unshift(idx); win=parent; if(parts.length>10) break; } + return parts.length ? parts.join('.') : '0'; + } catch { return '0'; } + })(); + // Handle scroll events with debouncing and direction detection if ( event.type === EventType.IncrementalSnapshot && @@ -157,7 +167,10 @@ function startRecorder() { type: "RRWEB_EVENT", payload: { ...event, - data: roundedScrollData, // Use rounded coordinates + data: roundedScrollData, + frameUrl, + frameIdPath, + isTopFrame, }, }); lastDirection = currentDirection; @@ -178,15 +191,18 @@ function startRecorder() { type: "RRWEB_EVENT", payload: { ...event, - data: roundedScrollData, // Use rounded coordinates + data: roundedScrollData, + frameUrl, + frameIdPath, + isTopFrame, }, }); scrollTimeout = null; lastDirection = null; // Reset direction for next scroll }, DEBOUNCE_MS); } else { - // Pass through non-scroll events unchanged - chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: event }); + // Pass through non-scroll events unchanged, but include frame context for filtering in background + chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: { ...event, frameUrl, frameIdPath, isTopFrame } }); } }, maskInputOptions: { @@ -239,47 +255,92 @@ function handleCustomClick(event: MouseEvent) { if (!isRecordingActive) return; const targetElement = event.target as HTMLElement; if (!targetElement) return; - + // Determine a frame identifier (best-effort). Top frame = 0, nested frames build path. + const frameIdPath = (() => { + try { + let win: any = window; + const parts: number[] = []; + while (win !== win.parent) { + const parent = win.parent; + let index = 0; + for (let i = 0; i < parent.frames.length; i++) { + if (parent.frames[i] === win) { index = i; break; } + } + parts.unshift(index); + win = parent; + if (parts.length > 10) break; // safety + } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); try { const xpath = getXPath(targetElement); const clickData = { timestamp: Date.now(), - url: document.location.href, // Use document.location for main page URL - frameUrl: window.location.href, // URL of the frame where the event occurred - xpath: xpath, + url: document.location.href, + frameUrl: window.location.href, + frameIdPath, + xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, elementText: targetElement.textContent?.trim().slice(0, 200) || "", }; - console.log("Sending CUSTOM_CLICK_EVENT:", clickData); - chrome.runtime.sendMessage({ - type: "CUSTOM_CLICK_EVENT", - payload: clickData, - }); - } catch (error) { - console.error("Error capturing click data:", error); - } + chrome.runtime.sendMessage({ type: "CUSTOM_CLICK_EVENT", payload: clickData }); + } catch (error) { console.error("Error capturing click data:", error); } } // --- End Custom Click Handler --- // --- Custom Input Handler --- +// Maintain last recorded value & timestamp per element (keyed by xpath) to suppress noisy repeats +const lastInputRecord: Record<string, { value: string; ts: number }> = {}; function handleInput(event: Event) { if (!isRecordingActive) return; const targetElement = event.target as HTMLInputElement | HTMLTextAreaElement; if (!targetElement || !("value" in targetElement)) return; const isPassword = targetElement.type === "password"; + // Ignore programmatic (non user-trusted) input events – these often cause massive duplication + if (!(event as InputEvent).isTrusted) return; + + const frameIdPath = (() => { + try { + let win: any = window; const parts: number[] = []; + while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i<parent.frames.length;i++){ if(parent.frames[i]===win){ idx=i; break; } } parts.unshift(idx); win=parent; if(parts.length>10) break; } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); try { const xpath = getXPath(targetElement); const inputData = { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, xpath: xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, value: isPassword ? "********" : targetElement.value, }; + + // Dedupe rule 1: If value unchanged for this element and within debounce window, skip + const DEBOUNCE_MS_INPUT = 1500; + const prev = lastInputRecord[xpath]; + if (prev && prev.value === inputData.value && inputData.timestamp - prev.ts < DEBOUNCE_MS_INPUT) { + return; // Suppress noisy duplicate + } + + // Dedupe rule 2: If value is empty string and we already recorded empty in last 5s, suppress further empties + if ( + inputData.value === "" && + prev && + prev.value === "" && + inputData.timestamp - prev.ts < 5000 + ) { + return; + } + + // Store/update last record metadata + lastInputRecord[xpath] = { value: inputData.value, ts: inputData.timestamp }; console.log("Sending CUSTOM_INPUT_EVENT:", inputData); chrome.runtime.sendMessage({ type: "CUSTOM_INPUT_EVENT", @@ -297,6 +358,7 @@ function handleSelectChange(event: Event) { const targetElement = event.target as HTMLSelectElement; // Ensure it's a select element if (!targetElement || targetElement.tagName !== "SELECT") return; + const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i<parent.frames.length;i++){if(parent.frames[i]===win){idx=i;break;}} parts.unshift(idx); win=parent; if(parts.length>10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })(); try { const xpath = getXPath(targetElement); @@ -305,6 +367,7 @@ function handleSelectChange(event: Event) { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, xpath: xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, @@ -377,11 +440,13 @@ function handleKeydown(event: 
KeyboardEvent) { } } + const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i<parent.frames.length;i++){if(parent.frames[i]===win){idx=i;break;}} parts.unshift(idx); win=parent; if(parts.length>10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })(); try { const keyData = { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, key: keyToLog, // The key or combination pressed xpath: xpath, // XPath of the element in focus (if any) cssSelector: cssSelector, // CSS selector of the element in focus (if any) @@ -544,6 +609,9 @@ function handleBlur(event: FocusEvent) { export default defineContentScript({ matches: ["<all_urls>"], + // Ensure injection into all frames (iframes) so we can capture interactions inside nested documents. + allFrames: true, + matchAboutBlank: true, main(ctx) { // Listener for status updates from the background script chrome.runtime.onMessage.addListener((message, sender, sendResponse) => { diff --git a/extension/src/entrypoints/options.html b/extension/src/entrypoints/options.html new file mode 100644 index 00000000..dfb7bf49 --- /dev/null +++ b/extension/src/entrypoints/options.html @@ -0,0 +1,80 @@ + + + + + Workflow Use - Options + + + +

Recording Settings

+
+
+ +
When disabled, iframe-originated navigation/meta events are ignored.
+
+
+ + +
Time after a user interaction in an iframe during which rrweb meta navigations are allowed.
+
+
+ + +
+
+ + +
+ + +
+ + + + diff --git a/extension/src/entrypoints/sidepanel/components/recording-view.tsx b/extension/src/entrypoints/sidepanel/components/recording-view.tsx index 0fa3456a..1aa8a0de 100644 --- a/extension/src/entrypoints/sidepanel/components/recording-view.tsx +++ b/extension/src/entrypoints/sidepanel/components/recording-view.tsx @@ -1,7 +1,7 @@ import React from "react"; import { useWorkflow } from "../context/workflow-provider"; import { Button } from "@/components/ui/button"; -import { EventViewer } from "./event-viewer"; // Import EventViewer +import { EventViewer } from "./event-viewer"; export const RecordingView: React.FC = () => { const { stopRecording, workflow } = useWorkflow(); @@ -19,12 +19,13 @@ export const RecordingView: React.FC = () => { Recording ({stepCount} steps) - +
+ +
- {/* EventViewer will now take full available space within this div */}
diff --git a/extension/src/lib/types.ts b/extension/src/lib/types.ts index 601d3aac..8cc94232 100644 --- a/extension/src/lib/types.ts +++ b/extension/src/lib/types.ts @@ -2,6 +2,7 @@ export interface StoredCustomClickEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -15,6 +16,7 @@ export interface StoredCustomInputEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -28,6 +30,7 @@ export interface StoredCustomSelectEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -42,6 +45,7 @@ export interface StoredCustomKeyEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; key: string; xpath?: string; // XPath of focused element cssSelector?: string; @@ -73,6 +77,7 @@ export interface StoredRrwebEvent { data: any; timestamp: number; tabId: number; + frameUrl?: string; // URL of the frame where the rrweb event originated (when injected per-frame) messageType: "RRWEB_EVENT"; } diff --git a/extension/src/lib/workflow-types.ts b/extension/src/lib/workflow-types.ts index 57d4470f..35605bd6 100644 --- a/extension/src/lib/workflow-types.ts +++ b/extension/src/lib/workflow-types.ts @@ -27,12 +27,14 @@ export interface NavigationStep extends BaseStep { type: "navigation"; url: string; // Navigation implies a URL change screenshot?: string; // Optional in source + frameIdPath?: string; // Optional frame context } export interface ClickStep extends BaseStep { type: "click"; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; // Optional in source elementTag: string; @@ -44,6 +46,7 @@ export interface InputStep extends BaseStep { type: "input"; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; // Optional in 
source elementTag: string; @@ -55,6 +58,7 @@ export interface KeyPressStep extends BaseStep { type: "key_press"; url?: string; // Can be missing if key press happens without element focus? Source is optional. frameUrl?: string; // Might be missing + frameIdPath?: string; key: string; xpath?: string; // Optional in source cssSelector?: string; // Optional in source @@ -67,6 +71,7 @@ export interface ScrollStep extends BaseStep { targetId: number; // The rrweb ID of the element being scrolled scrollX: number; scrollY: number; + frameIdPath?: string; // Note: url might be missing if scroll happens on initial load before meta event? } diff --git a/extension/wxt.config.ts b/extension/wxt.config.ts index 9d7ae4ec..83bf64d8 100644 --- a/extension/wxt.config.ts +++ b/extension/wxt.config.ts @@ -13,9 +13,15 @@ export default defineConfig({ // WXT-specific overrides (optional) }), manifest: { - permissions: ["tabs", "sidePanel", "<all_urls>"], - host_permissions: ["http://127.0.0.1/*"], - // options_page: "options.html", + permissions: ["tabs", "sidePanel", "storage", "<all_urls>"], + // Broaden host permissions so content script can inject into iframes on external sites. + // Note: <all_urls> in permissions allows some access, but host_permissions explicitly grants injection rights. 
+ host_permissions: [ + "http://127.0.0.1/*", + "https://*/*", + "http://*/*" + ], + options_page: "options.html", // action: { // default_popup: "popup.html", // }, diff --git a/workflows/examples/test_iframes.json b/workflows/examples/test_iframes.json new file mode 100644 index 00000000..b1834355 --- /dev/null +++ b/workflows/examples/test_iframes.json @@ -0,0 +1,101 @@ +{ + "name": "Recorded Workflow", + "description": "Recorded on 8/11/2025, 4:20:47 AM", + "version": "1.0.0", + "input_schema": [], + "steps": [ + { + "type": "navigation", + "timestamp": 1754866228439, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "scroll", + "timestamp": 1754866228608, + "tabId": 388342781, + "targetId": 219, + "scrollX": 0, + "scrollY": 7, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "navigation", + "timestamp": 1754866228634, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866228849, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "id(\"textareawrapper\")/div[1]/div[6]", + "cssSelector": "div.CodeMirror-scroll", + "elementTag": "DIV", + "elementText": "​

The iframe element

​​" + }, + { + "type": "navigation", + "timestamp": 1754866230495, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866231531, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "body/div[2]/div[1]/a[4]", + "cssSelector": "a.w3-button.w3-bar-item.w3-hide-small.topnav-icons.fa.fa-rotate.ga-tryit[href=\"javascript:void(0);\"][title*=\"Change Orientation\"]", + "elementTag": "A", + "elementText": "" + }, + { + "type": "navigation", + "timestamp": 1754866237707, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866238574, + "tabId": 388342781, + "url": "https://www.w3schools.com/", + "frameUrl": "https://www.w3schools.com/", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[3]", + "cssSelector": "a.ga-nav[href=\"/js/default.asp\"][title*=\"JavaScript Tutorial\"]", + "elementTag": "A", + "elementText": "JAVASCRIPT" + }, + { + "type": "navigation", + "timestamp": 1754866242778, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866242882, + "tabId": 388342781, + "url": "https://www.w3schools.com/js/default.asp", + "frameUrl": "https://www.w3schools.com/js/default.asp", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[1]", + "cssSelector": "a.ga-nav.subtopnav_firstitem[href=\"/html/default.asp\"][title*=\"HTML Tutorial\"]", + "elementTag": "A", + "elementText": "HTML" + }, + { + "type": "navigation", + "timestamp": 1754866246395, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + } + ] +} \ No newline at end of file diff --git 
a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 238fcbb5..2465d8cc 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -DEFAULT_ACTION_TIMEOUT_MS = 1000 +DEFAULT_ACTION_TIMEOUT_MS = 2500 # List of default actions from browser_use.controller.service.Controller to disable # todo: come up with a better way to filter out the actions (filter IN the actions would be much nicer in this case) @@ -83,16 +83,101 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse page = await browser_session.get_current_page() original_selector = params.cssSelector + # If frameUrl or frameIdPath are provided, narrow the search to that frame + def _select_context(pg): + try: + from playwright.async_api import Page, Frame + ctx: Page | Frame = pg + # If frame hints point to top document, stay on page + fid = getattr(params, 'frameIdPath', None) + furl = getattr(params, 'frameUrl', None) + curr_url = (pg.url or '').split('#')[0] if hasattr(pg, 'url') else '' + if furl and curr_url and furl.split('#')[0] == curr_url: + return pg + if fid: + segs = [s for s in str(fid).split('.') if s != ''] + if all(s == '0' for s in segs): + return pg + f = pg.main_frame + for s in segs[1:]: # skip top marker + idx = int(s) + if 0 <= idx < len(f.child_frames): + f = f.child_frames[idx] + else: + return pg + return f + if furl: + from urllib.parse import urlparse + pf = urlparse(furl) + # If frameUrl equals current page URL (origin+path), stay on page + try: + from urllib.parse import urlparse as _u + cu = _u(curr_url) + if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path): + return pg + except Exception: + pass + for fr in pg.frames: + try: + ff = urlparse(fr.url) + if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl): + return fr + except Exception: + continue + except 
Exception: + return pg + return ctx + + # Fallback: search all frames for selector (prefer frames matching target origin) + async def _find_in_frames(pg, selector: str): + from urllib.parse import urlparse + prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or '' + pref_o = urlparse(prefer) if prefer else None + frames = list(pg.frames) + def score(fr): + if not pref_o: + return 0 + try: + fo = urlparse(fr.url) + return 2 if (fo.scheme, fo.netloc) == (pref_o.scheme, pref_o.netloc) else 0 + except Exception: + return 0 + frames.sort(key=score, reverse=True) + for fr in frames: + try: + loc, used = await get_best_element_handle(fr, selector, params, timeout_ms=max(800, DEFAULT_ACTION_TIMEOUT_MS // 2)) + return fr, loc, used + except Exception: + continue + return None, None, None + try: - locator, selector_used = await get_best_element_handle( - page, - params.cssSelector, - params, - timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, - ) + # Only auto-navigate for top-document clicks (no frame hints) when a different URL is declared + curr = (page.url or '').split('#')[0] + declared_url = (getattr(params, 'url', None) or '').split('#')[0] + has_frame_hints = bool(getattr(params, 'frameIdPath', None) or getattr(params, 'frameUrl', None)) + if declared_url and declared_url.startswith('http') and not has_frame_hints and curr != declared_url: + await page.goto(declared_url) + await page.wait_for_load_state() + + ctx = _select_context(page) + try: + locator, selector_used = await get_best_element_handle( + ctx, + params.cssSelector, + params, + timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, + ) + except Exception: + # Fallback: search all frames + fr, locator, selector_used = await _find_in_frames(page, params.cssSelector) + if locator is None: + raise + await locator.click(force=True) - msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(selector_used)} (original: {truncate_selector(original_selector)})' + used_str = selector_used if 
isinstance(selector_used, str) and selector_used else params.cssSelector + msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) except Exception as e: diff --git a/workflows/workflow_use/controller/views.py b/workflows/workflow_use/controller/views.py index 8c61470a..ffb25b4a 100644 --- a/workflows/workflow_use/controller/views.py +++ b/workflows/workflow_use/controller/views.py @@ -23,6 +23,8 @@ class RecorderBase(StepMeta): elementTag: Optional[str] = None elementText: Optional[str] = None frameUrl: Optional[str] = None + frameIdPath: Optional[str] = None + url: Optional[str] = None screenshot: Optional[str] = None diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py index fb154ba9..5de21556 100644 --- a/workflows/workflow_use/recorder/service.py +++ b/workflows/workflow_use/recorder/service.py @@ -85,7 +85,32 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str): async with self.final_workflow_processed_lock: if not self.final_workflow_processed_flag and self.last_workflow_update_event: print(f'[Service] Capturing final workflow (Trigger: {trigger_reason}).') - self.final_workflow_output = self.last_workflow_update_event.payload + wf = self.last_workflow_update_event.payload + # Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations + try: + clean_steps = [] + for s in wf.steps: + st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None) + url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None) + if st == 'navigation': + if not url or url == 'about:blank': + continue + from urllib.parse import urlparse + host = urlparse(url).hostname or '' + blocked = any( + pat in host for pat in ( + 'doubleclick.net', 'googlesyndication.com', 'googleadservices.com', + 
'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net', + 'googletagmanager.com', 'indexww.com', 'adtrafficquality.google' + ) + ) + if blocked: + continue + clean_steps.append(s) + wf.steps = clean_steps + except Exception as e: + print(f'[Service] Backend filter failed: {e}') + self.final_workflow_output = wf self.final_workflow_processed_flag = True processed_this_call = True diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py index aa7008c1..d21da2e8 100644 --- a/workflows/workflow_use/workflow/service.py +++ b/workflows/workflow_use/workflow/service.py @@ -144,11 +144,20 @@ def truncate_selector(selector: str) -> str: await self.browser._wait_for_stable_network() page = await self.browser.get_current_page() - logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') - locator, selector_used = await get_best_element_handle( - page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT - ) - logger.info(f'Element with selector found: {truncate_selector(selector_used)}') + # If the next step declares a URL/frameUrl and it does not match the current page URL, + # skip waiting for its element on the current page (prevents false failures like step 7). 
+ curr_url = (page.url or '').split('#')[0] + declared_next_url = (getattr(next_step_resolved, 'url', None) or getattr(next_step_resolved, 'frameUrl', None) or '').split('#')[0] + if declared_next_url and declared_next_url != curr_url: + logger.info( + f"Skipping pre-wait for next selector because declared next URL ({truncate_selector(declared_next_url)}) != current URL ({truncate_selector(curr_url)})" + ) + else: + logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') + locator, selector_used = await get_best_element_handle( + page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT + ) + logger.info(f'Element with selector found: {truncate_selector(selector_used)}') except Exception as e: logger.error(f'Failed to wait for element with selector: {truncate_selector(css_selector)}. Error: {e}') raise Exception(f'Failed to wait for element. Selector: {css_selector}') from e