From ab941a2089f31b623fa3930799705446cae1d6ff Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 03:00:16 +0530 Subject: [PATCH 1/7] Problems addressed: New-tab clicks (target=_blank / modifier clicks) stalled recording, and steps from the child tab were missing. Background/ad/tracker tabs polluted the logs. Each redirect/loading cycle produced excessive duplicate navigation events. Input steps exploded in number (hundreds of empty, unchanged values). Workflow updates were emitted even when the steps were unchanged. Changes: New Tab Intent Heuristic: the content script emits PREPARE_NEW_TAB on ctrl/cmd/middle click or target=_blank; the background script correlates the upcoming chrome.tabs.onCreated event to mark the new tab as userInitiated. Activated tabs are tracked; only activated or userInitiated tabs produce steps. Tab Filtering: all events (except activation) are suppressed for tabs that were never activated and are not correlated with a new-tab intent within the 4s window, which reduces noise from ads/trackers. Navigation Consolidation: lastNavigationIndexByTab is maintained so that the existing navigation step is updated instead of appending duplicates during rapid redirects or title/url churn. Input Event Deduplication: Content script: per-xpath cache; skip unchanged values; debounce; skip rapid empty repeats. Background: merge consecutive identical field edits; collapse bursts of empty values within 5s (timestamp refresh only); track lastInputPerKey (tabId|xpath) to decide between merging into the previous step and creating a new one. 
--- extension/src/entrypoints/background.ts | 222 ++++++++++++++---------- extension/src/entrypoints/content.ts | 44 +++++ 2 files changed, 176 insertions(+), 90 deletions(-) diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index 283a16f4..50c05309 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -30,6 +30,19 @@ export default defineBackground(() => { // Store tab information (URL, potentially title) const tabInfo: { [tabId: number]: { url?: string; title?: string } } = {}; + // Track which tabs have been explicitly activated (brought to foreground) by the user. + // We will ignore events originating from tabs that were never activated to reduce noise + // (for example: ad / tracker tabs that load in the background). + const activatedTabs = new Set(); + + // Track user clicks that are likely to open a new tab (Ctrl/Cmd + click, target=_blank etc.). + // Content scripts will send a PREPARE_NEW_TAB signal; we keep timestamp to correlate + // shortly following chrome.tabs.onCreated events so we can mark those tabs as user initiated. + const recentNewTabIntents: { [openerTabId: number]: number } = {}; + + // Heuristic window (ms) within which a created tab following a user intent is considered relevant. + const NEW_TAB_INTENT_WINDOW_MS = 4000; + let isRecordingEnabled = true; // Default to disabled (OFF) let lastWorkflowHash: string | null = null; // Cache for the last logged workflow hash @@ -144,6 +157,16 @@ export default defineBackground(() => { console.log(`Sending ${type}:`, payload); const tabId = payload.tabId; if (tabId) { + // Skip capturing events for tabs that have never been activated AND are not the original opener + // unless we have positively identified them as a recent user initiated tab (click intent -> creation). 
+ if ( + type !== "CUSTOM_TAB_ACTIVATED" && + !activatedTabs.has(tabId) && + !(payload.openerTabId && recentNewTabIntents[payload.openerTabId] && Date.now() - recentNewTabIntents[payload.openerTabId] < NEW_TAB_INTENT_WINDOW_MS) + ) { + // Silently ignore background noise (ad/tracker tabs) until user actually focuses them. + return; + } if (!sessionLogs[tabId]) { sessionLogs[tabId] = []; } @@ -171,6 +194,12 @@ export default defineBackground(() => { url: tab.pendingUrl || tab.url, windowId: tab.windowId, index: tab.index, + userInitiated: + !!( + tab.openerTabId && + recentNewTabIntents[tab.openerTabId] && + Date.now() - recentNewTabIntents[tab.openerTabId] < NEW_TAB_INTENT_WINDOW_MS + ), }); }); @@ -188,6 +217,7 @@ export default defineBackground(() => { }); chrome.tabs.onActivated.addListener((activeInfo) => { + activatedTabs.add(activeInfo.tabId); sendTabEvent("CUSTOM_TAB_ACTIVATED", { tabId: activeInfo.tabId, windowId: activeInfo.windowId, @@ -214,64 +244,90 @@ export default defineBackground(() => { function convertStoredEventsToSteps(events: StoredEvent[]): Step[] { const steps: Step[] = []; + const lastNavigationIndexByTab: Record = {}; + const lastInputPerKey: Record = {}; for (const event of events) { switch (event.messageType) { - case "CUSTOM_CLICK_EVENT": { - const clickEvent = event as StoredCustomClickEvent; - // Ensure required fields are present, even if optional in source type for some reason + case "CUSTOM_TAB_CREATED": + case "CUSTOM_TAB_UPDATED": + case "CUSTOM_TAB_ACTIVATED": { + const navUrl = (event as any).url || (event as any).changeInfo?.url; + if (!navUrl) break; + const tabId = (event as any).tabId; + const userInitiated = (event as any).userInitiated; + if (!activatedTabs.has(tabId) && !userInitiated) break; // suppress background noise + + const existingIdx = lastNavigationIndexByTab[tabId]; if ( - clickEvent.url && - clickEvent.frameUrl && - clickEvent.xpath && - clickEvent.elementTag + existingIdx !== undefined && + steps[existingIdx] 
&& + steps[existingIdx].type === "navigation" ) { + // Update existing navigation (redirect / title change) + (steps[existingIdx] as NavigationStep).url = navUrl; + steps[existingIdx].timestamp = event.timestamp; + } else { + const nav: NavigationStep = { + type: "navigation", + timestamp: event.timestamp, + tabId, + url: navUrl, + }; + steps.push(nav); + lastNavigationIndexByTab[tabId] = steps.length - 1; + } + break; + } + case "CUSTOM_CLICK_EVENT": { + const click = event as StoredCustomClickEvent; + if (click.url && click.xpath && click.elementTag) { const step: ClickStep = { type: "click", - timestamp: clickEvent.timestamp, - tabId: clickEvent.tabId, - url: clickEvent.url, - frameUrl: clickEvent.frameUrl, - xpath: clickEvent.xpath, - cssSelector: clickEvent.cssSelector, - elementTag: clickEvent.elementTag, - elementText: clickEvent.elementText, - screenshot: clickEvent.screenshot, + timestamp: click.timestamp, + tabId: click.tabId, + url: click.url, + frameUrl: click.frameUrl, + xpath: click.xpath, + cssSelector: click.cssSelector, + elementTag: click.elementTag, + elementText: click.elementText, + screenshot: click.screenshot, }; steps.push(step); } else { - console.warn("Skipping incomplete CUSTOM_CLICK_EVENT:", clickEvent); + console.warn("Skipping incomplete CUSTOM_CLICK_EVENT", click); } break; } - case "CUSTOM_INPUT_EVENT": { const inputEvent = event as StoredCustomInputEvent; - if ( - inputEvent.url && - // inputEvent.frameUrl && // frameUrl might be null/undefined in some cases, let's allow merging if only one is present or both match - inputEvent.xpath && - inputEvent.elementTag - ) { + if (inputEvent.url && inputEvent.xpath && inputEvent.elementTag) { + const key = `${inputEvent.tabId}|${inputEvent.xpath}`; + const prior = lastInputPerKey[key]; + const nowTs = inputEvent.timestamp; + const isEmpty = (inputEvent as any).value === ""; + if (isEmpty && prior && prior.value === "" && nowTs - prior.ts < 5000) { + // collapse rapid-fire repeated empties + 
steps[prior.idx].timestamp = nowTs; + break; + } const lastStep = steps.length > 0 ? steps[steps.length - 1] : null; - - // Check if the last step was a mergeable input event if ( lastStep && lastStep.type === "input" && lastStep.tabId === inputEvent.tabId && lastStep.url === inputEvent.url && - lastStep.frameUrl === inputEvent.frameUrl && // Ensure frameUrls match if both exist + lastStep.frameUrl === inputEvent.frameUrl && lastStep.xpath === inputEvent.xpath && lastStep.cssSelector === inputEvent.cssSelector && lastStep.elementTag === inputEvent.elementTag ) { - // Update the last input step (lastStep as InputStep).value = inputEvent.value; - lastStep.timestamp = inputEvent.timestamp; // Update to latest timestamp - (lastStep as InputStep).screenshot = inputEvent.screenshot; // Update to latest screenshot + lastStep.timestamp = inputEvent.timestamp; + (lastStep as InputStep).screenshot = inputEvent.screenshot; + lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } else { - // Add a new input step const newStep: InputStep = { type: "input", timestamp: inputEvent.timestamp, @@ -285,24 +341,22 @@ export default defineBackground(() => { screenshot: inputEvent.screenshot, }; steps.push(newStep); + lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } } else { - console.warn("Skipping incomplete CUSTOM_INPUT_EVENT:", inputEvent); + console.warn("Skipping incomplete CUSTOM_INPUT_EVENT", inputEvent); } break; } - case "CUSTOM_KEY_EVENT": { const keyEvent = event as StoredCustomKeyEvent; - // Key press might not always have a target element (xpath, etc.) 
- // but needs at least url and key if (keyEvent.url && keyEvent.key) { const step: KeyPressStep = { type: "key_press", timestamp: keyEvent.timestamp, tabId: keyEvent.tabId, url: keyEvent.url, - frameUrl: keyEvent.frameUrl, // Can be missing + frameUrl: keyEvent.frameUrl, key: keyEvent.key, xpath: keyEvent.xpath, cssSelector: keyEvent.cssSelector, @@ -311,77 +365,56 @@ export default defineBackground(() => { }; steps.push(step); } else { - console.warn("Skipping incomplete CUSTOM_KEY_EVENT:", keyEvent); + console.warn("Skipping incomplete CUSTOM_KEY_EVENT", keyEvent); } break; } - case "RRWEB_EVENT": { - // We only care about scroll events from rrweb for now const rrEvent = event as StoredRrwebEvent; - if ( - rrEvent.type === EventType.IncrementalSnapshot && - rrEvent.data.source === IncrementalSource.Scroll - ) { - const scrollData = rrEvent.data as { - id: number; - x: number; - y: number; - }; // Type assertion for clarity - const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL - - // Check if the last step added was a mergeable scroll event - const lastStep = steps.length > 0 ? steps[steps.length - 1] : null; if ( - lastStep && - lastStep.type === "scroll" && - lastStep.tabId === rrEvent.tabId && - (lastStep as ScrollStep).targetId === scrollData.id + rrEvent.type === EventType.IncrementalSnapshot && + rrEvent.data.source === IncrementalSource.Scroll ) { - // Update the last scroll step - (lastStep as ScrollStep).scrollX = scrollData.x; - (lastStep as ScrollStep).scrollY = scrollData.y; - lastStep.timestamp = rrEvent.timestamp; // Update to latest timestamp - // URL should already be set from the first event in the sequence - } else { - // Add a new scroll step - const newStep: ScrollStep = { - type: "scroll", + const scrollData = rrEvent.data as { id: number; x: number; y: number }; + const currentTabInfo = tabInfo[rrEvent.tabId]; + const lastStep = steps.length > 0 ? 
steps[steps.length - 1] : null; + if ( + lastStep && + lastStep.type === "scroll" && + lastStep.tabId === rrEvent.tabId && + (lastStep as ScrollStep).targetId === scrollData.id + ) { + (lastStep as ScrollStep).scrollX = scrollData.x; + (lastStep as ScrollStep).scrollY = scrollData.y; + lastStep.timestamp = rrEvent.timestamp; + } else { + const scrollStep: ScrollStep = { + type: "scroll", + timestamp: rrEvent.timestamp, + tabId: rrEvent.tabId, + targetId: scrollData.id, + scrollX: scrollData.x, + scrollY: scrollData.y, + url: currentTabInfo?.url, + }; + steps.push(scrollStep); + } + } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { + const metaData = rrEvent.data as { href: string }; + const nav: NavigationStep = { + type: "navigation", timestamp: rrEvent.timestamp, tabId: rrEvent.tabId, - targetId: scrollData.id, - scrollX: scrollData.x, - scrollY: scrollData.y, - url: currentTabInfo?.url, // Add URL if available + url: metaData.href, }; - steps.push(newStep); + steps.push(nav); } - } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { - // Also handle rrweb meta events as navigation - const metaData = rrEvent.data as { href: string }; - const step: NavigationStep = { - type: "navigation", - timestamp: rrEvent.timestamp, - tabId: rrEvent.tabId, - url: metaData.href, - }; - steps.push(step); - } break; } - - // Add cases for other StoredEvent types to Step types if needed - // e.g., CUSTOM_SELECT_EVENT -> SelectStep - // e.g., CUSTOM_TAB_CREATED -> TabCreatedStep - // RRWEB_EVENT type 4 (Meta) or 3 (FullSnapshot) could potentially map to NavigationStep if needed. - default: - // Ignore other event types for now - // console.log("Ignoring event type:", event.messageType); break; } } - return steps; } @@ -396,6 +429,8 @@ export default defineBackground(() => { "CUSTOM_INPUT_EVENT", "CUSTOM_SELECT_EVENT", "CUSTOM_KEY_EVENT", + // Synthetic event we will emit from content script just before an expected new tab open. 
+ "PREPARE_NEW_TAB", ]; if ( message.type === "RRWEB_EVENT" || @@ -412,6 +447,13 @@ export default defineBackground(() => { const tabId = sender.tab.id; const isCustomEvent = customEventTypes.includes(message.type); + // Record intent for new tab opening to correlate with onCreated event. + if (message.type === "PREPARE_NEW_TAB") { + recentNewTabIntents[sender.tab.id] = Date.now(); + // We do not store this as a workflow step; it's only heuristic metadata. + return false; + } + // Function to store the event const storeEvent = (eventPayload: any, screenshotDataUrl?: string) => { if (!sessionLogs[tabId]) { diff --git a/extension/src/entrypoints/content.ts b/extension/src/entrypoints/content.ts index 8f7f233f..a1a60231 100644 --- a/extension/src/entrypoints/content.ts +++ b/extension/src/entrypoints/content.ts @@ -240,6 +240,25 @@ function handleCustomClick(event: MouseEvent) { const targetElement = event.target as HTMLElement; if (!targetElement) return; + try { + // Detect if this click is likely to open a new tab so background can correlate upcoming tab creation. + // Heuristics: modifier key (Ctrl/Cmd / middle button) OR anchor with target=_blank / rel noopener. 
+ const isMiddle = event.button === 1; + const isModifier = event.metaKey || event.ctrlKey; // Cmd (mac) or Ctrl (win/linux) + let anchorOpensNew = false; + if (targetElement instanceof HTMLAnchorElement) { + const a = targetElement as HTMLAnchorElement; + anchorOpensNew = + (a.target && a.target.toLowerCase() === "_blank") || + a.rel.split(/\s+/).some((r) => r.toLowerCase() === "noopener" || r.toLowerCase() === "noreferrer"); + } + if (isMiddle || isModifier || anchorOpensNew) { + chrome.runtime.sendMessage({ type: "PREPARE_NEW_TAB", payload: { reason: "potential_new_tab_click" } }); + } + } catch (e) { + console.warn("Failed to evaluate new tab heuristic", e); + } + try { const xpath = getXPath(targetElement); const clickData = { @@ -263,12 +282,17 @@ function handleCustomClick(event: MouseEvent) { // --- End Custom Click Handler --- // --- Custom Input Handler --- +// Maintain last recorded value & timestamp per element (keyed by xpath) to suppress noisy repeats +const lastInputRecord: Record = {}; function handleInput(event: Event) { if (!isRecordingActive) return; const targetElement = event.target as HTMLInputElement | HTMLTextAreaElement; if (!targetElement || !("value" in targetElement)) return; const isPassword = targetElement.type === "password"; + // Ignore programmatic (non user-trusted) input events – these often cause massive duplication + if (!(event as InputEvent).isTrusted) return; + try { const xpath = getXPath(targetElement); const inputData = { @@ -280,6 +304,26 @@ function handleInput(event: Event) { elementTag: targetElement.tagName, value: isPassword ? 
"********" : targetElement.value, }; + + // Dedupe rule 1: If value unchanged for this element and within debounce window, skip + const DEBOUNCE_MS_INPUT = 1500; + const prev = lastInputRecord[xpath]; + if (prev && prev.value === inputData.value && inputData.timestamp - prev.ts < DEBOUNCE_MS_INPUT) { + return; // Suppress noisy duplicate + } + + // Dedupe rule 2: If value is empty string and we already recorded empty in last 5s, suppress further empties + if ( + inputData.value === "" && + prev && + prev.value === "" && + inputData.timestamp - prev.ts < 5000 + ) { + return; + } + + // Store/update last record metadata + lastInputRecord[xpath] = { value: inputData.value, ts: inputData.timestamp }; console.log("Sending CUSTOM_INPUT_EVENT:", inputData); chrome.runtime.sendMessage({ type: "CUSTOM_INPUT_EVENT", From c9d6dec933e45cb2a5dec4330cfaa9379b48f601 Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 03:11:20 +0530 Subject: [PATCH 2/7] Does not record duplicate steps just because of different mouse positions --- extension/src/entrypoints/background.ts | 155 ++++++++++++++++++------ 1 file changed, 119 insertions(+), 36 deletions(-) diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index 50c05309..ad039b04 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -79,13 +79,72 @@ export default defineBackground(() => { // Function to broadcast workflow data updates to the console bus async function broadcastWorkflowDataUpdate(): Promise { // console.log("[DEBUG] broadcastWorkflowDataUpdate: Entered function"); // Optional: Keep for debugging - const allSteps: Step[] = Object.keys(sessionLogs) + const rawSteps: Step[] = Object.keys(sessionLogs) .flatMap((tabIdStr) => { const tabId = parseInt(tabIdStr, 10); return convertStoredEventsToSteps(sessionLogs[tabId] || []); }) .sort((a, b) => a.timestamp - b.timestamp); // Sort chronologically + // Post-process to 
collapse consecutive duplicates that only differ by timestamp (e.g. repeated identical navigations) + const allSteps: Step[] = []; + for (const step of rawSteps) { + const last = allSteps.length ? allSteps[allSteps.length - 1] : null; + if (!last) { + allSteps.push(step); + continue; + } + let isDuplicate = false; + if (last.type === step.type) { + switch (step.type) { + case 'navigation': + isDuplicate = (last as NavigationStep).url === (step as NavigationStep).url && last.tabId === step.tabId; + break; + case 'input': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).frameUrl === (step as any).frameUrl && + (last as any).xpath === (step as any).xpath && + (last as any).elementTag === (step as any).elementTag && + (last as any).value === (step as any).value; + break; + case 'click': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).frameUrl === (step as any).frameUrl && + (last as any).xpath === (step as any).xpath && + (last as any).elementTag === (step as any).elementTag && + (last as any).elementText === (step as any).elementText; + break; + case 'scroll': + isDuplicate = + last.tabId === step.tabId && + (last as any).targetId === (step as any).targetId && + (last as any).scrollX === (step as any).scrollX && + (last as any).scrollY === (step as any).scrollY; + break; + case 'key_press': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).key === (step as any).key && + (last as any).xpath === (step as any).xpath; + break; + } + } + if (isDuplicate) { + // Update timestamp (and screenshot if present) to most recent but don't add new step + last.timestamp = step.timestamp; + if ((step as any).screenshot) { + (last as any).screenshot = (step as any).screenshot; + } + } else { + allSteps.push(step); + } + } + // Create the workflowData object *after* sorting steps, but hash only steps const 
workflowData: Workflow = { name: "Recorded Workflow", @@ -344,19 +403,22 @@ export default defineBackground(() => { lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } } else { - console.warn("Skipping incomplete CUSTOM_INPUT_EVENT", inputEvent); + console.warn("Skipping incomplete CUSTOM_INPUT_EVENT:", inputEvent); } break; } + case "CUSTOM_KEY_EVENT": { const keyEvent = event as StoredCustomKeyEvent; + // Key press might not always have a target element (xpath, etc.) + // but needs at least url and key if (keyEvent.url && keyEvent.key) { const step: KeyPressStep = { type: "key_press", timestamp: keyEvent.timestamp, tabId: keyEvent.tabId, url: keyEvent.url, - frameUrl: keyEvent.frameUrl, + frameUrl: keyEvent.frameUrl, // Can be missing key: keyEvent.key, xpath: keyEvent.xpath, cssSelector: keyEvent.cssSelector, @@ -365,56 +427,77 @@ export default defineBackground(() => { }; steps.push(step); } else { - console.warn("Skipping incomplete CUSTOM_KEY_EVENT", keyEvent); + console.warn("Skipping incomplete CUSTOM_KEY_EVENT:", keyEvent); } break; } + case "RRWEB_EVENT": { + // We only care about scroll events from rrweb for now const rrEvent = event as StoredRrwebEvent; + if ( + rrEvent.type === EventType.IncrementalSnapshot && + rrEvent.data.source === IncrementalSource.Scroll + ) { + const scrollData = rrEvent.data as { + id: number; + x: number; + y: number; + }; // Type assertion for clarity + const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL + + // Check if the last step added was a mergeable scroll event + const lastStep = steps.length > 0 ? 
steps[steps.length - 1] : null; if ( - rrEvent.type === EventType.IncrementalSnapshot && - rrEvent.data.source === IncrementalSource.Scroll + lastStep && + lastStep.type === "scroll" && + lastStep.tabId === rrEvent.tabId && + (lastStep as ScrollStep).targetId === scrollData.id ) { - const scrollData = rrEvent.data as { id: number; x: number; y: number }; - const currentTabInfo = tabInfo[rrEvent.tabId]; - const lastStep = steps.length > 0 ? steps[steps.length - 1] : null; - if ( - lastStep && - lastStep.type === "scroll" && - lastStep.tabId === rrEvent.tabId && - (lastStep as ScrollStep).targetId === scrollData.id - ) { - (lastStep as ScrollStep).scrollX = scrollData.x; - (lastStep as ScrollStep).scrollY = scrollData.y; - lastStep.timestamp = rrEvent.timestamp; - } else { - const scrollStep: ScrollStep = { - type: "scroll", - timestamp: rrEvent.timestamp, - tabId: rrEvent.tabId, - targetId: scrollData.id, - scrollX: scrollData.x, - scrollY: scrollData.y, - url: currentTabInfo?.url, - }; - steps.push(scrollStep); - } - } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { - const metaData = rrEvent.data as { href: string }; - const nav: NavigationStep = { - type: "navigation", + // Update the last scroll step + (lastStep as ScrollStep).scrollX = scrollData.x; + (lastStep as ScrollStep).scrollY = scrollData.y; + lastStep.timestamp = rrEvent.timestamp; // Update to latest timestamp + // URL should already be set from the first event in the sequence + } else { + // Add a new scroll step + const newStep: ScrollStep = { + type: "scroll", timestamp: rrEvent.timestamp, tabId: rrEvent.tabId, - url: metaData.href, + targetId: scrollData.id, + scrollX: scrollData.x, + scrollY: scrollData.y, + url: currentTabInfo?.url, // Add URL if available }; - steps.push(nav); + steps.push(newStep); } + } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { + // Also handle rrweb meta events as navigation + const metaData = rrEvent.data as { href: string }; + 
const step: NavigationStep = { + type: "navigation", + timestamp: rrEvent.timestamp, + tabId: rrEvent.tabId, + url: metaData.href, + }; + steps.push(step); + } break; } + + // Add cases for other StoredEvent types to Step types if needed + // e.g., CUSTOM_SELECT_EVENT -> SelectStep + // e.g., CUSTOM_TAB_CREATED -> TabCreatedStep + // RRWEB_EVENT type 4 (Meta) or 3 (FullSnapshot) could potentially map to NavigationStep if needed. + default: + // Ignore other event types for now + // console.log("Ignoring event type:", event.messageType); break; } } + return steps; } From cd9ebc7d7538ab9e8e7ad682c5be116e8c56a11e Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 04:05:48 +0530 Subject: [PATCH 3/7] works but ui needs to change --- extension/src/entrypoints/background.ts | 79 ++++++++++++++++ extension/src/entrypoints/content.ts | 94 ++++++++++++------- extension/src/entrypoints/options.html | 80 ++++++++++++++++ .../sidepanel/components/recording-view.tsx | 40 +++++++- extension/src/lib/types.ts | 5 + extension/src/lib/workflow-types.ts | 5 + extension/wxt.config.ts | 12 ++- workflows/workflow_use/recorder/service.py | 27 +++++- 8 files changed, 298 insertions(+), 44 deletions(-) create mode 100644 extension/src/entrypoints/options.html diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index ad039b04..6e1ae033 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -39,6 +39,24 @@ export default defineBackground(() => { // Content scripts will send a PREPARE_NEW_TAB signal; we keep timestamp to correlate // shortly following chrome.tabs.onCreated events so we can mark those tabs as user initiated. 
const recentNewTabIntents: { [openerTabId: number]: number } = {}; + // Record iframe URLs that the user actually interacted with (via custom events) per tab + const interactedFrameUrls: Record> = {}; + // Additionally track last interaction time per frame for time-window gating + const interactedFrameTimes: Record> = {}; + // Hostname patterns for iframe navigation noise we want to suppress + const BLOCKED_IFRAME_HOST_PATTERNS: RegExp[] = [ + /doubleclick\.net$/i, + /googlesyndication\.com$/i, + /googleadservices\.com$/i, + /amazon-adsystem\.com$/i, + /recaptcha\.google\.com$/i, + /recaptcha\.net$/i, + /googletagmanager\.com$/i, + /indexww\.com$/i, + /adtrafficquality\.google$/i, + /2mdn\.net$/i, + /gstaticadssl\.googleapis\.com$/i, + ]; // Heuristic window (ms) within which a created tab following a user intent is considered relevant. const NEW_TAB_INTENT_WINDOW_MS = 4000; @@ -301,6 +319,25 @@ export default defineBackground(() => { // --- Conversion Function --- + const DEFAULT_SETTINGS = { + enableIframes: true as boolean, + iframeWindow: 3000 as number, + blocklist: [ + 'doubleclick.net','googlesyndication.com','googleadservices.com', + 'amazon-adsystem.com','2mdn.net','recaptcha.google.com','recaptcha.net', + 'googletagmanager.com','indexww.com','adtrafficquality.google' + ] as string[], + allowlist: [] as string[], + }; + let settings: { enableIframes: boolean; iframeWindow: number; blocklist: string[]; allowlist: string[] } = { ...DEFAULT_SETTINGS }; + chrome.storage.sync.get(DEFAULT_SETTINGS, (s: any) => { settings = { ...settings, ...s }; }); + chrome.storage.onChanged.addListener((changes, area) => { + if (area !== 'sync') return; + const next = { ...settings } as any; + for (const k of Object.keys(changes)) (next as any)[k] = (changes as any)[k].newValue; + settings = next; + }); + function convertStoredEventsToSteps(events: StoredEvent[]): Step[] { const steps: Step[] = []; const lastNavigationIndexByTab: Record = {}; @@ -347,6 +384,7 @@ export default 
defineBackground(() => { tabId: click.tabId, url: click.url, frameUrl: click.frameUrl, + frameIdPath: (click as any).frameIdPath, xpath: click.xpath, cssSelector: click.cssSelector, elementTag: click.elementTag, @@ -393,6 +431,7 @@ export default defineBackground(() => { tabId: inputEvent.tabId, url: inputEvent.url, frameUrl: inputEvent.frameUrl, + frameIdPath: (inputEvent as any).frameIdPath, xpath: inputEvent.xpath, cssSelector: inputEvent.cssSelector, elementTag: inputEvent.elementTag, @@ -419,6 +458,7 @@ export default defineBackground(() => { tabId: keyEvent.tabId, url: keyEvent.url, frameUrl: keyEvent.frameUrl, // Can be missing + frameIdPath: (keyEvent as any).frameIdPath, key: keyEvent.key, xpath: keyEvent.xpath, cssSelector: keyEvent.cssSelector, @@ -475,11 +515,43 @@ export default defineBackground(() => { } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { // Also handle rrweb meta events as navigation const metaData = rrEvent.data as { href: string }; + const href = metaData.href; + // Drop about:blank always + if (href === 'about:blank') { + break; + } + try { + const urlObj = new URL(href); + const host = urlObj.hostname; + // Allowlist overrides blocklist + const inAllow = settings.allowlist.some(d => host.endsWith(d)); + const inBlock = settings.blocklist.some(d => host.endsWith(d)); + if (!inAllow && inBlock) { + break; + } + if (!settings.enableIframes && !(rrEvent as any).isTopFrame) { + break; // user disabled iframe recording + } + // If top frame, allow + if ((rrEvent as any).isTopFrame) { + // allowed + } else { + const fUrl = (rrEvent as any).frameUrl as string | undefined; + if (!fUrl) break; + const times = interactedFrameTimes[rrEvent.tabId] || {}; + const lastTs = times[fUrl]; + if (!lastTs) break; + if (Date.now() - lastTs > settings.iframeWindow) break; + } + } catch { + break; + } const step: NavigationStep = { type: "navigation", timestamp: rrEvent.timestamp, tabId: rrEvent.tabId, url: metaData.href, + // frameIdPath 
could be attached if needed }; steps.push(step); } @@ -559,6 +631,13 @@ export default defineBackground(() => { screenshot: screenshotDataUrl, }; sessionLogs[tabId].push(eventWithMeta); + // Mark frame as interacted so subsequent iframe meta navigations can be allowed + if (message.type.startsWith("CUSTOM_") && eventPayload.frameUrl) { + if (!interactedFrameUrls[tabId]) interactedFrameUrls[tabId] = new Set(); + interactedFrameUrls[tabId].add(eventPayload.frameUrl); + if (!interactedFrameTimes[tabId]) interactedFrameTimes[tabId] = {}; + interactedFrameTimes[tabId][eventPayload.frameUrl] = Date.now(); + } broadcastWorkflowDataUpdate(); // Call is async, will not block // console.log(`Stored ${message.type} from tab ${tabId}`); }; diff --git a/extension/src/entrypoints/content.ts b/extension/src/entrypoints/content.ts index a1a60231..a646562f 100644 --- a/extension/src/entrypoints/content.ts +++ b/extension/src/entrypoints/content.ts @@ -122,6 +122,16 @@ function startRecorder() { emit(event) { if (!isRecordingActive) return; + const frameUrl = window.location.href; + const isTopFrame = window.self === window.top; + const frameIdPath = (() => { + try { + let win: any = window; const parts: number[] = []; + while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i10) break; } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); + // Handle scroll events with debouncing and direction detection if ( event.type === EventType.IncrementalSnapshot && @@ -157,7 +167,10 @@ function startRecorder() { type: "RRWEB_EVENT", payload: { ...event, - data: roundedScrollData, // Use rounded coordinates + data: roundedScrollData, + frameUrl, + frameIdPath, + isTopFrame, }, }); lastDirection = currentDirection; @@ -178,15 +191,18 @@ function startRecorder() { type: "RRWEB_EVENT", payload: { ...event, - data: roundedScrollData, // Use rounded coordinates + data: roundedScrollData, + frameUrl, + frameIdPath, + isTopFrame, }, }); scrollTimeout = null; lastDirection = null; // Reset direction for next scroll }, DEBOUNCE_MS); } else { - // Pass through non-scroll events unchanged - chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: event }); + // Pass through non-scroll events unchanged, but include frame context for filtering in background + chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: { ...event, frameUrl, frameIdPath, isTopFrame } }); } }, maskInputOptions: { @@ -239,45 +255,38 @@ function handleCustomClick(event: MouseEvent) { if (!isRecordingActive) return; const targetElement = event.target as HTMLElement; if (!targetElement) return; - - try { - // Detect if this click is likely to open a new tab so background can correlate upcoming tab creation. - // Heuristics: modifier key (Ctrl/Cmd / middle button) OR anchor with target=_blank / rel noopener. 
- const isMiddle = event.button === 1; - const isModifier = event.metaKey || event.ctrlKey; // Cmd (mac) or Ctrl (win/linux) - let anchorOpensNew = false; - if (targetElement instanceof HTMLAnchorElement) { - const a = targetElement as HTMLAnchorElement; - anchorOpensNew = - (a.target && a.target.toLowerCase() === "_blank") || - a.rel.split(/\s+/).some((r) => r.toLowerCase() === "noopener" || r.toLowerCase() === "noreferrer"); - } - if (isMiddle || isModifier || anchorOpensNew) { - chrome.runtime.sendMessage({ type: "PREPARE_NEW_TAB", payload: { reason: "potential_new_tab_click" } }); - } - } catch (e) { - console.warn("Failed to evaluate new tab heuristic", e); - } - + // Determine a frame identifier (best-effort). Top frame = 0, nested frames build path. + const frameIdPath = (() => { + try { + let win: any = window; + const parts: number[] = []; + while (win !== win.parent) { + const parent = win.parent; + let index = 0; + for (let i = 0; i < parent.frames.length; i++) { + if (parent.frames[i] === win) { index = i; break; } + } + parts.unshift(index); + win = parent; + if (parts.length > 10) break; // safety + } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); try { const xpath = getXPath(targetElement); const clickData = { timestamp: Date.now(), - url: document.location.href, // Use document.location for main page URL - frameUrl: window.location.href, // URL of the frame where the event occurred - xpath: xpath, + url: document.location.href, + frameUrl: window.location.href, + frameIdPath, + xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, elementText: targetElement.textContent?.trim().slice(0, 200) || "", }; - console.log("Sending CUSTOM_CLICK_EVENT:", clickData); - chrome.runtime.sendMessage({ - type: "CUSTOM_CLICK_EVENT", - payload: clickData, - }); - } catch (error) { - console.error("Error capturing click data:", error); - } + chrome.runtime.sendMessage({ type: "CUSTOM_CLICK_EVENT", payload: clickData }); + } catch (error) { console.error("Error capturing click data:", error); } } // --- End Custom Click Handler --- @@ -293,12 +302,20 @@ function handleInput(event: Event) { // Ignore programmatic (non user-trusted) input events – these often cause massive duplication if (!(event as InputEvent).isTrusted) return; + const frameIdPath = (() => { + try { + let win: any = window; const parts: number[] = []; + while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i10) break; } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); try { const xpath = getXPath(targetElement); const inputData = { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, xpath: xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, @@ -341,6 +358,7 @@ function handleSelectChange(event: Event) { const targetElement = event.target as HTMLSelectElement; // Ensure it's a select element if (!targetElement || targetElement.tagName !== "SELECT") return; + const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })(); try { const xpath = getXPath(targetElement); @@ -349,6 +367,7 @@ function handleSelectChange(event: Event) { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, xpath: xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, @@ -421,11 +440,13 @@ function handleKeydown(event: KeyboardEvent) { } } + const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })(); try { const keyData = { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, key: keyToLog, // The key or combination pressed xpath: xpath, // XPath of the element in focus (if any) cssSelector: cssSelector, // CSS selector of the element in focus (if any) @@ -588,6 +609,9 @@ function handleBlur(event: FocusEvent) { export default defineContentScript({ matches: [""], + // Ensure injection into all frames (iframes) so we can capture interactions inside nested documents. 
+ allFrames: true, + matchAboutBlank: true, main(ctx) { // Listener for status updates from the background script chrome.runtime.onMessage.addListener((message, sender, sendResponse) => { diff --git a/extension/src/entrypoints/options.html b/extension/src/entrypoints/options.html new file mode 100644 index 00000000..dfb7bf49 --- /dev/null +++ b/extension/src/entrypoints/options.html @@ -0,0 +1,80 @@ + + + + + Workflow Use - Options + + + +

Recording Settings

+
+
+ +
When disabled, iframe-originated navigation/meta events are ignored.
+
+
+ + +
Time after a user interaction in an iframe during which rrweb meta navigations are allowed.
+
+
+ + +
+
+ + +
+ + +
+ + + + diff --git a/extension/src/entrypoints/sidepanel/components/recording-view.tsx b/extension/src/entrypoints/sidepanel/components/recording-view.tsx index 0fa3456a..0aa09f33 100644 --- a/extension/src/entrypoints/sidepanel/components/recording-view.tsx +++ b/extension/src/entrypoints/sidepanel/components/recording-view.tsx @@ -1,12 +1,25 @@ import React from "react"; import { useWorkflow } from "../context/workflow-provider"; import { Button } from "@/components/ui/button"; -import { EventViewer } from "./event-viewer"; // Import EventViewer +import { EventViewer } from "./event-viewer"; export const RecordingView: React.FC = () => { const { stopRecording, workflow } = useWorkflow(); const stepCount = workflow?.steps?.length || 0; + const openOptions = () => chrome.runtime.openOptionsPage(); + const [enableIframes, setEnableIframes] = React.useState(true); + React.useEffect(() => { + chrome.storage.sync.get({ enableIframes: true }, (s) => + setEnableIframes(!!s.enableIframes) + ); + }, []); + const toggleIframes = async () => { + const next = !enableIframes; + setEnableIframes(next); + await chrome.storage.sync.set({ enableIframes: next }); + }; + return (
@@ -19,12 +32,29 @@ export const RecordingView: React.FC = () => { Recording ({stepCount} steps)
- +
+ + + +
- {/* EventViewer will now take full available space within this div */}
diff --git a/extension/src/lib/types.ts b/extension/src/lib/types.ts index 601d3aac..8cc94232 100644 --- a/extension/src/lib/types.ts +++ b/extension/src/lib/types.ts @@ -2,6 +2,7 @@ export interface StoredCustomClickEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -15,6 +16,7 @@ export interface StoredCustomInputEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -28,6 +30,7 @@ export interface StoredCustomSelectEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -42,6 +45,7 @@ export interface StoredCustomKeyEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; key: string; xpath?: string; // XPath of focused element cssSelector?: string; @@ -73,6 +77,7 @@ export interface StoredRrwebEvent { data: any; timestamp: number; tabId: number; + frameUrl?: string; // URL of the frame where the rrweb event originated (when injected per-frame) messageType: "RRWEB_EVENT"; } diff --git a/extension/src/lib/workflow-types.ts b/extension/src/lib/workflow-types.ts index 57d4470f..35605bd6 100644 --- a/extension/src/lib/workflow-types.ts +++ b/extension/src/lib/workflow-types.ts @@ -27,12 +27,14 @@ export interface NavigationStep extends BaseStep { type: "navigation"; url: string; // Navigation implies a URL change screenshot?: string; // Optional in source + frameIdPath?: string; // Optional frame context } export interface ClickStep extends BaseStep { type: "click"; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; // Optional in source elementTag: string; @@ -44,6 +46,7 @@ export interface InputStep extends BaseStep { type: "input"; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; // Optional in 
source elementTag: string; @@ -55,6 +58,7 @@ export interface KeyPressStep extends BaseStep { type: "key_press"; url?: string; // Can be missing if key press happens without element focus? Source is optional. frameUrl?: string; // Might be missing + frameIdPath?: string; key: string; xpath?: string; // Optional in source cssSelector?: string; // Optional in source @@ -67,6 +71,7 @@ export interface ScrollStep extends BaseStep { targetId: number; // The rrweb ID of the element being scrolled scrollX: number; scrollY: number; + frameIdPath?: string; // Note: url might be missing if scroll happens on initial load before meta event? } diff --git a/extension/wxt.config.ts b/extension/wxt.config.ts index 9d7ae4ec..83bf64d8 100644 --- a/extension/wxt.config.ts +++ b/extension/wxt.config.ts @@ -13,9 +13,15 @@ export default defineConfig({ // WXT-specific overrides (optional) }), manifest: { - permissions: ["tabs", "sidePanel", ""], - host_permissions: ["http://127.0.0.1/*"], - // options_page: "options.html", + permissions: ["tabs", "sidePanel", "storage", ""], + // Broaden host permissions so content script can inject into iframes on external sites. + // Note: in permissions allows some access, but host_permissions explicitly grants injection rights. 
+ host_permissions: [ + "http://127.0.0.1/*", + "https://*/*", + "http://*/*" + ], + options_page: "options.html", // action: { // default_popup: "popup.html", // }, diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py index fb154ba9..5de21556 100644 --- a/workflows/workflow_use/recorder/service.py +++ b/workflows/workflow_use/recorder/service.py @@ -85,7 +85,32 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str): async with self.final_workflow_processed_lock: if not self.final_workflow_processed_flag and self.last_workflow_update_event: print(f'[Service] Capturing final workflow (Trigger: {trigger_reason}).') - self.final_workflow_output = self.last_workflow_update_event.payload + wf = self.last_workflow_update_event.payload + # Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations + try: + clean_steps = [] + for s in wf.steps: + st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None) + url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None) + if st == 'navigation': + if not url or url == 'about:blank': + continue + from urllib.parse import urlparse + host = urlparse(url).hostname or '' + blocked = any( + pat in host for pat in ( + 'doubleclick.net', 'googlesyndication.com', 'googleadservices.com', + 'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net', + 'googletagmanager.com', 'indexww.com', 'adtrafficquality.google' + ) + ) + if blocked: + continue + clean_steps.append(s) + wf.steps = clean_steps + except Exception as e: + print(f'[Service] Backend filter failed: {e}') + self.final_workflow_output = wf self.final_workflow_processed_flag = True processed_this_call = True From cb846efe170a7c4da47a9d9051506b827de1442d Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 04:23:07 +0530 Subject: [PATCH 4/7] final recording steps completed frontend side --- 
extension/src/entrypoints/background.ts | 23 ++++++++++----- .../sidepanel/components/recording-view.tsx | 29 ------------------- 2 files changed, 15 insertions(+), 37 deletions(-) diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index 6e1ae033..bf8d49ea 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -136,13 +136,13 @@ export default defineBackground(() => { (last as any).elementTag === (step as any).elementTag && (last as any).elementText === (step as any).elementText; break; - case 'scroll': - isDuplicate = - last.tabId === step.tabId && - (last as any).targetId === (step as any).targetId && - (last as any).scrollX === (step as any).scrollX && - (last as any).scrollY === (step as any).scrollY; + case 'scroll': { + const sameXY = (last as any).scrollX === (step as any).scrollX && (last as any).scrollY === (step as any).scrollY; + const sameUrl = (last as any).url === (step as any).url; + const nearTime = Math.abs(step.timestamp - last.timestamp) < 200; + isDuplicate = last.tabId === step.tabId && sameXY && sameUrl && nearTime; break; + } case 'key_press': isDuplicate = last.tabId === step.tabId && @@ -485,14 +485,21 @@ export default defineBackground(() => { y: number; }; // Type assertion for clarity const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL - + // Drop internal chrome pages like chrome://newtab/ + if (currentTabInfo?.url?.startsWith('chrome://')) { + break; + } // Check if the last step added was a mergeable scroll event const lastStep = steps.length > 0 ? 
steps[steps.length - 1] : null; if ( lastStep && lastStep.type === "scroll" && lastStep.tabId === rrEvent.tabId && - (lastStep as ScrollStep).targetId === scrollData.id + // Treat same XY within a short time window as duplicate, regardless of targetId + (lastStep as ScrollStep).scrollX === scrollData.x && + (lastStep as ScrollStep).scrollY === scrollData.y && + Math.abs(rrEvent.timestamp - lastStep.timestamp) < 200 && + (lastStep as any).url === currentTabInfo?.url ) { // Update the last scroll step (lastStep as ScrollStep).scrollX = scrollData.x; diff --git a/extension/src/entrypoints/sidepanel/components/recording-view.tsx b/extension/src/entrypoints/sidepanel/components/recording-view.tsx index 0aa09f33..1aa8a0de 100644 --- a/extension/src/entrypoints/sidepanel/components/recording-view.tsx +++ b/extension/src/entrypoints/sidepanel/components/recording-view.tsx @@ -7,19 +7,6 @@ export const RecordingView: React.FC = () => { const { stopRecording, workflow } = useWorkflow(); const stepCount = workflow?.steps?.length || 0; - const openOptions = () => chrome.runtime.openOptionsPage(); - const [enableIframes, setEnableIframes] = React.useState(true); - React.useEffect(() => { - chrome.storage.sync.get({ enableIframes: true }, (s) => - setEnableIframes(!!s.enableIframes) - ); - }, []); - const toggleIframes = async () => { - const next = !enableIframes; - setEnableIframes(next); - await chrome.storage.sync.set({ enableIframes: next }); - }; - return (
@@ -33,22 +20,6 @@ export const RecordingView: React.FC = () => {
- - From 1856775978b152928dda3835ad049aba687cb43a Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 04:40:41 +0530 Subject: [PATCH 5/7] Added interaction with iframe from backend; now needs to contain the click in iframe itself --- workflows/workflow_use/controller/service.py | 43 +++++++++++++++++++- workflows/workflow_use/controller/views.py | 2 + workflows/workflow_use/workflow/service.py | 19 ++++++--- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 238fcbb5..f4c54d4d 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -DEFAULT_ACTION_TIMEOUT_MS = 1000 +DEFAULT_ACTION_TIMEOUT_MS = 2500 # List of default actions from browser_use.controller.service.Controller to disable # todo: come up with a better way to filter out the actions (filter IN the actions would be much nicer in this case) @@ -83,9 +83,48 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse page = await browser_session.get_current_page() original_selector = params.cssSelector + # If frameUrl or frameIdPath are provided, narrow the search to that frame + def _select_context(pg): + try: + from playwright.async_api import Page, Frame + ctx: Page | Frame = pg + if getattr(params, 'frameIdPath', None): + parts = [p for p in str(params.frameIdPath).split('.') if p != '0' and p != ''] + f = pg.main_frame + for seg in parts: + idx = int(seg) + if idx < len(f.child_frames): + f = f.child_frames[idx] + else: + return ctx + ctx = f + elif getattr(params, 'frameUrl', None): + from urllib.parse import urlparse + pf = urlparse(params.frameUrl) + for fr in pg.frames: + try: + ff = urlparse(fr.url) + # Match origin, and allow target frameUrl to be a prefix of full URL + if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and 
fr.url.startswith(params.frameUrl): + ctx = fr + break + except Exception: + continue + except Exception: + ctx = pg + return ctx + try: + # If the step declares a URL and it's different from current, navigate first (minimum action to reach expected DOM) + curr = (page.url or '').split('#')[0] + tgt = (getattr(params, 'url', None) or getattr(params, 'frameUrl', None) or '').split('#')[0] + if tgt and tgt.startswith('http') and curr != tgt: + await page.goto(tgt) + await page.wait_for_load_state() + + ctx = _select_context(page) locator, selector_used = await get_best_element_handle( - page, + ctx, params.cssSelector, params, timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, diff --git a/workflows/workflow_use/controller/views.py b/workflows/workflow_use/controller/views.py index 8c61470a..ffb25b4a 100644 --- a/workflows/workflow_use/controller/views.py +++ b/workflows/workflow_use/controller/views.py @@ -23,6 +23,8 @@ class RecorderBase(StepMeta): elementTag: Optional[str] = None elementText: Optional[str] = None frameUrl: Optional[str] = None + frameIdPath: Optional[str] = None + url: Optional[str] = None screenshot: Optional[str] = None diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py index aa7008c1..d21da2e8 100644 --- a/workflows/workflow_use/workflow/service.py +++ b/workflows/workflow_use/workflow/service.py @@ -144,11 +144,20 @@ def truncate_selector(selector: str) -> str: await self.browser._wait_for_stable_network() page = await self.browser.get_current_page() - logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') - locator, selector_used = await get_best_element_handle( - page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT - ) - logger.info(f'Element with selector found: {truncate_selector(selector_used)}') + # If the next step declares a URL/frameUrl and it does not match the current page URL, + # skip waiting for its element on the current page (prevents 
false failures like step 7). + curr_url = (page.url or '').split('#')[0] + declared_next_url = (getattr(next_step_resolved, 'url', None) or getattr(next_step_resolved, 'frameUrl', None) or '').split('#')[0] + if declared_next_url and declared_next_url != curr_url: + logger.info( + f"Skipping pre-wait for next selector because declared next URL ({truncate_selector(declared_next_url)}) != current URL ({truncate_selector(curr_url)})" + ) + else: + logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') + locator, selector_used = await get_best_element_handle( + page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT + ) + logger.info(f'Element with selector found: {truncate_selector(selector_used)}') except Exception as e: logger.error(f'Failed to wait for element with selector: {truncate_selector(css_selector)}. Error: {e}') raise Exception(f'Failed to wait for element. Selector: {css_selector}') from e From 2ccea941462751b0406981ee802c4bb0620ca526 Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Thu, 14 Aug 2025 00:12:17 +0530 Subject: [PATCH 6/7] Backend Perfectly clicks on iframe using iterative approach --- workflows/workflow_use/controller/service.py | 96 +++++++++++++++----- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index f4c54d4d..2465d8cc 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -88,50 +88,96 @@ def _select_context(pg): try: from playwright.async_api import Page, Frame ctx: Page | Frame = pg - if getattr(params, 'frameIdPath', None): - parts = [p for p in str(params.frameIdPath).split('.') if p != '0' and p != ''] + # If frame hints point to top document, stay on page + fid = getattr(params, 'frameIdPath', None) + furl = getattr(params, 'frameUrl', None) + curr_url = (pg.url or '').split('#')[0] if hasattr(pg, 'url') else '' + 
if furl and curr_url and furl.split('#')[0] == curr_url: + return pg + if fid: + segs = [s for s in str(fid).split('.') if s != ''] + if all(s == '0' for s in segs): + return pg f = pg.main_frame - for seg in parts: - idx = int(seg) - if idx < len(f.child_frames): + for s in segs[1:]: # skip top marker + idx = int(s) + if 0 <= idx < len(f.child_frames): f = f.child_frames[idx] else: - return ctx - ctx = f - elif getattr(params, 'frameUrl', None): + return pg + return f + if furl: from urllib.parse import urlparse - pf = urlparse(params.frameUrl) + pf = urlparse(furl) + # If frameUrl equals current page URL (origin+path), stay on page + try: + from urllib.parse import urlparse as _u + cu = _u(curr_url) + if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path): + return pg + except Exception: + pass for fr in pg.frames: try: ff = urlparse(fr.url) - # Match origin, and allow target frameUrl to be a prefix of full URL - if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(params.frameUrl): - ctx = fr - break + if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl): + return fr except Exception: continue except Exception: - ctx = pg + return pg return ctx + # Fallback: search all frames for selector (prefer frames matching target origin) + async def _find_in_frames(pg, selector: str): + from urllib.parse import urlparse + prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or '' + pref_o = urlparse(prefer) if prefer else None + frames = list(pg.frames) + def score(fr): + if not pref_o: + return 0 + try: + fo = urlparse(fr.url) + return 2 if (fo.scheme, fo.netloc) == (pref_o.scheme, pref_o.netloc) else 0 + except Exception: + return 0 + frames.sort(key=score, reverse=True) + for fr in frames: + try: + loc, used = await get_best_element_handle(fr, selector, params, timeout_ms=max(800, DEFAULT_ACTION_TIMEOUT_MS // 2)) + return fr, loc, used + except Exception: + continue + return None, None, 
None + try: - # If the step declares a URL and it's different from current, navigate first (minimum action to reach expected DOM) + # Only auto-navigate for top-document clicks (no frame hints) when a different URL is declared curr = (page.url or '').split('#')[0] - tgt = (getattr(params, 'url', None) or getattr(params, 'frameUrl', None) or '').split('#')[0] - if tgt and tgt.startswith('http') and curr != tgt: - await page.goto(tgt) + declared_url = (getattr(params, 'url', None) or '').split('#')[0] + has_frame_hints = bool(getattr(params, 'frameIdPath', None) or getattr(params, 'frameUrl', None)) + if declared_url and declared_url.startswith('http') and not has_frame_hints and curr != declared_url: + await page.goto(declared_url) await page.wait_for_load_state() ctx = _select_context(page) - locator, selector_used = await get_best_element_handle( - ctx, - params.cssSelector, - params, - timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, - ) + try: + locator, selector_used = await get_best_element_handle( + ctx, + params.cssSelector, + params, + timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, + ) + except Exception: + # Fallback: search all frames + fr, locator, selector_used = await _find_in_frames(page, params.cssSelector) + if locator is None: + raise + await locator.click(force=True) - msg = f'πŸ–±οΈ Clicked element with CSS selector: {truncate_selector(selector_used)} (original: {truncate_selector(original_selector)})' + used_str = selector_used if isinstance(selector_used, str) and selector_used else params.cssSelector + msg = f'πŸ–±οΈ Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) except Exception as e: From 3ec843fccee7212829a20369a18612b803ab6610 Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Thu, 14 Aug 2025 00:45:17 +0530 Subject: [PATCH 7/7] Testing Script Added --- .github/copilot-instructions.md | 62 
++++++++++++++++ workflows/examples/test_iframes.json | 101 +++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 .github/copilot-instructions.md create mode 100644 workflows/examples/test_iframes.json diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000..9cd442d9 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,62 @@ +# Copilot Instructions for workflow-use + +These project-specific instructions help AI coding agents work productively in this repo. Keep responses concise, implement with tools when possible, and follow the repo’s patterns. + +## Architecture (big picture) +- Monorepo with three main parts: + - `extension/` (WXT MV3 Chrome extension): records user actions via rrweb + custom DOM events and streams to a local server. + - `workflows/` (Python package + CLI + FastAPI backend): converts recorded events into deterministic workflow steps, runs/replays steps (fallback to Browser Use/Playwright). + - `ui/` (Vite/React frontend): GUI to view and run workflows. +- Data flow: + 1) Content scripts capture events → background aggregates → posts `WORKFLOW_UPDATE` to `http://127.0.0.1:7331/event`. + 2) Backend stores/processes to `workflows/examples/*.json` and executes with Playwright. + 3) UI/CLI visualize/run. +- Key types: `extension/src/lib/types.ts` (Stored* events), `extension/src/lib/workflow-types.ts` (Step union), `workflows` Python `Workflow` model. + +## Dev workflows +- Build extension: `cd extension && npm install && npm run build`. +- Python backend setup: `cd workflows && uv sync && playwright install chromium && cp .env.example .env`. +- Record: `cd workflows && python cli.py create-workflow` (starts local server; open Chrome with built extension). +- Run workflow as tool: `python cli.py run-as-tool examples/example.workflow.json --prompt "..."`. +- Run workflow: `python cli.py run-workflow examples/example.workflow.json`. 
+- Launch GUI: `python cli.py launch-gui` (starts FastAPI + UI dev server). + +## Extension patterns +- Use `defineBackground` and `defineContentScript` (WXT). Content script always attaches listeners; background aggregates and emits `WORKFLOW_UPDATE` with a hash to avoid spam. +- Recording: + - rrweb for scroll/meta; custom `CUSTOM_CLICK_EVENT`, `CUSTOM_INPUT_EVENT`, `CUSTOM_KEY_EVENT`, etc. + - New-tab intent: content sends `PREPARE_NEW_TAB`; background correlates `tabs.onCreated` and marks `userInitiated`. + - Activated tab gating: ignore tabs never activated (reduces ad/tracker noise). + - Dedupe: merge consecutive identical steps, collapse rapid empty input bursts, consolidate navigations per tab. + - Iframes: content runs with `allFrames: true` and `matchAboutBlank: true`; events carry `frameUrl` and `frameIdPath`. Background only allows rrweb meta navigations from frames the user interacted with and filters ad/analytics hosts. + +## Backend patterns +- Python FastAPI endpoint `http://127.0.0.1:7331/event` receives: + - `RECORDING_STARTED/STOPPED`, `WORKFLOW_UPDATE` with `steps` only (hash-based dedupe). +- CLI: `workflows/cli.py` provides record/run/launch commands; Playwright is used for replay. +- Keep workflow JSON in `workflows/examples/`. Naming is free-form; version stays at `1.0.0` today. + +## Conventions +- Step schema (extension `workflow-types.ts`): navigation, click, input, key_press, scroll. Prefer merging updates over emitting new steps. +- Use XPath + enhanced CSS selectors; keep values masked for password inputs. +- Avoid sending events from tabs not in `activatedTabs` unless `userInitiated`. +- When adding new event types, extend Stored* in `types.ts`, enrich in content, and map to `Step` in background. + +## Gotchas / Tips +- Avoid noisy iframe navs (recaptcha/ads): rely on `interactedFrameUrls` filtering in background. If adjusting, prefer allow/deny logic over hard-coding hosts in multiple places. 
+- When changing extension logic, rebuild with `npm run build`; dev opens side panel on install/update. +- Screenshot capture only works for visible tabs; background uses `captureVisibleTab` best-effort. +- If tests are added, ensure they run per package (`extension`, `workflows`, `ui`) rather than at repo root. + +## Example tasks for agents +- Add a new step type (e.g., select): + 1) Extend `StoredCustomSelectEvent` in `types.ts` and emit in `content.ts`. + 2) Map to a `SelectStep` in `background.ts` (convertStoredEventsToSteps). + 3) Update backend replay to handle the new step. +- Reduce noise further: + - Tune debounce windows in content. + - Post-process duplicates in `broadcastWorkflowDataUpdate`. + - Add frame interaction checks before accepting rrweb meta navigations. + +## Security & secrets +- Do not commit real API keys. `.env.example` exists; load secrets locally. If you see a real key in `workflows/.env`, instruct maintainers to rotate and remove it. diff --git a/workflows/examples/test_iframes.json b/workflows/examples/test_iframes.json new file mode 100644 index 00000000..b1834355 --- /dev/null +++ b/workflows/examples/test_iframes.json @@ -0,0 +1,101 @@ +{ + "name": "Recorded Workflow", + "description": "Recorded on 8/11/2025, 4:20:47 AM", + "version": "1.0.0", + "input_schema": [], + "steps": [ + { + "type": "navigation", + "timestamp": 1754866228439, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "scroll", + "timestamp": 1754866228608, + "tabId": 388342781, + "targetId": 219, + "scrollX": 0, + "scrollY": 7, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "navigation", + "timestamp": 1754866228634, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866228849, + "tabId": 388342781, + "url": 
"https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "id(\"textareawrapper\")/div[1]/div[6]", + "cssSelector": "div.CodeMirror-scroll", + "elementTag": "DIV", + "elementText": "​

The iframe element

​​​" + }, + { + "type": "navigation", + "timestamp": 1754866230495, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866231531, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "body/div[2]/div[1]/a[4]", + "cssSelector": "a.w3-button.w3-bar-item.w3-hide-small.topnav-icons.fa.fa-rotate.ga-tryit[href=\"javascript:void(0);\"][title*=\"Change Orientation\"]", + "elementTag": "A", + "elementText": "" + }, + { + "type": "navigation", + "timestamp": 1754866237707, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866238574, + "tabId": 388342781, + "url": "https://www.w3schools.com/", + "frameUrl": "https://www.w3schools.com/", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[3]", + "cssSelector": "a.ga-nav[href=\"/js/default.asp\"][title*=\"JavaScript Tutorial\"]", + "elementTag": "A", + "elementText": "JAVASCRIPT" + }, + { + "type": "navigation", + "timestamp": 1754866242778, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866242882, + "tabId": 388342781, + "url": "https://www.w3schools.com/js/default.asp", + "frameUrl": "https://www.w3schools.com/js/default.asp", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[1]", + "cssSelector": "a.ga-nav.subtopnav_firstitem[href=\"/html/default.asp\"][title*=\"HTML Tutorial\"]", + "elementTag": "A", + "elementText": "HTML" + }, + { + "type": "navigation", + "timestamp": 1754866246395, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + } + ] +} \ No newline at end of file