Skip to content

Commit 545cd5a

Browse files
committed
feat: keyboard key-press actions; enforce single-tab navigation; UX improvements
fix: prevent base64/binary output leakage to chat stream
1 parent 19e0690 commit 545cd5a

File tree

9 files changed

+523
-72
lines changed

9 files changed

+523
-72
lines changed

src/core/prompts/tools/browser-action.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ Parameters:
2323
- Always click in the center of an element (icon, button, link, etc.) based on coordinates derived from a screenshot.
2424
* type: Type a string of text on the keyboard. You might use this after clicking on a text field to input text.
2525
- Use with the \`text\` parameter to provide the string to type.
26+
* press: Press a single keyboard key (e.g., Enter, Tab, Escape).
27+
- Use with the \`text\` parameter to provide the key name (e.g., Enter).
2628
* resize: Resize the viewport to a specific w,h size.
2729
- Use with the \`size\` parameter to specify the new size.
2830
* scroll_down: Scroll down the page by one page height.
@@ -39,7 +41,7 @@ Parameters:
3941
* Example: <text>Hello, world!</text>
4042
Usage:
4143
<browser_action>
42-
<action>Action to perform (e.g., launch, click, type, scroll_down, scroll_up, close)</action>
44+
<action>Action to perform (e.g., launch, click, type, press, scroll_down, scroll_up, close)</action>
4345
<url>URL to launch the browser at (optional)</url>
4446
<coordinate>x,y coordinates (optional)</coordinate>
4547
<text>Text to type (optional)</text>

src/core/tools/browserActionTool.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ export async function browserActionTool(
9090
}
9191
}
9292

93-
if (action === "type") {
93+
if (action === "type" || action === "press") {
9494
if (!text) {
9595
cline.consecutiveMistakeCount++
9696
cline.recordToolError("browser_action")
@@ -133,6 +133,9 @@ export async function browserActionTool(
133133
case "type":
134134
browserActionResult = await cline.browserSession.type(text!)
135135
break
136+
case "press":
137+
browserActionResult = await cline.browserSession.press(text!)
138+
break
136139
case "scroll_down":
137140
browserActionResult = await cline.browserSession.scrollDown()
138141
break
@@ -153,6 +156,7 @@ export async function browserActionTool(
153156
case "click":
154157
case "hover":
155158
case "type":
159+
case "press":
156160
case "scroll_down":
157161
case "scroll_up":
158162
case "resize":

src/services/browser/BrowserSession.ts

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,64 @@ export class BrowserSession {
453453
}
454454
}
455455

456+
/**
 * Best-effort: keep navigation inside the current tab.
 *
 * Runs a script in the page that
 *  1. replaces `window.open` so a requested URL is loaded in the current document
 *     instead of a popup / new tab,
 *  2. rewrites every `a[target="_blank"]` present at install time to `target="_self"`, and
 *  3. registers a capture-phase click listener that redirects clicks on any
 *     `a[target="_blank"]` (e.g. anchors added later) into the current tab.
 *
 * The script installs itself at most once per document, guarded by the
 * `window.__ROO_FORCE_SAME_TAB__` flag. Every failure is swallowed on purpose:
 * this helper must never make the surrounding browser action fail.
 *
 * @param page Page whose current document should be patched.
 */
private async forceLinksToSameTab(page: Page): Promise<void> {
	try {
		await page.evaluate(() => {
			try {
				// Ensure we only install once per document
				if ((window as any).__ROO_FORCE_SAME_TAB__) return
				;(window as any).__ROO_FORCE_SAME_TAB__ = true

				// Override window.open to navigate the current tab instead of creating a new one
				const originalOpen = window.open
				window.open = function (url?: string | URL, target?: string, features?: string) {
					try {
						// window.open() and window.open("") request a blank window. Stringifying a
						// missing URL would navigate to the literal path "undefined", and assigning
						// an empty string would reload the current page, so only navigate when a
						// real URL was supplied.
						const href = url == null ? "" : String(url)
						if (href.trim() !== "") {
							location.href = href
						}
					} catch {
						// fall back to original if something unexpected occurs
						try {
							return originalOpen.apply(window, [url as any, "_self", features]) as any
						} catch {
							// ignore: same-tab forcing is best-effort
						}
					}
					// No new window is ever created, so callers get null
					return null as any
				} as any

				// Rewrite anchors that explicitly open new tabs
				document.querySelectorAll('a[target="_blank"]').forEach((a) => {
					a.setAttribute("target", "_self")
				})

				// Defensive capture: if an element still tries to open in a new tab, force same-tab
				document.addEventListener(
					"click",
					(ev) => {
						const el = (ev.target as HTMLElement | null)?.closest?.(
							'a[target="_blank"]',
						) as HTMLAnchorElement | null
						if (el && el.href) {
							// Cancel the default new-tab navigation and load the link here instead
							ev.preventDefault()
							try {
								location.href = el.href
							} catch {
								// ignore: same-tab forcing is best-effort
							}
						}
					},
					// capture: see the click before page handlers; passive: false so preventDefault() is honoured
					{ capture: true, passive: false },
				)
			} catch {
				// no-op; forcing same-tab is best-effort
			}
		})
	} catch {
		// If evaluate fails (e.g., cross-origin/state), continue without breaking the action
	}
}
513+
456514
/**
457515
* Handles mouse interaction with network activity monitoring
458516
*/
@@ -463,6 +521,9 @@ export class BrowserSession {
463521
): Promise<void> {
464522
const [x, y] = coordinate.split(",").map(Number)
465523

524+
// Force any new-tab behavior (target="_blank", window.open) to stay in the same tab
525+
await this.forceLinksToSameTab(page)
526+
466527
// Set up network request monitoring
467528
let hasNetworkActivity = false
468529
const requestListener = () => {
@@ -506,6 +567,27 @@ export class BrowserSession {
506567
})
507568
}
508569

570+
/**
 * Presses a single keyboard key on the page.
 *
 * Surrounding whitespace is ignored and a few common aliases are accepted
 * case-insensitively (`esc`, `return`, `enter`, `tab`, `space`, arrow keys).
 * Any other value is forwarded to `page.keyboard.press` as given, minus the
 * surrounding whitespace.
 *
 * @param key Key name or alias, e.g. "Enter", "esc", "ArrowDown".
 * @returns The result produced by `doAction` for this interaction.
 */
async press(key: string): Promise<BrowserActionResult> {
	return this.doAction(async (page) => {
		// Allow common aliases (looked up in lower case)
		const mapping: Record<string, string> = {
			esc: "Escape",
			return: "Enter",
			escape: "Escape",
			enter: "Enter",
			tab: "Tab",
			space: "Space",
			arrowup: "ArrowUp",
			arrowdown: "ArrowDown",
			arrowleft: "ArrowLeft",
			arrowright: "ArrowRight",
		}
		const normalized = key.trim()
		// Fall back to the trimmed key so input such as " F5 " does not reach the keyboard API
		// with stray whitespace. If trimming leaves nothing (e.g. a literal " " key), keep the
		// raw input rather than sending an empty key name.
		const mapped = mapping[normalized.toLowerCase()] ?? (normalized !== "" ? normalized : key)
		await page.keyboard.press(mapped)
	})
}
590+
509591
/**
510592
* Scrolls the page by the specified amount
511593
*/

src/services/browser/__tests__/BrowserSession.spec.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,4 +229,65 @@ describe("BrowserSession", () => {
229229
expect(mockBrowser.close).not.toHaveBeenCalled()
230230
})
231231
})
232+
233+
it("forces same-tab behavior before click", async () => {
	// Replacement for the private helper so its invocation can be observed
	const sameTabStub = vi.fn().mockResolvedValue(undefined)

	// Smallest page double that a click action needs
	const mouse = {
		click: vi.fn().mockResolvedValue(undefined),
		move: vi.fn().mockResolvedValue(undefined),
	}
	const fakePage: any = {
		on: vi.fn(),
		off: vi.fn(),
		screenshot: vi.fn().mockResolvedValue("mockScreenshotBase64"),
		url: vi.fn().mockReturnValue("https://example.com"),
		waitForNavigation: vi.fn().mockResolvedValue(undefined),
		evaluate: vi.fn().mockResolvedValue(undefined),
		mouse,
	}

	// Inject the doubles into the session under test
	const sessionInternals = browserSession as any
	sessionInternals.page = fakePage
	sessionInternals.forceLinksToSameTab = sameTabStub

	await browserSession.click("10,20")

	// The click went to the parsed coordinates...
	expect(fakePage.mouse.click).toHaveBeenCalledWith(10, 20)
	// ...and the same-tab helper ran exactly once, against the active page
	expect(sameTabStub).toHaveBeenCalledWith(fakePage)
	expect(sameTabStub).toHaveBeenCalledTimes(1)
})
260+
})
261+
262+
describe("keyboard press", () => {
	/**
	 * Builds a fresh BrowserSession wired to a minimal mock page that exposes
	 * only the APIs a key press action needs.
	 */
	const createSessionWithPage = () => {
		const page: any = {
			on: vi.fn(),
			off: vi.fn(),
			screenshot: vi.fn().mockResolvedValue("mockScreenshotBase64"),
			url: vi.fn().mockReturnValue("https://example.com"),
			waitForNavigation: vi.fn().mockResolvedValue(undefined),
			evaluate: vi.fn().mockResolvedValue(undefined),
			keyboard: {
				press: vi.fn().mockResolvedValue(undefined),
				type: vi.fn().mockResolvedValue(undefined),
			},
		}

		// Create a fresh BrowserSession with a mock context
		const mockCtx: any = {
			globalState: { get: vi.fn(), update: vi.fn() },
			globalStorageUri: { fsPath: "/mock/global/storage/path" },
			extensionUri: { fsPath: "/mock/extension/path" },
		}
		const session = new BrowserSession(mockCtx)
		;(session as any).page = page

		return { session, page }
	}

	it("presses a keyboard key", async () => {
		const { session, page } = createSessionWithPage()

		await session.press("Enter")

		expect(page.keyboard.press).toHaveBeenCalledTimes(1)
		expect(page.keyboard.press).toHaveBeenCalledWith("Enter")
	})

	it("maps common aliases to canonical key names", async () => {
		const { session, page } = createSessionWithPage()

		// "esc" is one of the case-insensitive aliases accepted by press()
		await session.press("esc")

		expect(page.keyboard.press).toHaveBeenCalledTimes(1)
		expect(page.keyboard.press).toHaveBeenCalledWith("Escape")
	})
})

src/shared/ExtensionMessage.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,7 @@ export const browserActions = [
432432
"click",
433433
"hover",
434434
"type",
435+
"press",
435436
"scroll_down",
436437
"scroll_up",
437438
"resize",

0 commit comments

Comments
 (0)