fix: capture tools (#40)

ndthanhdev · web-flow · commit 10d46daf1631 · 2025-10-21T08:32:27.000Z
diff --git a/apps/m2/src/manifest.json b/apps/m2/src/manifest.json
@@ -2,7 +2,7 @@
 	"$schema": "https://json.schemastore.org/webextension.json",
 	"manifest_version": 2,
 	"name": "MCP Browser Kit M2",
-	"version": "5.0.0",
+	"version": "6.0.0",
 
 	"description": "Allow MCP Browser Kit to interact with the browser.",
 	"permissions": [
diff --git a/apps/m3/src/manifest.json b/apps/m3/src/manifest.json
@@ -2,7 +2,7 @@
 	"$schema": "https://json.schemastore.org/chrome-manifest.json",
 	"manifest_version": 3,
 	"name": "MCP Browser Kit M3",
-	"version": "5.0.0",
+	"version": "6.0.0",
 	"description": "Allow MCP Browser Kit to interact with this browser.",
 	"permissions": [
 		"tabs",
diff --git a/apps/server/package.json b/apps/server/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "@mcp-browser-kit/server",
-	"version": "5.0.2",
+	"version": "6.0.0",
 	"packageManager": "yarn@4.8.0",
 	"homepage": "https://github.com/ndthanhdev/mcp-browser-kit",
 	"bin": {
diff --git a/apps/server/src/services/mcp-server/browser-tools.ts b/apps/server/src/services/mcp-server/browser-tools.ts
@@ -41,7 +41,7 @@ export class BrowserTools {
 	 */
 	register(server: McpServer): void {
 		this.registerGetBasicBrowserContext(server);
-		this.registerCaptureActiveTab(server);
+		this.registerCaptureTab(server);
 		this.registerInvokeJsFn(server);
 		this.registerOpenTab(server);
 		this.registerCloseTab(server);
@@ -81,20 +81,25 @@ export class BrowserTools {
 	}
 
 	/**
-	 * Registers the captureActiveTab tool
+	 * Registers the captureTab tool
 	 */
-	private registerCaptureActiveTab(server: McpServer): void {
-		this.logger.verbose("Registering tool: captureActiveTab");
+	private registerCaptureTab(server: McpServer): void {
+		this.logger.verbose("Registering tool: captureTab");
 		server.tool(
-			"captureActiveTab",
-			this.toolDescriptionsInputPort.captureActiveTabInstruction(),
-			{},
-			async () => {
-				this.logger.info("Executing captureActiveTab");
-				const overScreenshot = await over(this.toolsInputPort.captureTab);
+			"captureTab",
+			this.toolDescriptionsInputPort.captureTabInstruction(),
+			tabKeySchema,
+			async ({ tabKey }) => {
+				this.logger.info("Executing captureTab", {
+					tabKey,
+				});
+				const overScreenshot = await over(() =>
+					this.toolsInputPort.captureTab(tabKey),
+				);
 
 				if (!overScreenshot.ok) {
-					this.logger.error("Failed to capture active tab screenshot", {
+					this.logger.error("Failed to capture tab screenshot", {
+						tabKey,
 						reason: overScreenshot.reason,
 					});
 					return createErrorResponse(
@@ -105,6 +110,7 @@ export class BrowserTools {
 
 				const screenshot = overScreenshot.value;
 				this.logger.verbose("Screenshot captured", {
+					tabKey,
 					width: screenshot.width,
 					height: screenshot.height,
 				});
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -52,7 +52,6 @@ flowchart TD
   end
 
   subgraph ServerDriven["Driven"]
-    ExtensionDriver
     ConfigProvider["ConfigProvider"]
     ExtensionChannelProvider
     LoggerProvider["LoggerProvider"]
@@ -63,7 +62,6 @@ flowchart TD
   ToolDescriptions --> ToolDescriptionUseCases
   ManageChannels --> ManageChannelUseCases
   %% From Core
-  ServerToolCallUseCases --"x"--> ExtensionDriver
   ServerToolCallUseCases--> ExtensionChannelManager
   ManageChannelUseCases --> ExtensionChannelManager
   ExtensionChannelManager --> ExtensionChannelProvider
diff --git a/packages/core-server/src/core/tool-descriptions.ts b/packages/core-server/src/core/tool-descriptions.ts
@@ -18,22 +18,23 @@ export class ToolDescriptionsUseCases implements ToolDescriptionsInputPort {
 			"  1) getBasicBrowserContext → get browser state and tabKey",
 			"  2) Analyze page content based on your goal and manifest version:",
 			"     - If interaction is required (clicking, filling forms, etc.):",
-			"       · For Manifest Version 2: Use captureActiveTab for visual context or getReadableElements for element identification",
+			"       · For Manifest Version 2: Use captureTab for visual context or getReadableElements for element identification",
 			"       · For other Manifest Versions: Use only getReadableElements for element identification",
 			"     - If no interaction is required (just reading page content):",
 			"       · Use getReadableText to extract all visible text from the page",
 			"  3) Interact using click/fill/enter tools with the obtained tabKey",
 		].join("\n");
 	};
 
-	captureActiveTabInstruction = (): string => {
+	captureTabInstruction = (): string => {
 		return [
-			"📷 Captures a screenshot of the active browser tab",
-			"* Use this tool after calling getBasicBrowserContext to obtain visual context of the current page",
+			"📷 Captures a screenshot of a browser tab",
+			"* Use this tool after calling getBasicBrowserContext to obtain visual context of the page",
 			"* The screenshot helps you see what the browser is displaying to the user",
-			"* No parameters are needed as it automatically captures the active tab",
+			"* Requires tabKey from getBasicBrowserContext",
 			"* Returns an image with width, height, and data in base64 format",
-			"* Workflow: 1) getBasicBrowserContext → 2) captureActiveTab → 3) interact with elements",
+			"* Workflow: 1) getBasicBrowserContext → 2) captureTab → 3) interact with elements",
+			"* Parameters: tabKey",
 			"* NOTE: This feature is only available in browsers supporting Manifest Version 2",
 		].join("\n");
 	};
@@ -65,7 +66,7 @@ export class ToolDescriptionsUseCases implements ToolDescriptionsInputPort {
 			"👆 Clicks on an element at specific X,Y coordinates",
 			"* Use this to click on elements by their position on the screen",
 			"* Requires tabKey from getBasicBrowserContext and x,y coordinates from the screenshot",
-			"* Coordinates are based on the captureActiveTab screenshot dimensions",
+			"* Coordinates are based on the captureTab screenshot dimensions",
 			"* Useful when you know the visual position of an element",
 			"* Parameters: tabKey, x, y",
 		].join("\n");
@@ -76,7 +77,7 @@ export class ToolDescriptionsUseCases implements ToolDescriptionsInputPort {
 			"⌨️ Types text into an input field at specific X,Y coordinates",
 			"* Use this to enter text into form fields by their position",
 			"* Requires tabKey from getBasicBrowserContext, x,y coordinates, and the text to enter",
-			"* Coordinates are based on the captureActiveTab screenshot dimensions",
+			"* Coordinates are based on the captureTab screenshot dimensions",
 			"* First clicks at the specified position, then types the provided text",
 			"* After filling text, check for associated submit-like buttons (submit, search, send, etc.)",
 			"* If submit button is visible, use clickOnViewableElement with that button",
@@ -90,7 +91,7 @@ export class ToolDescriptionsUseCases implements ToolDescriptionsInputPort {
 			"↵ Hits the Enter key on an element at specific X,Y coordinates",
 			"* Use this to trigger actions like form submission or button clicks",
 			"* Requires tabKey from getBasicBrowserContext and x,y coordinates from the screenshot",
-			"* Coordinates are based on the captureActiveTab screenshot dimensions",
+			"* Coordinates are based on the captureTab screenshot dimensions",
 			"* Parameters: tabKey, x, y",
 		].join("\n");
 	};
diff --git a/packages/core-server/src/input-ports/tool-descriptions.ts b/packages/core-server/src/input-ports/tool-descriptions.ts
@@ -5,8 +5,8 @@ export interface ToolDescriptionsInputPort {
 	// GetTabs
 	getBasicBrowserContextInstruction(): string;
 
-	// CaptureActiveTab
-	captureActiveTabInstruction(): string;
+	// CaptureTab
+	captureTabInstruction(): string;
 
 	// GetReadableText
 	getReadableTextInstruction(): string;
diff --git a/packages/server-driven-trpc-channel-provider/src/routers/defer.ts b/packages/server-driven-trpc-channel-provider/src/routers/defer.ts
@@ -65,8 +65,9 @@ export const createDeferRouter = (container: Container) => {
 					signal.onabort = () => {
 						logger.info(`Subscription aborted for channel: ${channelId}`);
 						stopped = true;
-						defer.reject(new Error("Client closed subscription"));
 						unsubscribe();
+						defer.reject(new Error("Client closed subscription"));
+						extensionChannelProvider.closeChannel(channelId);
 					};
 				}
 				while (!stopped) {
diff --git a/packages/server-driven-trpc-channel-provider/src/services/server-driven-trpc-channel-provider.ts b/packages/server-driven-trpc-channel-provider/src/services/server-driven-trpc-channel-provider.ts
@@ -57,6 +57,10 @@ export class ServerDrivenTrpcChannelProvider
 		return this.baseExtensionChannelProvider.openChannel(id);
 	};
 
+	closeChannel = (id: string) => {
+		return this.baseExtensionChannelProvider.closeChannel(id);
+	};
+
 	public async start() {
 		this.logger.verbose("Starting HTTP Server");
 		const httpServer = this.createHttpServer();

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"name": "@mcp-browser-kit/server",` |
| `3` |  | `- "version": "5.0.2",` |
|  | `3` | `+ "version": "6.0.0",` |
| `4` | `4` | `"packageManager": "yarn@4.8.0",` |
| `5` | `5` | `"homepage": "https://github.com/ndthanhdev/mcp-browser-kit",` |
| `6` | `6` | `"bin": {` |