Skip to content

Commit 4fefb7f

Browse files
committed
feat: Update default tool RAG Web Browser
1 parent ed750c0 commit 4fefb7f

File tree

6 files changed

+118
-51
lines changed

6 files changed

+118
-51
lines changed

package-lock.json

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
"dotenv": "^16.4.7",
6565
"eslint": "^9.19.0",
6666
"eventsource": "^3.0.2",
67-
"tsx": "^4.6.2",
67+
"tsx": "^4.20.5",
6868
"typescript": "^5.3.3",
6969
"typescript-eslint": "^8.23.0",
7070
"vitest": "^3.0.8"

src/const.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,16 @@ export enum HelperTools {
4747
GET_HTML_SKELETON = 'get-html-skeleton',
4848
}
4949

50-
export const ACTOR_RAG_WEB_BROWSER = 'apify/rag-web-browser';
50+
// Full name of the RAG Web Browser Actor; it is the single entry in `defaults.actors`.
export const RAG_WEB_BROWSER = 'apify/rag-web-browser';
51+
// Input properties of the RAG Web Browser Actor that remain exposed in its tool input schema.
// buildActorInputSchema (src/tools/utils.ts) drops every other property when it handles this Actor.
export const RAG_WEB_BROWSER_WHITELISTED_FIELDS = ['query', 'maxResults', 'outputFormats'];
52+
// Extra guidance appended to the RAG Web Browser tool description.
// NOTE(review): the tool-building code appends this with `+=` and no separator, and the text has no
// leading whitespace, so it runs straight on from the Actor description - confirm whether a
// newline or space is intended.
export const RAG_WEB_BROWSER_ADDITIONAL_DESC = `This tool provides general web browsing functionality, for specific sites like e-commerce, social media it is always better to search for a specific Actor`;
5153

5254
export const defaults = {
5355
actors: [
54-
ACTOR_RAG_WEB_BROWSER,
56+
RAG_WEB_BROWSER,
5557
],
5658
};
5759

58-
export const ACTOR_ADDITIONAL_INSTRUCTIONS = 'Never call/execute tool/Actor unless confirmed by the user.';
5960
export const SKYFIRE_MIN_CHARGE_USD = 5.0;
6061
export const SKYFIRE_SELLER_ID = process.env.SKYFIRE_SELLER_SERVICE_ID;
6162

src/tools/actor.ts

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@ import log from '@apify/log';
77

88
import { ApifyClient } from '../apify-client.js';
99
import {
10-
ACTOR_ADDITIONAL_INSTRUCTIONS,
1110
ACTOR_MAX_MEMORY_MBYTES,
1211
HelperTools,
12+
RAG_WEB_BROWSER,
13+
RAG_WEB_BROWSER_ADDITIONAL_DESC,
1314
SKYFIRE_TOOL_INSTRUCTIONS,
1415
TOOL_MAX_OUTPUT_CHARS,
1516
} from '../const.js';
@@ -26,7 +27,11 @@ import type { ProgressTracker } from '../utils/progress.js';
2627
import type { JsonSchemaProperty } from '../utils/schema-generation.js';
2728
import { generateSchemaFromItems } from '../utils/schema-generation.js';
2829
import { getActorDefinition } from './build.js';
29-
import { actorNameToToolName, fixedAjvCompile, getToolSchemaID, transformActorInputSchemaProperties } from './utils.js';
30+
import {
31+
actorNameToToolName,
32+
buildActorInputSchema,
33+
fixedAjvCompile,
34+
} from './utils.js';
3035

3136
// Define a named return type for callActorGetDataset
3237
export type CallActorGetDatasetResult = {
@@ -156,45 +161,48 @@ export async function getNormalActorsAsTools(
156161
): Promise<ToolEntry[]> {
157162
const tools: ToolEntry[] = [];
158163

159-
// Zip the results with their corresponding actorIDs
160164
for (const actorInfo of actorsInfo) {
161165
const { actorDefinitionPruned } = actorInfo;
162166

163-
if (actorDefinitionPruned) {
164-
const schemaID = getToolSchemaID(actorDefinitionPruned.actorFullName);
165-
if (actorDefinitionPruned.input && 'properties' in actorDefinitionPruned.input && actorDefinitionPruned.input) {
166-
actorDefinitionPruned.input.properties = transformActorInputSchemaProperties(actorDefinitionPruned.input);
167-
// Add schema $id, each valid JSON schema should have a unique $id
168-
// see https://json-schema.org/understanding-json-schema/basics#declaring-a-unique-identifier
169-
actorDefinitionPruned.input.$id = schemaID;
170-
}
171-
try {
172-
const memoryMbytes = actorDefinitionPruned.defaultRunOptions?.memoryMbytes || ACTOR_MAX_MEMORY_MBYTES;
173-
const tool: ToolEntry = {
174-
type: 'actor',
175-
tool: {
176-
name: actorNameToToolName(actorDefinitionPruned.actorFullName),
177-
actorFullName: actorDefinitionPruned.actorFullName,
178-
description: `This tool calls the Actor "${actorDefinitionPruned.actorFullName}" and retrieves its output results. Use this tool instead of the "${HelperTools.ACTOR_CALL}" if user requests to use this specific Actor.
179-
Actor description: ${actorDefinitionPruned.description}
180-
Instructions: ${ACTOR_ADDITIONAL_INSTRUCTIONS}`,
181-
inputSchema: actorDefinitionPruned.input
182-
// So Actor without input schema works - MCP client expects JSON schema valid output
183-
|| {
184-
type: 'object',
185-
properties: {},
186-
required: [],
187-
},
188-
// Additional props true to allow skyfire-pay-id
189-
ajvValidate: fixedAjvCompile(ajv, { ...actorDefinitionPruned.input, additionalProperties: true }),
190-
memoryMbytes: memoryMbytes > ACTOR_MAX_MEMORY_MBYTES ? ACTOR_MAX_MEMORY_MBYTES : memoryMbytes,
191-
},
192-
};
193-
tools.push(tool);
194-
} catch (validationError) {
195-
log.error('Failed to compile AJV schema for Actor', { actorName: actorDefinitionPruned.actorFullName, error: validationError });
196-
}
167+
if (!actorDefinitionPruned) continue;
168+
169+
const isRag = actorDefinitionPruned.actorFullName === RAG_WEB_BROWSER;
170+
const { inputSchema } = buildActorInputSchema(actorDefinitionPruned.actorFullName, actorDefinitionPruned.input, isRag);
171+
172+
let description = `This tool calls the Actor "${actorDefinitionPruned.actorFullName}" and retrieves its output results.
173+
Use this tool instead of the "${HelperTools.ACTOR_CALL}" if user requests this specific Actor.
174+
Actor description: ${actorDefinitionPruned.description}`;
175+
if (isRag) {
176+
description += RAG_WEB_BROWSER_ADDITIONAL_DESC;
177+
}
178+
179+
const memoryMbytes = Math.min(
180+
actorDefinitionPruned.defaultRunOptions?.memoryMbytes || ACTOR_MAX_MEMORY_MBYTES,
181+
ACTOR_MAX_MEMORY_MBYTES,
182+
);
183+
184+
let ajvValidate;
185+
try {
186+
ajvValidate = fixedAjvCompile(ajv, { ...inputSchema, additionalProperties: true });
187+
} catch (e) {
188+
log.error('Failed to compile schema', {
189+
actorName: actorDefinitionPruned.actorFullName,
190+
error: e,
191+
});
192+
continue;
197193
}
194+
195+
tools.push({
196+
type: 'actor',
197+
tool: {
198+
name: actorNameToToolName(actorDefinitionPruned.actorFullName),
199+
actorFullName: actorDefinitionPruned.actorFullName,
200+
description,
201+
inputSchema,
202+
ajvValidate,
203+
memoryMbytes,
204+
},
205+
});
198206
}
199207
return tools;
200208
}

src/tools/get-html-skeleton.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { z } from 'zod';
22
import zodToJsonSchema from 'zod-to-json-schema';
33

44
import { ApifyClient } from '../apify-client.js';
5-
import { ACTOR_RAG_WEB_BROWSER, HelperTools, TOOL_MAX_OUTPUT_CHARS } from '../const.js';
5+
import { HelperTools, RAG_WEB_BROWSER, TOOL_MAX_OUTPUT_CHARS } from '../const.js';
66
import { getHtmlSkeletonCache } from '../state.js';
77
import type { InternalTool, ToolEntry } from '../types.js';
88
import { ajv } from '../utils/ajv.js';
@@ -58,7 +58,7 @@ export const getHtmlSkeleton: ToolEntry = {
5858
// Not in cache, call the Actor for scraping
5959
const client = new ApifyClient({ token: apifyToken });
6060

61-
const run = await client.actor(ACTOR_RAG_WEB_BROWSER).call({
61+
const run = await client.actor(RAG_WEB_BROWSER).call({
6262
query: parsed.url,
6363
outputFormats: [
6464
'html',
@@ -68,16 +68,16 @@ export const getHtmlSkeleton: ToolEntry = {
6868

6969
const datasetItems = await client.dataset(run.defaultDatasetId).listItems();
7070
if (datasetItems.items.length === 0) {
71-
return buildMCPResponse([`The scraping Actor (${ACTOR_RAG_WEB_BROWSER}) did not return any output for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
71+
return buildMCPResponse([`The scraping Actor (${RAG_WEB_BROWSER}) did not return any output for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
7272
}
7373

7474
const firstItem = datasetItems.items[0] as unknown as ScrapedPageItem;
7575
if (firstItem.crawl.httpStatusMessage.toLocaleLowerCase() !== 'ok') {
76-
return buildMCPResponse([`The scraping Actor (${ACTOR_RAG_WEB_BROWSER}) returned an HTTP status ${firstItem.crawl.httpStatusCode} (${firstItem.crawl.httpStatusMessage}) for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
76+
return buildMCPResponse([`The scraping Actor (${RAG_WEB_BROWSER}) returned an HTTP status ${firstItem.crawl.httpStatusCode} (${firstItem.crawl.httpStatusMessage}) for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
7777
}
7878

7979
if (!firstItem.html) {
80-
return buildMCPResponse([`The scraping Actor (${ACTOR_RAG_WEB_BROWSER}) did not return any HTML content for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
80+
return buildMCPResponse([`The scraping Actor (${RAG_WEB_BROWSER}) did not return any HTML content for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
8181
}
8282

8383
strippedHtml = stripHtml(firstItem.html);

src/tools/utils.ts

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import type { ValidateFunction } from 'ajv';
22
import type Ajv from 'ajv';
33

4-
import { ACTOR_ENUM_MAX_LENGTH, ACTOR_MAX_DESCRIPTION_LENGTH } from '../const.js';
4+
import { ACTOR_ENUM_MAX_LENGTH, ACTOR_MAX_DESCRIPTION_LENGTH, RAG_WEB_BROWSER_WHITELISTED_FIELDS } from '../const.js';
55
import type { ActorInputSchemaProperties, IActorInputSchema, ISchemaProperties } from '../types.js';
66
import {
77
addGlobsProperties,
@@ -151,6 +151,64 @@ export function markInputPropertiesAsRequired(input: IActorInputSchema): Record<
151151
return properties;
152152
}
153153

154+
/**
155+
* Builds the final Actor input schema for MCP tool usage.
*
* When `input` is missing, an empty object schema is returned (no properties, nothing required)
* that carries only the `$id` from `getToolSchemaID(actorFullName)`.
* Otherwise the input is deep-copied with `structuredClone`, its `properties` (when present) are
* replaced by the result of `transformActorInputSchemaProperties`, and - when `isRag` is true -
* the properties are pruned to `RAG_WEB_BROWSER_WHITELISTED_FIELDS`. The `$id` is then set on the result.
* All changes are applied to the copy; the `input` object itself is not reassigned or written to here.
*
* @param actorFullName - Actor name passed to `getToolSchemaID` to obtain the schema `$id`.
* @param input - Actor input schema, or `undefined` when the Actor has none.
* @param isRag - When true, only the whitelisted RAG Web Browser properties are kept.
* @returns An object whose `inputSchema` field holds the resulting schema.
156+
*/
157+
export function buildActorInputSchema(actorFullName: string, input: IActorInputSchema | undefined, isRag: boolean) {
158+
// No input schema: fall back to an empty object schema that still has a `$id`
if (!input) {
159+
return {
160+
inputSchema: {
161+
$id: getToolSchemaID(actorFullName),
162+
type: 'object',
163+
properties: {},
164+
required: [],
165+
},
166+
};
167+
}
168+
169+
// Work on a deep copy (structuredClone) so the edits below are applied to the copy, not to `input`
170+
const working = structuredClone(input);
171+
172+
// Only transform when the schema actually carries a truthy `properties` value
if (working && typeof working === 'object' && 'properties' in working && working.properties) {
173+
working.properties = transformActorInputSchemaProperties(working);
174+
}
175+
176+
let finalSchema = working;
177+
// For the RAG Web Browser, keep only the whitelisted subset of input properties
if (isRag) {
178+
finalSchema = pruneSchemaPropertiesByWhitelist(finalSchema, RAG_WEB_BROWSER_WHITELISTED_FIELDS);
179+
}
180+
181+
// Set the schema `$id` on whichever object ended up as the final schema
finalSchema.$id = getToolSchemaID(actorFullName);
182+
return { inputSchema: finalSchema };
183+
}
184+
185+
/**
186+
* Returns a shallow-cloned input schema that keeps only whitelisted properties
187+
* and filters the required array accordingly. All other top-level fields are preserved.
188+
* If properties are missing, the original input is returned unchanged.
* In that case (and when `input` or `whitelist` is falsy) the very same object is returned, not a copy.
* Kept property definitions are not copied: the result references the same property objects as `input`.
189+
*
190+
* This is used specifically for apify/rag-web-browser where we want to expose
191+
* only a subset of input properties to the MCP tool without redefining the schema.
*
* @param input - Schema whose `properties` (and `required`, when it is an array) are filtered.
* @param whitelist - Names of the properties to keep.
* @returns A new schema object limited to the whitelisted properties, or `input` itself when there is nothing to prune.
192+
*/
193+
export function pruneSchemaPropertiesByWhitelist(
194+
input: IActorInputSchema,
195+
whitelist: Iterable<string>,
196+
): IActorInputSchema {
197+
// Nothing to prune: hand back the caller's object as-is
if (!input || !input.properties || typeof input.properties !== 'object' || !whitelist) return input;
198+
199+
// Collect the whitelist into a Set for membership checks
const allowed = new Set<string>(Array.from(whitelist));
200+
const newProps: Record<string, ISchemaProperties> = {};
201+
for (const key of Object.keys(input.properties)) {
202+
if (allowed.has(key)) newProps[key] = input.properties[key];
203+
}
204+
205+
// Shallow copy: every top-level field is carried over, only `properties` is swapped
const cloned: IActorInputSchema = { ...input, properties: newProps };
206+
// `required` is rewritten only when it is an array; otherwise the copied value stays as it was
if (Array.isArray(input.required)) {
207+
cloned.required = input.required.filter((k) => allowed.has(k));
208+
}
209+
return cloned;
210+
}
211+
154212
/**
155213
* Helps determine the type of items in an array schema property.
156214
* Priority order: explicit type in items > prefill type > default value type > editor type.

0 commit comments

Comments
 (0)