Skip to content

Commit e127971

Browse files
committed
feat: Add tool to get dataset schema so LLMs can understand dataset structure without fetching everything (mainly for Zuzka).
Placed under storage since that group isn't enabled by default. Also, I added runId and datasetId injection into context; hopefully it won't break anything or affect existing flows.
1 parent 90113f1 commit e127971

File tree

8 files changed

+208
-21
lines changed

8 files changed

+208
-21
lines changed

package-lock.json

Lines changed: 51 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
"apify": "^3.4.2",
4040
"apify-client": "^2.12.6",
4141
"express": "^4.21.2",
42+
"to-json-schema": "^0.2.5",
4243
"turndown": "^7.2.0",
4344
"yargs": "^17.7.2",
4445
"zod": "^3.24.1",
@@ -50,6 +51,7 @@
5051
"@apify/eslint-config": "^1.0.0",
5152
"@apify/tsconfig": "^0.1.0",
5253
"@types/express": "^4.0.0",
54+
"@types/to-json-schema": "^0.2.4",
5355
"@types/yargs": "^17.0.33",
5456
"@types/yargs-parser": "^21.0.3",
5557
"dotenv": "^16.4.7",

src/const.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export enum HelperTools {
2828
DATASET_GET = 'get-dataset',
2929
DATASET_LIST_GET = 'get-dataset-list',
3030
DATASET_GET_ITEMS = 'get-dataset-items',
31+
DATASET_SCHEMA_GET = 'get-dataset-schema',
3132
KEY_VALUE_STORE_LIST_GET = 'get-key-value-store-list',
3233
KEY_VALUE_STORE_GET = 'get-key-value-store',
3334
KEY_VALUE_STORE_KEYS_GET = 'get-key-value-store-keys',
@@ -44,11 +45,6 @@ export const defaults = {
4445
],
4546
};
4647

47-
// Actor output const
48-
export const ACTOR_OUTPUT_MAX_CHARS_PER_ITEM = 5_000;
49-
export const ACTOR_OUTPUT_TRUNCATED_MESSAGE = `Output was truncated because it will not fit into context.`
50-
+ `There is no reason to call this tool again! You can use ${HelperTools.DATASET_GET_ITEMS} tool to get more items from the dataset.`;
51-
5248
export const ACTOR_ADDITIONAL_INSTRUCTIONS = 'Never call/execute tool/Actor unless confirmed by the user.';
5349

5450
// Cache

src/mcp/server.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -495,22 +495,22 @@ export class ActorsMcpServer {
495495
const callOptions: ActorCallOptions = { memory: actorTool.memoryMbytes };
496496

497497
try {
498-
const { items } = await callActorGetDataset(
498+
const { runId, datasetId, items } = await callActorGetDataset(
499499
actorTool.actorFullName,
500500
args,
501501
apifyToken as string,
502502
callOptions,
503503
progressTracker,
504504
);
505+
const content = [
506+
{ type: 'text', text: `Actor finished with runId: ${runId}, datasetId ${datasetId}` },
507+
];
505508

506-
return {
507-
content: items.items.map((item: Record<string, unknown>) => {
508-
return {
509-
type: 'text',
510-
text: JSON.stringify(item),
511-
};
512-
}),
513-
};
509+
const itemContents = items.items.map((item: Record<string, unknown>) => {
510+
return { type: 'text', text: JSON.stringify(item) };
511+
});
512+
content.push(...itemContents);
513+
return { content };
514514
} finally {
515515
if (progressTracker) {
516516
progressTracker.stop();

src/stdio.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ const actorList = actors ? actors.split(',').map((a: string) => a.trim()) : [];
9595
// Keys of the tool categories to enable
9696
const toolCategoryKeys = argv.tools ? argv.tools.split(',').map((t: string) => t.trim()) : [];
9797

98+
// Propagate log.error to console.error for easier debugging
99+
const originalError = log.error.bind(log);
100+
log.error = (...args: Parameters<typeof log.error>) => {
101+
originalError(...args);
102+
// eslint-disable-next-line no-console
103+
console.error(...args);
104+
};
105+
98106
// Validate environment
99107
if (!process.env.APIFY_TOKEN) {
100108
log.error('APIFY_TOKEN is required but not set in the environment variables.');

src/tools/actor.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ const ajv = new Ajv({ coerceTypes: 'array', strict: false });
3737

3838
// Define a named return type for callActorGetDataset
3939
export type CallActorGetDatasetResult = {
40+
runId: string;
41+
datasetId: string;
4042
items: PaginatedList<Record<string, unknown>>;
4143
};
4244

@@ -95,8 +97,7 @@ export async function callActorGetDataset(
9597
}
9698

9799
log.info(`Actor ${actorName} finished with ${items.count} items`);
98-
99-
return { items };
100+
return { runId: actorRun.id, datasetId: completedRun.defaultDatasetId, items };
100101
} catch (error) {
101102
log.error(`Error calling actor: ${error}. Actor: ${actorName}, input: ${JSON.stringify(input)}`);
102103
throw new Error(`Error calling Actor: ${error}`);
@@ -120,9 +121,8 @@ export async function callActorGetDataset(
120121
* 4. Properties are shortened using shortenProperties()
121122
* 5. Enums are added to descriptions with examples using addEnumsToDescriptionsWithExamples()
122123
*
123-
* @param {string[]} actors - An array of actor IDs or Actor full names.
124-
* @param {string} apifyToken - The Apify token to use for authentication.
125-
* @returns {Promise<Tool[]>} - A promise that resolves to an array of MCP tools.
124+
* @param {ActorInfo[]} actorsInfo - An array of ActorInfo objects with webServerMcpPath and actorDefinitionPruned.
125+
* @returns {Promise<ToolEntry[]>} - A promise that resolves to an array of MCP tools.
126126
*/
127127
export async function getNormalActorsAsTools(
128128
actorsInfo: ActorInfo[],

src/tools/dataset.ts

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { Ajv } from 'ajv';
2+
import toJsonSchema from 'to-json-schema';
23
import { z } from 'zod';
34
import zodToJsonSchema from 'zod-to-json-schema';
45

@@ -112,3 +113,132 @@ export const getDatasetItems: ToolEntry = {
112113
},
113114
} as InternalTool,
114115
};
116+
117+
/**
118+
* Function to recursively remove empty arrays from an object
119+
*/
120+
function removeEmptyArrays(obj: unknown): unknown {
121+
if (Array.isArray(obj)) {
122+
// If the item is an array, recursively call removeEmptyArrays on each element.
123+
return obj.map((item) => removeEmptyArrays(item));
124+
}
125+
126+
if (typeof obj !== 'object' || obj === null) {
127+
// Return primitives and null values as is.
128+
return obj;
129+
}
130+
131+
// Use reduce to build a new object, excluding keys with empty arrays.
132+
return Object.entries(obj).reduce((acc, [key, value]) => {
133+
const processedValue = removeEmptyArrays(value);
134+
135+
// Exclude the key if the processed value is an empty array.
136+
if (Array.isArray(processedValue) && processedValue.length === 0) {
137+
return acc;
138+
}
139+
140+
acc[key] = processedValue;
141+
return acc;
142+
}, {} as Record<string, unknown>);
143+
}
144+
145+
const getDatasetSchemaArgs = z.object({
146+
datasetId: z.string()
147+
.min(1)
148+
.describe('Dataset ID or username~dataset-name.'),
149+
limit: z.number().optional()
150+
.describe('Maximum number of items to use for schema generation. Default is 5.')
151+
.default(5),
152+
clean: z.boolean().optional()
153+
.describe('If true, uses only non-empty items and skips hidden fields (starting with #). Default is true.')
154+
.default(true),
155+
arrayMode: z.enum(['first', 'all']).optional()
156+
.describe('Strategy for handling arrays. "first" uses first item as template, "all" merges all items. Default is "all".')
157+
.default('all'),
158+
additionalProperties: z.boolean().optional()
159+
.describe('If true, allows additional properties in objects. Default is true.')
160+
.default(true),
161+
});
162+
163+
/**
164+
* Generates a JSON schema from dataset items
165+
*/
166+
export const getDatasetSchema: ToolEntry = {
167+
type: 'internal',
168+
tool: {
169+
name: HelperTools.DATASET_SCHEMA_GET,
170+
actorFullName: HelperTools.DATASET_SCHEMA_GET,
171+
description: 'Generates a JSON schema from dataset items. '
172+
+ 'The schema describes the structure of the data in the dataset, which can be used for validation, documentation, or data processing.'
173+
+ 'Since the dataset can be large it is convenient to understand the structure of the dataset before getting dataset items.',
174+
inputSchema: zodToJsonSchema(getDatasetSchemaArgs),
175+
ajvValidate: ajv.compile(zodToJsonSchema(getDatasetSchemaArgs)),
176+
call: async (toolArgs) => {
177+
const { args, apifyToken } = toolArgs;
178+
const parsed = getDatasetSchemaArgs.parse(args);
179+
const client = new ApifyClient({ token: apifyToken });
180+
181+
try {
182+
// Get dataset items
183+
const datasetResponse = await client.dataset(parsed.datasetId).listItems({
184+
clean: parsed.clean,
185+
limit: parsed.limit,
186+
});
187+
188+
if (!datasetResponse) {
189+
return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
190+
}
191+
192+
const datasetItems = datasetResponse.items;
193+
194+
if (datasetItems.length === 0) {
195+
return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' is empty.` }] };
196+
}
197+
198+
// Clean the dataset items by removing empty arrays
199+
const cleanedDatasetItems = datasetItems.map((item) => removeEmptyArrays(item));
200+
201+
try {
202+
// Generate schema with options to handle arrays better
203+
const schema = toJsonSchema(cleanedDatasetItems, {
204+
arrays: { mode: parsed.arrayMode },
205+
objects: { additionalProperties: parsed.additionalProperties },
206+
// strings: { detectFormat: false },
207+
});
208+
209+
return {
210+
content: [{
211+
type: 'text',
212+
text: JSON.stringify(schema),
213+
}],
214+
};
215+
} catch (schemaError) {
216+
// Fallback: try with a simpler approach
217+
try {
218+
const fallbackSchema = toJsonSchema(cleanedDatasetItems, {
219+
arrays: { mode: 'first' },
220+
});
221+
222+
return {
223+
content: [{ type: 'text', text: JSON.stringify(fallbackSchema) }],
224+
};
225+
} catch (fallbackError) {
226+
return {
227+
content: [{
228+
type: 'text',
229+
text: `Error generating schema: ${(schemaError as Error).message}. Fallback also failed: ${(fallbackError as Error).message}`,
230+
}],
231+
};
232+
}
233+
}
234+
} catch (error) {
235+
return {
236+
content: [{
237+
type: 'text',
238+
text: `Error generating schema: ${(error as Error).message}`,
239+
}],
240+
};
241+
}
242+
},
243+
} as InternalTool,
244+
};

src/tools/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Import specific tools that are being used
22
import type { ToolCategory } from '../types.js';
33
import { callActor, callActorGetDataset, getActorsAsTools } from './actor.js';
4-
import { getDataset, getDatasetItems } from './dataset.js';
4+
import { getDataset, getDatasetItems, getDatasetSchema } from './dataset.js';
55
import { getUserDatasetsList } from './dataset_collection.js';
66
import { fetchApifyDocsTool } from './fetch-apify-docs.js';
77
import { getActorDetailsTool } from './get-actor-details.js';
@@ -26,6 +26,7 @@ export const toolCategories = {
2626
storage: [
2727
getDataset,
2828
getDatasetItems,
29+
getDatasetSchema,
2930
getKeyValueStore,
3031
getKeyValueStoreKeys,
3132
getKeyValueStoreRecord,

0 commit comments

Comments (0)