Skip to content

Commit 9ad36d1

Browse files
authored
feat: Add tool to get dataset schema so LLMs can understand dataset structure without fetching everything (#190)
* feat: Add a tool to get the dataset schema so LLMs can understand the dataset structure without fetching everything (mainly for Zuzka).
* fix: Simplify the code by removing the nested try/catch.
1 parent eca53e7 commit 9ad36d1

File tree

8 files changed

+189
-21
lines changed

8 files changed

+189
-21
lines changed

package-lock.json

Lines changed: 51 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
"apify": "^3.4.2",
4545
"apify-client": "^2.12.6",
4646
"express": "^4.21.2",
47+
"to-json-schema": "^0.2.5",
4748
"turndown": "^7.2.0",
4849
"yargs": "^17.7.2",
4950
"zod": "^3.24.1",
@@ -55,6 +56,7 @@
5556
"@apify/eslint-config": "^1.0.0",
5657
"@apify/tsconfig": "^0.1.0",
5758
"@types/express": "^4.0.0",
59+
"@types/to-json-schema": "^0.2.4",
5860
"@types/yargs": "^17.0.33",
5961
"@types/yargs-parser": "^21.0.3",
6062
"dotenv": "^16.4.7",

src/const.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export enum HelperTools {
2828
DATASET_GET = 'get-dataset',
2929
DATASET_LIST_GET = 'get-dataset-list',
3030
DATASET_GET_ITEMS = 'get-dataset-items',
31+
DATASET_SCHEMA_GET = 'get-dataset-schema',
3132
KEY_VALUE_STORE_LIST_GET = 'get-key-value-store-list',
3233
KEY_VALUE_STORE_GET = 'get-key-value-store',
3334
KEY_VALUE_STORE_KEYS_GET = 'get-key-value-store-keys',
@@ -44,11 +45,6 @@ export const defaults = {
4445
],
4546
};
4647

47-
// Actor output const
48-
export const ACTOR_OUTPUT_MAX_CHARS_PER_ITEM = 5_000;
49-
export const ACTOR_OUTPUT_TRUNCATED_MESSAGE = `Output was truncated because it will not fit into context.`
50-
+ `There is no reason to call this tool again! You can use ${HelperTools.DATASET_GET_ITEMS} tool to get more items from the dataset.`;
51-
5248
export const ACTOR_ADDITIONAL_INSTRUCTIONS = 'Never call/execute tool/Actor unless confirmed by the user.';
5349

5450
// Cache

src/mcp/server.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -544,22 +544,22 @@ export class ActorsMcpServer {
544544
const callOptions: ActorCallOptions = { memory: actorTool.memoryMbytes };
545545

546546
try {
547-
const { items } = await callActorGetDataset(
547+
const { runId, datasetId, items } = await callActorGetDataset(
548548
actorTool.actorFullName,
549549
args,
550550
apifyToken as string,
551551
callOptions,
552552
progressTracker,
553553
);
554+
const content = [
555+
{ type: 'text', text: `Actor finished with runId: ${runId}, datasetId ${datasetId}` },
556+
];
554557

555-
return {
556-
content: items.items.map((item: Record<string, unknown>) => {
557-
return {
558-
type: 'text',
559-
text: JSON.stringify(item),
560-
};
561-
}),
562-
};
558+
const itemContents = items.items.map((item: Record<string, unknown>) => {
559+
return { type: 'text', text: JSON.stringify(item) };
560+
});
561+
content.push(...itemContents);
562+
return { content };
563563
} finally {
564564
if (progressTracker) {
565565
progressTracker.stop();

src/stdio.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ const actorList = actors ? actors.split(',').map((a: string) => a.trim()) : [];
9595
// Keys of the tool categories to enable
9696
const toolCategoryKeys = argv.tools ? argv.tools.split(',').map((t: string) => t.trim()) : [];
9797

98+
// Propagate log.error to console.error for easier debugging
99+
const originalError = log.error.bind(log);
100+
log.error = (...args: Parameters<typeof log.error>) => {
101+
originalError(...args);
102+
// eslint-disable-next-line no-console
103+
console.error(...args);
104+
};
105+
98106
// Validate environment
99107
if (!process.env.APIFY_TOKEN) {
100108
log.error('APIFY_TOKEN is required but not set in the environment variables.');

src/tools/actor.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ const ajv = new Ajv({ coerceTypes: 'array', strict: false });
3737

3838
// Define a named return type for callActorGetDataset
3939
export type CallActorGetDatasetResult = {
40+
runId: string;
41+
datasetId: string;
4042
items: PaginatedList<Record<string, unknown>>;
4143
};
4244

@@ -95,8 +97,7 @@ export async function callActorGetDataset(
9597
}
9698

9799
log.info(`Actor ${actorName} finished with ${items.count} items`);
98-
99-
return { items };
100+
return { runId: actorRun.id, datasetId: completedRun.defaultDatasetId, items };
100101
} catch (error) {
101102
log.error(`Error calling actor: ${error}. Actor: ${actorName}, input: ${JSON.stringify(input)}`);
102103
throw new Error(`Error calling Actor: ${error}`);
@@ -120,9 +121,8 @@ export async function callActorGetDataset(
120121
* 4. Properties are shortened using shortenProperties()
121122
* 5. Enums are added to descriptions with examples using addEnumsToDescriptionsWithExamples()
122123
*
123-
* @param {string[]} actors - An array of actor IDs or Actor full names.
124-
* @param {string} apifyToken - The Apify token to use for authentication.
125-
* @returns {Promise<Tool[]>} - A promise that resolves to an array of MCP tools.
124+
* @param {ActorInfo[]} actorsInfo - An array of ActorInfo objects with webServerMcpPath and actorDefinitionPruned.
125+
* @returns {Promise<ToolEntry[]>} - A promise that resolves to an array of MCP tools.
126126
*/
127127
export async function getNormalActorsAsTools(
128128
actorsInfo: ActorInfo[],

src/tools/dataset.ts

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { Ajv } from 'ajv';
2+
import toJsonSchema from 'to-json-schema';
23
import { z } from 'zod';
34
import zodToJsonSchema from 'zod-to-json-schema';
45

@@ -112,3 +113,113 @@ export const getDatasetItems: ToolEntry = {
112113
},
113114
} as InternalTool,
114115
};
116+
117+
/**
 * Recursively removes object properties whose value is an empty array.
 *
 * - Arrays are processed element by element and always kept as arrays; an empty
 *   array is only dropped when it is the value of an object property.
 * - Primitives and `null` are returned unchanged.
 * - Objects are rebuilt from their own enumerable entries, so the input is never mutated.
 *
 * @param obj - Any JSON-like value (typically a single dataset item).
 * @returns A copy of `obj` without object keys that hold empty arrays.
 */
function removeEmptyArrays(obj: unknown): unknown {
    if (Array.isArray(obj)) {
        return obj.map((item) => removeEmptyArrays(item));
    }

    if (typeof obj !== 'object' || obj === null) {
        return obj;
    }

    const keptEntries = Object.entries(obj)
        .map(([key, value]): [string, unknown] => [key, removeEmptyArrays(value)])
        // The check runs on the processed value, mirroring the recursion result.
        .filter(([, processedValue]) => !(Array.isArray(processedValue) && processedValue.length === 0));

    // Object.fromEntries defines own data properties. A plain `acc[key] = value`
    // would invoke the `__proto__` setter for a key named "__proto__" (possible in
    // JSON-parsed data), silently dropping the key and swapping the prototype.
    return Object.fromEntries(keptEntries);
}
144+
145+
// Input arguments for the dataset schema tool. Every field except `datasetId`
// is optional and falls back to the default stated in its description.
const getDatasetSchemaArgs = z.object({
    datasetId: z.string()
        .min(1)
        .describe('Dataset ID or username~dataset-name.'),
    // Only a positive whole number of sample items makes sense; reject anything
    // else during validation instead of forwarding it to the dataset API.
    limit: z.number().int().positive().optional()
        .describe('Maximum number of items to use for schema generation. Default is 5.')
        .default(5),
    clean: z.boolean().optional()
        .describe('If true, uses only non-empty items and skips hidden fields (starting with #). Default is true.')
        .default(true),
    arrayMode: z.enum(['first', 'all']).optional()
        .describe('Strategy for handling arrays. "first" uses first item as template, "all" merges all items. Default is "all".')
        .default('all'),
    additionalProperties: z.boolean().optional()
        .describe('If true, allows additional properties in objects. Default is true.')
        .default(true),
});
162+
163+
/**
 * Internal tool that generates a JSON schema from a sample of dataset items.
 *
 * Flow of `call`:
 * 1. Parse the arguments with `getDatasetSchemaArgs` (applies defaults).
 * 2. Fetch up to `limit` items from the dataset, honoring the `clean` flag.
 * 3. Return a plain-text message if the response is missing or has no items.
 * 4. Strip empty-array properties from the items and generate the schema with
 *    the requested `arrayMode` and `additionalProperties` options.
 * 5. If generation throws, retry once with array mode "first" and default
 *    object options; an error from this retry propagates to the caller.
 *
 * The result is a single text content item holding the stringified schema.
 */
export const getDatasetSchema: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.DATASET_SCHEMA_GET,
        actorFullName: HelperTools.DATASET_SCHEMA_GET,
        description: 'Generates a JSON schema from dataset items. '
            + 'The schema describes the structure of the data in the dataset, which can be used for validation, documentation, or data processing. '
            + 'Since the dataset can be large it is convenient to understand the structure of the dataset before getting dataset items.',
        inputSchema: zodToJsonSchema(getDatasetSchemaArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getDatasetSchemaArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            const parsed = getDatasetSchemaArgs.parse(args);
            const client = new ApifyClient({ token: apifyToken });

            // Fetch only a small sample; the schema is inferred from these items.
            const datasetResponse = await client.dataset(parsed.datasetId).listItems({
                clean: parsed.clean,
                limit: parsed.limit,
            });

            if (!datasetResponse) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
            }

            const datasetItems = datasetResponse.items;

            if (datasetItems.length === 0) {
                return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' is empty.` }] };
            }

            // Drop empty-array properties before inference.
            // NOTE(review): presumably because an empty array carries no element type - confirm.
            const cleanedDatasetItems = datasetItems.map((item) => removeEmptyArrays(item));

            // First attempt: honor the caller's array and object options.
            try {
                const schema = toJsonSchema(cleanedDatasetItems, {
                    arrays: { mode: parsed.arrayMode },
                    objects: { additionalProperties: parsed.additionalProperties },
                });

                return {
                    content: [{
                        type: 'text',
                        text: JSON.stringify(schema),
                    }],
                };
            } catch {
                // Deliberate best-effort fallback: the first error is discarded and the
                // schema is derived from the first array item only. If this also throws,
                // the error is not caught here.
                const fallbackSchema = toJsonSchema(cleanedDatasetItems, {
                    arrays: { mode: 'first' },
                });

                return {
                    content: [{ type: 'text', text: JSON.stringify(fallbackSchema) }],
                };
            }
        },
    } as InternalTool,
};

src/tools/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Import specific tools that are being used
22
import type { ToolCategory } from '../types.js';
33
import { callActor, callActorGetDataset, getActorsAsTools } from './actor.js';
4-
import { getDataset, getDatasetItems } from './dataset.js';
4+
import { getDataset, getDatasetItems, getDatasetSchema } from './dataset.js';
55
import { getUserDatasetsList } from './dataset_collection.js';
66
import { fetchApifyDocsTool } from './fetch-apify-docs.js';
77
import { getActorDetailsTool } from './get-actor-details.js';
@@ -26,6 +26,7 @@ export const toolCategories = {
2626
storage: [
2727
getDataset,
2828
getDatasetItems,
29+
getDatasetSchema,
2930
getKeyValueStore,
3031
getKeyValueStoreKeys,
3132
getKeyValueStoreRecord,

0 commit comments

Comments
 (0)