Skip to content

Commit e127971

Browse files
committed
feat: Add tool to get dataset schema so LLMs can understand dataset structure without fetching everything (mainly for Zuzka).
Placed under storage since that group isn't enabled by default. Also, I added runId and datasetId injection into context; hopefully it won't break anything or affect existing flows.
1 parent 90113f1 commit e127971

File tree

8 files changed

+208
-21
lines changed

8 files changed

+208
-21
lines changed

package-lock.json

Lines changed: 51 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
"apify": "^3.4.2",
4040
"apify-client": "^2.12.6",
4141
"express": "^4.21.2",
42+
"to-json-schema": "^0.2.5",
4243
"turndown": "^7.2.0",
4344
"yargs": "^17.7.2",
4445
"zod": "^3.24.1",
@@ -50,6 +51,7 @@
5051
"@apify/eslint-config": "^1.0.0",
5152
"@apify/tsconfig": "^0.1.0",
5253
"@types/express": "^4.0.0",
54+
"@types/to-json-schema": "^0.2.4",
5355
"@types/yargs": "^17.0.33",
5456
"@types/yargs-parser": "^21.0.3",
5557
"dotenv": "^16.4.7",

src/const.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export enum HelperTools {
2828
DATASET_GET = 'get-dataset',
2929
DATASET_LIST_GET = 'get-dataset-list',
3030
DATASET_GET_ITEMS = 'get-dataset-items',
31+
DATASET_SCHEMA_GET = 'get-dataset-schema',
3132
KEY_VALUE_STORE_LIST_GET = 'get-key-value-store-list',
3233
KEY_VALUE_STORE_GET = 'get-key-value-store',
3334
KEY_VALUE_STORE_KEYS_GET = 'get-key-value-store-keys',
@@ -44,11 +45,6 @@ export const defaults = {
4445
],
4546
};
4647

47-
// Actor output const
48-
export const ACTOR_OUTPUT_MAX_CHARS_PER_ITEM = 5_000;
49-
export const ACTOR_OUTPUT_TRUNCATED_MESSAGE = `Output was truncated because it will not fit into context.`
50-
+ `There is no reason to call this tool again! You can use ${HelperTools.DATASET_GET_ITEMS} tool to get more items from the dataset.`;
51-
5248
export const ACTOR_ADDITIONAL_INSTRUCTIONS = 'Never call/execute tool/Actor unless confirmed by the user.';
5349

5450
// Cache

src/mcp/server.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -495,22 +495,22 @@ export class ActorsMcpServer {
495495
const callOptions: ActorCallOptions = { memory: actorTool.memoryMbytes };
496496

497497
try {
498-
const { items } = await callActorGetDataset(
498+
const { runId, datasetId, items } = await callActorGetDataset(
499499
actorTool.actorFullName,
500500
args,
501501
apifyToken as string,
502502
callOptions,
503503
progressTracker,
504504
);
505+
const content = [
506+
{ type: 'text', text: `Actor finished with runId: ${runId}, datasetId ${datasetId}` },
507+
];
505508

506-
return {
507-
content: items.items.map((item: Record<string, unknown>) => {
508-
return {
509-
type: 'text',
510-
text: JSON.stringify(item),
511-
};
512-
}),
513-
};
509+
const itemContents = items.items.map((item: Record<string, unknown>) => {
510+
return { type: 'text', text: JSON.stringify(item) };
511+
});
512+
content.push(...itemContents);
513+
return { content };
514514
} finally {
515515
if (progressTracker) {
516516
progressTracker.stop();

src/stdio.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ const actorList = actors ? actors.split(',').map((a: string) => a.trim()) : [];
9595
// Keys of the tool categories to enable
9696
const toolCategoryKeys = argv.tools ? argv.tools.split(',').map((t: string) => t.trim()) : [];
9797

98+
// Propagate log.error to console.error for easier debugging
99+
const originalError = log.error.bind(log);
100+
log.error = (...args: Parameters<typeof log.error>) => {
101+
originalError(...args);
102+
// eslint-disable-next-line no-console
103+
console.error(...args);
104+
};
105+
98106
// Validate environment
99107
if (!process.env.APIFY_TOKEN) {
100108
log.error('APIFY_TOKEN is required but not set in the environment variables.');

src/tools/actor.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ const ajv = new Ajv({ coerceTypes: 'array', strict: false });
3737

3838
// Define a named return type for callActorGetDataset
3939
export type CallActorGetDatasetResult = {
40+
runId: string;
41+
datasetId: string;
4042
items: PaginatedList<Record<string, unknown>>;
4143
};
4244

@@ -95,8 +97,7 @@ export async function callActorGetDataset(
9597
}
9698

9799
log.info(`Actor ${actorName} finished with ${items.count} items`);
98-
99-
return { items };
100+
return { runId: actorRun.id, datasetId: completedRun.defaultDatasetId, items };
100101
} catch (error) {
101102
log.error(`Error calling actor: ${error}. Actor: ${actorName}, input: ${JSON.stringify(input)}`);
102103
throw new Error(`Error calling Actor: ${error}`);
@@ -120,9 +121,8 @@ export async function callActorGetDataset(
120121
* 4. Properties are shortened using shortenProperties()
121122
* 5. Enums are added to descriptions with examples using addEnumsToDescriptionsWithExamples()
122123
*
123-
* @param {string[]} actors - An array of actor IDs or Actor full names.
124-
* @param {string} apifyToken - The Apify token to use for authentication.
125-
* @returns {Promise<Tool[]>} - A promise that resolves to an array of MCP tools.
124+
* @param {ActorInfo[]} actorsInfo - An array of ActorInfo objects with webServerMcpPath and actorDefinitionPruned.
125+
* @returns {Promise<ToolEntry[]>} - A promise that resolves to an array of MCP tools.
126126
*/
127127
export async function getNormalActorsAsTools(
128128
actorsInfo: ActorInfo[],

src/tools/dataset.ts

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { Ajv } from 'ajv';
2+
import toJsonSchema from 'to-json-schema';
23
import { z } from 'zod';
34
import zodToJsonSchema from 'zod-to-json-schema';
45

@@ -112,3 +113,132 @@ export const getDatasetItems: ToolEntry = {
112113
},
113114
} as InternalTool,
114115
};
116+
117+
/**
118+
* Function to recursively remove empty arrays from an object
119+
*/
120+
function removeEmptyArrays(obj: unknown): unknown {
121+
if (Array.isArray(obj)) {
122+
// If the item is an array, recursively call removeEmptyArrays on each element.
123+
return obj.map((item) => removeEmptyArrays(item));
124+
}
125+
126+
if (typeof obj !== 'object' || obj === null) {
127+
// Return primitives and null values as is.
128+
return obj;
129+
}
130+
131+
// Use reduce to build a new object, excluding keys with empty arrays.
132+
return Object.entries(obj).reduce((acc, [key, value]) => {
133+
const processedValue = removeEmptyArrays(value);
134+
135+
// Exclude the key if the processed value is an empty array.
136+
if (Array.isArray(processedValue) && processedValue.length === 0) {
137+
return acc;
138+
}
139+
140+
acc[key] = processedValue;
141+
return acc;
142+
}, {} as Record<string, unknown>);
143+
}
144+
145+
const getDatasetSchemaArgs = z.object({
146+
datasetId: z.string()
147+
.min(1)
148+
.describe('Dataset ID or username~dataset-name.'),
149+
limit: z.number().optional()
150+
.describe('Maximum number of items to use for schema generation. Default is 5.')
151+
.default(5),
152+
clean: z.boolean().optional()
153+
.describe('If true, uses only non-empty items and skips hidden fields (starting with #). Default is true.')
154+
.default(true),
155+
arrayMode: z.enum(['first', 'all']).optional()
156+
.describe('Strategy for handling arrays. "first" uses first item as template, "all" merges all items. Default is "all".')
157+
.default('all'),
158+
additionalProperties: z.boolean().optional()
159+
.describe('If true, allows additional properties in objects. Default is true.')
160+
.default(true),
161+
});
162+
163+
/**
164+
* Generates a JSON schema from dataset items
165+
*/
166+
export const getDatasetSchema: ToolEntry = {
167+
type: 'internal',
168+
tool: {
169+
name: HelperTools.DATASET_SCHEMA_GET,
170+
actorFullName: HelperTools.DATASET_SCHEMA_GET,
171+
description: 'Generates a JSON schema from dataset items. '
172+
+ 'The schema describes the structure of the data in the dataset, which can be used for validation, documentation, or data processing.'
173+
+ 'Since the dataset can be large it is convenient to understand the structure of the dataset before getting dataset items.',
174+
inputSchema: zodToJsonSchema(getDatasetSchemaArgs),
175+
ajvValidate: ajv.compile(zodToJsonSchema(getDatasetSchemaArgs)),
176+
call: async (toolArgs) => {
177+
const { args, apifyToken } = toolArgs;
178+
const parsed = getDatasetSchemaArgs.parse(args);
179+
const client = new ApifyClient({ token: apifyToken });
180+
181+
try {
182+
// Get dataset items
183+
const datasetResponse = await client.dataset(parsed.datasetId).listItems({
184+
clean: parsed.clean,
185+
limit: parsed.limit,
186+
});
187+
188+
if (!datasetResponse) {
189+
return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' not found.` }] };
190+
}
191+
192+
const datasetItems = datasetResponse.items;
193+
194+
if (datasetItems.length === 0) {
195+
return { content: [{ type: 'text', text: `Dataset '${parsed.datasetId}' is empty.` }] };
196+
}
197+
198+
// Clean the dataset items by removing empty arrays
199+
const cleanedDatasetItems = datasetItems.map((item) => removeEmptyArrays(item));
200+
201+
try {
202+
// Generate schema with options to handle arrays better
203+
const schema = toJsonSchema(cleanedDatasetItems, {
204+
arrays: { mode: parsed.arrayMode },
205+
objects: { additionalProperties: parsed.additionalProperties },
206+
// strings: { detectFormat: false },
207+
});
208+
209+
return {
210+
content: [{
211+
type: 'text',
212+
text: JSON.stringify(schema),
213+
}],
214+
};
215+
} catch (schemaError) {
216+
// Fallback: try with a simpler approach
217+
try {
218+
const fallbackSchema = toJsonSchema(cleanedDatasetItems, {
219+
arrays: { mode: 'first' },
220+
});
221+
222+
return {
223+
content: [{ type: 'text', text: JSON.stringify(fallbackSchema) }],
224+
};
225+
} catch (fallbackError) {
226+
return {
227+
content: [{
228+
type: 'text',
229+
text: `Error generating schema: ${(schemaError as Error).message}. Fallback also failed: ${(fallbackError as Error).message}`,
230+
}],
231+
};
232+
}
233+
}
234+
} catch (error) {
235+
return {
236+
content: [{
237+
type: 'text',
238+
text: `Error generating schema: ${(error as Error).message}`,
239+
}],
240+
};
241+
}
242+
},
243+
} as InternalTool,
244+
};

src/tools/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Import specific tools that are being used
22
import type { ToolCategory } from '../types.js';
33
import { callActor, callActorGetDataset, getActorsAsTools } from './actor.js';
4-
import { getDataset, getDatasetItems } from './dataset.js';
4+
import { getDataset, getDatasetItems, getDatasetSchema } from './dataset.js';
55
import { getUserDatasetsList } from './dataset_collection.js';
66
import { fetchApifyDocsTool } from './fetch-apify-docs.js';
77
import { getActorDetailsTool } from './get-actor-details.js';
@@ -26,6 +26,7 @@ export const toolCategories = {
2626
storage: [
2727
getDataset,
2828
getDatasetItems,
29+
getDatasetSchema,
2930
getKeyValueStore,
3031
getKeyValueStoreKeys,
3132
getKeyValueStoreRecord,

0 commit comments

Comments (0)