Skip to content

Commit 4c4839d

Browse files
committed
fix: improve llm comprehension by adding count possibility for metric
1 parent 5053f3d commit 4c4839d

File tree

1 file changed

+23
-29
lines changed

1 file changed

+23
-29
lines changed

src/mcp-servers/datasets/tools.ts

Lines changed: 23 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,14 @@ const registerTools = (server: McpServer) => {
204204
}
205205
)
206206

207+
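/** Shared zod schema for the optional `filters` input (keys are column_key + operator suffix), reused by the row-search and aggregate_data tools */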
208+
const filtersSchema = z.record(
209+
z.string().regex(/^.+_(search|eq|in|gte?|lte?|n?exists)$/, {
210+
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _in, _gte, _gt, _lte, _lt, _exists, _nexists)'
211+
}),
212+
z.string()
213+
).optional().describe('Precise filters on specific columns. This applies to each row individually. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _in (value must be in the list, case-sensitive, values separated by a comma), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _gt (greater than), _lte (less than or equal), _lt (less than), _exists (exists), and _nexists (does not exist). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris", "code_in": "A,B,C" } searches for people whose names contain "Jean", who are 30 years old or younger, who live in Paris, and whose code is A, B, or C.')
214+
207215
/**
208216
* Tool to search for specific data rows within a dataset using either full-text search OR precise filters.
209217
* This tool can search data in two ways:
@@ -224,14 +232,7 @@ const registerTools = (server: McpServer) => {
224232
inputSchema: {
225233
datasetId: z.string().describe('The unique dataset ID obtained from search_datasets or provided by the user'),
226234
query: z.string().optional().describe('French keywords for full-text search across all dataset columns (simple keywords, not sentences). Do not use with filters parameter. Examples: "Jean Dupont", "Paris", "2025"'),
227-
filters: z.record(
228-
z.string().regex(/^.+_(search|eq|in|gte?|lte?|n?exists)$/, {
229-
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _in, _gte, _gt, _lte, _lt, _exists, _nexists)'
230-
}),
231-
z.string()
232-
)
233-
.optional()
234-
.describe('Precise filters on specific columns. This applies to each row individually. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _in (value must be in the list, case-sensitive, values separated by a comma), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _gt (greater than), _lte (less than or equal), _lt (less than), _exists (exists), and _nexists (does not exist). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris", "code_in": "A,B,C" } searches for people whose names contain "Jean", who are 30 years old or younger, who live in Paris, and whose code is A, B, or C.'),
235+
filters: filtersSchema,
235236
select: z.string().optional().describe('Optional comma-separated list of column keys to include in the results. Useful when the dataset has many columns to reduce output size. If not provided, all columns are returned. Use column keys from describe_dataset. Format: column1,column2,column3 (No spaces after commas). Example: "nom,age,ville"')
236237
},
237238
outputSchema: {
@@ -313,7 +314,7 @@ const registerTools = (server: McpServer) => {
313314
* summing numeric columns, or calculating averages. It is useful for summarizing dataset content
314315
* and extracting insights without retrieving all data rows.
315316
* @param {string} datasetId - The unique ID of the dataset to aggregate (obtained from search_datasets)
316-
* @param {string} aggregationColumn - The column key to aggregate (use keys from describe_dataset)
317+
* @param {string[]} aggregationColumns - The column keys to group by (use keys from describe_dataset, min 1 column, max 3 columns)
317318
* @param {Object} aggregation - The aggregation specification to perform on the specified column.
318319
* If not provided, defaults to counting unique values in the specified column.
319320
* Example: { "column": "age", "metric": "avg" }
@@ -325,26 +326,20 @@ const registerTools = (server: McpServer) => {
325326
'aggregate_data',
326327
{
327328
title: 'Aggregate data from a dataset',
328-
description: 'Perform aggregations on dataset columns, such as counting unique values, summing numeric columns, or calculating averages. Use this after describe_dataset to understand the dataset structure and available column keys. Example: {"datasetId": "123", "aggregationColumn": ["code_sexe", "region"], "aggregation": {"column": "age", "metric": "avg"}} this will return the average age grouped by code_sexe and region. Aggregation is limited to a maximum of 3 columns.',
329+
description: 'Perform aggregations on dataset columns, such as counting unique values, summing numeric columns, or calculating averages. Use this after describe_dataset to understand the dataset structure and available column keys. Example: {"datasetId": "123", "aggregationColumns": ["code_sexe", "region"], "aggregation": {"column": "age", "metric": "avg"}} this will return the average age grouped by code_sexe and region. Aggregation is limited to a maximum of 3 columns.',
329330
inputSchema: {
330331
datasetId: z.string().describe('The unique dataset ID obtained from search_datasets tool'),
331-
aggregationColumn: z.array(z.string())
332+
aggregationColumns: z.array(z.string())
333+
.min(1, 'You must specify at least one column to aggregate')
332334
.max(3, 'You can aggregate by at most 3 columns')
333-
.describe('List of column keys to aggregate (use keys from describe_dataset, max 3 columns)'),
335+
.describe('List of column keys to aggregate (use keys from describe_dataset, min 1 column, max 3 columns)'),
334336
aggregation: z.object({
335337
column: z.string().describe('The column key to aggregate (use keys from describe_dataset)'),
336-
metric: z.enum(['sum', 'avg', 'min', 'max']).describe('Aggregation metric to perform on the column')
338+
metric: z.enum(['sum', 'avg', 'min', 'max', 'count']).describe('Aggregation metric to perform on the column. Available operations are: sum, avg, min, max, count.')
337339
})
338340
.optional()
339-
.describe('The aggregation specification to perform on the specified column. Use keys from describe_dataset. If not provided, defaults to counting unique values in the specified column.'),
340-
filters: z.record(
341-
z.string().regex(/^.+_(search|eq|in|gte?|lte?|n?exists)$/, {
342-
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _in, _gte, _gt, _lte, _lt, _exists, _nexists)'
343-
}),
344-
z.string()
345-
)
346-
.optional()
347-
.describe('Precise filters on specific columns. This applies to each row individually. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _in (value must be in the list, case-sensitive, values separated by a comma), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _gt (greater than), _lte (less than or equal), _lt (less than), _exists (exists), and _nexists (does not exist). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris", "code_in": "A,B,C" } searches for people whose names contain "Jean", who are 30 years old or younger, who live in Paris, and whose code is A, B, or C.'),
341+
.describe('The aggregation specification to perform on the specified column. Use keys from describe_dataset. If not provided, defaults to counting unique values in the aggregation column.'),
342+
filters: filtersSchema
348343
},
349344
outputSchema: {
350345
total: z.number().describe('The total number of rows in the dataset'),
@@ -358,24 +353,23 @@ const registerTools = (server: McpServer) => {
358353
readOnlyHint: true
359354
}
360355
},
361-
async (params: { datasetId: string, aggregationColumn: string[], aggregation?: { column: string, metric: 'sum' | 'avg' | 'min' | 'max' }, filters?: Record<string, string> }) => {
362-
debug('Executing aggregate_data tool with dataset:', params.datasetId, 'aggregation:', JSON.stringify(params.aggregation))
356+
async (params: { datasetId: string, aggregationColumns: string[], aggregation?: { column: string, metric: 'sum' | 'avg' | 'min' | 'max' | 'count' }, filters?: Record<string, string> }) => {
357+
debug('Executing aggregate_data tool with dataset:', params.datasetId, 'columns:', params.aggregationColumns, 'aggregation:', JSON.stringify(params.aggregation))
363358

364-
// Limit aggregationColumn to 3 elements max (runtime check for extra safety)
365-
if (params.aggregationColumn.length > 3) {
359+
// Limit aggregationColumns to 3 elements max
360+
if (params.aggregationColumns.length > 3) {
366361
throw new Error('You can aggregate by at most 3 columns')
367362
}
368363

369364
const fetchUrl = new URL(`${config.dataFairUrl}/data-fair/api/v1/datasets/${params.datasetId}/values_agg`)
370365

371366
// Build common search parameters for both fetch and source URLs
372367
const aggsParams = new URLSearchParams()
373-
aggsParams.append('field', params.aggregationColumn.slice(0, 3).join(';'))
374-
if (params.aggregation) {
368+
aggsParams.append('field', params.aggregationColumns.slice(0, 3).join(';'))
369+
if (params.aggregation && params.aggregation.metric !== 'count') {
375370
aggsParams.append('metric', params.aggregation.metric)
376371
aggsParams.append('metric_field', params.aggregation.column)
377372
}
378-
aggsParams.append('missing', 'Données manquantes')
379373

380374
if (params.filters) {
381375
for (const [key, value] of Object.entries(params.filters)) {

0 commit comments

Comments
 (0)