Skip to content

Commit 5053f3d

Browse files
committed
feat: add aggregation support for multiple fields
1 parent 0ab3de1 commit 5053f3d

File tree

1 file changed

+57
-22
lines changed

1 file changed

+57
-22
lines changed

src/mcp-servers/datasets/tools.ts

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -225,13 +225,13 @@ const registerTools = (server: McpServer) => {
225225
datasetId: z.string().describe('The unique dataset ID obtained from search_datasets or provided by the user'),
226226
query: z.string().optional().describe('French keywords for full-text search across all dataset columns (simple keywords, not sentences). Do not use with filters parameter. Examples: "Jean Dupont", "Paris", "2025"'),
227227
filters: z.record(
228-
z.string().regex(/^.+_(search|eq|in|gte?|lte?|lt|gt)$/, {
229-
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _in, _gte, _gt, _lte, _lt)'
228+
z.string().regex(/^.+_(search|eq|in|gte?|lte?|n?exists)$/, {
229+
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _in, _gte, _gt, _lte, _lt, _exists, _nexists)'
230230
}),
231231
z.string()
232232
)
233233
.optional()
234-
.describe('Precise filters on specific columns. Ideal for multi-condition queries or range searches. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _in (value must be in the list, case-sensitive, values separated by a comma), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _gt (greater than), _lte (less than or equal), _lt (less than). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris", "code_in": "A,B,C" } searches for people whose names contain "Jean", who are 30 years old or younger, who live in Paris, and whose code is A, B, or C.'),
234+
.describe('Precise filters on specific columns. This applies to each row individually. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _in (value must be in the list, case-sensitive, values separated by a comma), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _gt (greater than), _lte (less than or equal), _lt (less than), _exists (exists), and _nexists (does not exist). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris", "code_in": "A,B,C" } searches for people whose names contain "Jean", who are 30 years old or younger, who live in Paris, and whose code is A, B, or C.'),
235235
select: z.string().optional().describe('Optional comma-separated list of column keys to include in the results. Useful when the dataset has many columns to reduce output size. If not provided, all columns are returned. Use column keys from describe_dataset. Format: column1,column2,column3 (No spaces after commas). Example: "nom,age,ville"')
236236
},
237237
outputSchema: {
@@ -297,6 +297,16 @@ const registerTools = (server: McpServer) => {
297297
}
298298
)
299299

300+
/** Recursive zod type for the aggregation output schema (each result may nest further aggregation results) */
301+
const AggregationResult: z.ZodType<any> = z.object({
302+
total: z.number().describe('Total number of rows aggregated for this column'),
303+
totalAggregated: z.number().optional().describe('Total number of different values aggregated for this column'),
304+
nonRepresented: z.number().optional().describe('The number of non-represented rows for this column'),
305+
columnValue: z.union([z.string(), z.number()]).describe('The value of the aggregated column (string or number)'),
306+
metricValue: z.number().nullable().optional().describe('The value of the aggregation metric (e.g., sum, avg) on the selected column'),
307+
aggregations: z.lazy(() => z.array(AggregationResult)).optional().describe('Nested aggregation results when multiple columns are specified (max 3 levels deep)')
308+
})
309+
300310
/**
301311
* Tool to aggregate data from a specific dataset.
302312
* This tool allows users to perform aggregations on dataset columns, such as counting unique values,
@@ -315,48 +325,64 @@ const registerTools = (server: McpServer) => {
315325
'aggregate_data',
316326
{
317327
title: 'Aggregate data from a dataset',
318-
description: 'Perform aggregations on dataset columns, such as counting unique values, summing numeric columns, or calculating averages. Use this after describe_dataset to understand the dataset structure and available column keys. Example: {"datasetId": "123", "aggregationColumn": "code_sexe", "operation": {"column": "age", "operation": "avg"}} this will return the average age grouped by code_sexe',
328+
description: 'Perform aggregations on dataset columns, such as counting unique values, summing numeric columns, or calculating averages. Use this after describe_dataset to understand the dataset structure and available column keys. Example: {"datasetId": "123", "aggregationColumn": ["code_sexe", "region"], "aggregation": {"column": "age", "metric": "avg"}} this will return the average age grouped by code_sexe and region. Aggregation is limited to a maximum of 3 columns.',
319329
inputSchema: {
320330
datasetId: z.string().describe('The unique dataset ID obtained from search_datasets tool'),
321-
aggregationColumn: z.string().describe('The column key to aggregate (use keys from describe_dataset)'),
331+
aggregationColumn: z.array(z.string())
332+
.max(3, 'You can aggregate by at most 3 columns')
333+
.describe('List of column keys to aggregate (use keys from describe_dataset, max 3 columns)'),
322334
aggregation: z.object({
323335
column: z.string().describe('The column key to aggregate (use keys from describe_dataset)'),
324336
metric: z.enum(['sum', 'avg', 'min', 'max']).describe('Aggregation metric to perform on the column')
325337
})
326338
.optional()
327-
.describe('The aggregation specification to perform on the specified column. Use keys from describe_dataset. If not provided, defaults to counting unique values in the specified column.')
339+
.describe('The aggregation specification to perform on the specified column. Use keys from describe_dataset. If not provided, defaults to counting unique values in the specified column.'),
340+
filters: z.record(
341+
z.string().regex(/^.+_(search|eq|in|gte?|lte?|n?exists)$/, {
342+
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _in, _gte, _gt, _lte, _lt, _exists, _nexists)'
343+
}),
344+
z.string()
345+
)
346+
.optional()
347+
.describe('Precise filters on specific columns. This applies to each row individually. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _in (value must be in the list, case-sensitive, values separated by a comma), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _gt (greater than), _lte (less than or equal), _lt (less than), _exists (exists), and _nexists (does not exist). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris", "code_in": "A,B,C" } searches for people whose names contain "Jean", who are 30 years old or younger, who live in Paris, and whose code is A, B, or C.'),
328348
},
329349
outputSchema: {
330350
total: z.number().describe('The total number of rows in the dataset'),
331351
totalAggregated: z.number().describe('The total number of different values aggregated across all specified columns'),
352+
nonRepresented: z.number().describe('The number of non-represented rows in the dataset, 0 if totalAggregated is less than 20, otherwise the number of non-represented rows'),
332353
datasetId: z.string().describe('The dataset ID that was aggregated'),
333-
filteredViewUrl: z.string().describe('Direct URL to view the filtered dataset results in JSON format (must be included in responses for citation and direct access to aggregated view)'),
334-
aggregations: z.array(
335-
z.object({
336-
total: z.number().describe('Total number of rows aggregated for this column'),
337-
columnValue: z.string().describe('The value of the aggregated column'),
338-
metricValue: z.number().optional().describe('The value of the aggregation metric (e.g., sum, avg) on the selected column'),
339-
})
340-
).describe('Array of aggregation results for each specified column')
354+
requestUrl: z.string().describe('Direct URL to API results in JSON format (must be included in responses for citation and direct access to aggregated view)'),
355+
aggregations: z.array(AggregationResult).describe('Array of aggregation results for each specified column (limited to 20 rows)')
341356
},
342357
annotations: {
343358
readOnlyHint: true
344359
}
345360
},
346-
async (params: { datasetId: string, aggregationColumn: string, aggregation?: { column: string, metric: 'sum' | 'avg' | 'min' | 'max' } }) => {
361+
async (params: { datasetId: string, aggregationColumn: string[], aggregation?: { column: string, metric: 'sum' | 'avg' | 'min' | 'max' }, filters?: Record<string, string> }) => {
347362
debug('Executing aggregate_data tool with dataset:', params.datasetId, 'aggregation:', JSON.stringify(params.aggregation))
348363

364+
// Limit aggregationColumn to 3 elements max (runtime check for extra safety)
365+
if (params.aggregationColumn.length > 3) {
366+
throw new Error('You can aggregate by at most 3 columns')
367+
}
368+
349369
const fetchUrl = new URL(`${config.dataFairUrl}/data-fair/api/v1/datasets/${params.datasetId}/values_agg`)
350370

351371
// Build common search parameters for both fetch and source URLs
352372
const aggsParams = new URLSearchParams()
353-
aggsParams.append('field', params.aggregationColumn)
373+
aggsParams.append('field', params.aggregationColumn.slice(0, 3).join(';'))
354374
if (params.aggregation) {
355375
aggsParams.append('metric', params.aggregation.metric)
356376
aggsParams.append('metric_field', params.aggregation.column)
357377
}
358378
aggsParams.append('missing', 'Données manquantes')
359379

380+
if (params.filters) {
381+
for (const [key, value] of Object.entries(params.filters)) {
382+
aggsParams.append(key, value)
383+
}
384+
}
385+
360386
fetchUrl.search = aggsParams.toString()
361387

362388
// Fetch the aggregation results from the dataset's values_agg endpoint
@@ -365,17 +391,26 @@ const registerTools = (server: McpServer) => {
365391
axiosOptions
366392
)).data
367393

394+
// Map the aggregation results to a structured format (recursive)
395+
const mapAggregation = (agg: any): any => ({
396+
total: agg.total,
397+
totalAggregated: agg.total_values,
398+
nonRepresented: agg.total_other,
399+
columnValue: agg.value,
400+
metricValue: agg.metric,
401+
...(agg.aggs && agg.aggs.length > 0 && {
402+
aggregations: agg.aggs.map(mapAggregation)
403+
})
404+
})
405+
368406
// Format the fetched data into a structured content object
369407
const structuredContent = {
370408
total: response.total,
371409
totalAggregated: response.total_values,
410+
nonRepresented: response.total_other,
372411
datasetId: params.datasetId,
373-
filteredViewUrl: fetchUrl.toString(),
374-
aggregations: response.aggs.map((agg: any) => ({
375-
total: agg.total,
376-
columnValue: agg.value,
377-
metricValue: agg.metric
378-
}))
412+
requestUrl: fetchUrl.toString(),
413+
aggregations: response.aggs.map(mapAggregation)
379414
}
380415

381416
return { // https://modelcontextprotocol.io/specification/2025-06-18/server/tools#tool-result

0 commit comments

Comments
 (0)