Skip to content

Commit 0ab3de1

Browse files
committed
feat: add aggregation
1 parent 79580b3 commit 0ab3de1

File tree

1 file changed

+97
-3
lines changed

1 file changed

+97
-3
lines changed

src/mcp-servers/datasets/tools.ts

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -225,13 +225,13 @@ const registerTools = (server: McpServer) => {
225225
datasetId: z.string().describe('The unique dataset ID obtained from search_datasets or provided by the user'),
226226
query: z.string().optional().describe('French keywords for full-text search across all dataset columns (simple keywords, not sentences). Do not use with filters parameter. Examples: "Jean Dupont", "Paris", "2025"'),
227227
filters: z.record(
228-
z.string().regex(/^.+_(search|eq|gte|lte)$/, {
229-
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _gte, _lte)'
228+
z.string().regex(/^.+_(search|eq|in|gte?|lte?|lt|gt)$/, {
229+
message: 'Filter key must follow pattern: column_key + suffix (_eq, _search, _in, _gte, _gt, _lte, _lt)'
230230
}),
231231
z.string()
232232
)
233233
.optional()
234-
.describe('Precise filters on specific columns. Ideal for multi-condition queries or range searches. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _lte (less than or equal). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris" } searches for people whose names contain "Jean", who are 30 years old or younger, and who live in Paris.'),
234+
.describe('Precise filters on specific columns. Ideal for multi-condition queries or range searches. Each filter key must be: column_key + suffix. Available suffixes: _eq (strictly equal, case-sensitive), _in (value must be in the list, case-sensitive, values separated by a comma), _search (full-text search within that column, case-insensitive and flexible matching), _gte (greater than or equal), _gt (greater than), _lte (less than or equal), _lt (less than). Use column keys from describe_dataset. Example: { "nom_search": "Jean", "age_lte": "30", "ville_eq": "Paris", "code_in": "A,B,C" } searches for people whose names contain "Jean", who are 30 years old or younger, who live in Paris, and whose code is A, B, or C.'),
235235
select: z.string().optional().describe('Optional comma-separated list of column keys to include in the results. Useful when the dataset has many columns to reduce output size. If not provided, all columns are returned. Use column keys from describe_dataset. Format: column1,column2,column3 (No spaces after commas). Example: "nom,age,ville"')
236236
},
237237
outputSchema: {
@@ -296,6 +296,100 @@ const registerTools = (server: McpServer) => {
296296
}
297297
}
298298
)
299+
300+
/**
301+
* Tool to aggregate data from a specific dataset.
302+
* This tool allows users to perform aggregations on dataset columns, such as counting unique values,
303+
* summing numeric columns, or calculating averages. It is useful for summarizing dataset content
304+
* and extracting insights without retrieving all data rows.
305+
* @param {string} datasetId - The unique ID of the dataset to aggregate (obtained from search_datasets)
306+
* @param {string} aggregationColumn - The column key whose distinct values the results are grouped by (use keys from describe_dataset)
307+
* @param {Object} aggregation - The aggregation specification to perform on the specified column.
308+
* If not provided, defaults to counting unique values in the specified column.
309+
* Example: { "column": "age", "metric": "avg" }
310+
* This will return the average age grouped by the specified aggregationColumn.
311+
* Supported metrics: sum, avg, min, max.
312+
* If you want to sum a numeric column, use { "column": "column_key", "metric": "sum" }.
313+
*/
314+
server.registerTool(
315+
'aggregate_data',
316+
{
317+
title: 'Aggregate data from a dataset',
318+
description: 'Perform aggregations on dataset columns, such as counting unique values, summing numeric columns, or calculating averages. Use this after describe_dataset to understand the dataset structure and available column keys. Example: {"datasetId": "123", "aggregationColumn": "code_sexe", "aggregation": {"column": "age", "metric": "avg"}} returns the average age grouped by code_sexe.',
319+
inputSchema: {
320+
datasetId: z.string().describe('The unique dataset ID obtained from search_datasets tool'),
321+
aggregationColumn: z.string().describe('The column key to aggregate (use keys from describe_dataset)'),
322+
aggregation: z.object({
323+
column: z.string().describe('The column key to aggregate (use keys from describe_dataset)'),
324+
metric: z.enum(['sum', 'avg', 'min', 'max']).describe('Aggregation metric to perform on the column')
325+
})
326+
.optional()
327+
.describe('The aggregation specification to perform on the specified column. Use keys from describe_dataset. If not provided, defaults to counting unique values in the specified column.')
328+
},
329+
outputSchema: {
330+
total: z.number().describe('The total number of rows in the dataset'),
331+
totalAggregated: z.number().describe('The total number of different values aggregated across all specified columns'),
332+
datasetId: z.string().describe('The dataset ID that was aggregated'),
333+
filteredViewUrl: z.string().describe('Direct URL to view the filtered dataset results in JSON format (must be included in responses for citation and direct access to aggregated view)'),
334+
aggregations: z.array(
335+
z.object({
336+
total: z.number().describe('Total number of rows aggregated for this column'),
337+
columnValue: z.string().describe('The value of the aggregated column'),
338+
metricValue: z.number().optional().describe('The value of the aggregation metric (e.g., sum, avg) on the selected column'),
339+
})
340+
).describe('Array of aggregation results for each specified column')
341+
},
342+
annotations: {
343+
readOnlyHint: true
344+
}
345+
},
346+
async (params: { datasetId: string, aggregationColumn: string, aggregation?: { column: string, metric: 'sum' | 'avg' | 'min' | 'max' } }) => {
347+
debug('Executing aggregate_data tool with dataset:', params.datasetId, 'aggregation:', JSON.stringify(params.aggregation))
348+
349+
const fetchUrl = new URL(`${config.dataFairUrl}/data-fair/api/v1/datasets/${params.datasetId}/values_agg`)
350+
351+
// Build the query parameters for the values_agg request
352+
const aggsParams = new URLSearchParams()
353+
aggsParams.append('field', params.aggregationColumn)
354+
if (params.aggregation) {
355+
aggsParams.append('metric', params.aggregation.metric)
356+
aggsParams.append('metric_field', params.aggregation.column)
357+
}
358+
aggsParams.append('missing', 'Données manquantes')
359+
360+
fetchUrl.search = aggsParams.toString()
361+
362+
// Fetch the aggregation results from the values_agg endpoint
363+
const response = (await axios.get(
364+
fetchUrl.toString(),
365+
axiosOptions
366+
)).data
367+
368+
// Format the fetched data into a structured content object
369+
const structuredContent = {
370+
total: response.total,
371+
totalAggregated: response.total_values,
372+
datasetId: params.datasetId,
373+
filteredViewUrl: fetchUrl.toString(),
374+
aggregations: response.aggs.map((agg: any) => ({
375+
total: agg.total,
376+
columnValue: agg.value,
377+
metricValue: agg.metric
378+
}))
379+
}
380+
381+
return { // https://modelcontextprotocol.io/specification/2025-06-18/server/tools#tool-result
382+
structuredContent,
383+
// For backwards compatibility, also return the serialized JSON in TextContent blocks
384+
content: [
385+
{
386+
type: 'text',
387+
text: JSON.stringify(structuredContent),
388+
}
389+
]
390+
}
391+
}
392+
)
299393
}
300394

301395
export default registerTools

0 commit comments

Comments
 (0)