diff --git a/packages/compass-collection/src/modules/collection-tab.ts b/packages/compass-collection/src/modules/collection-tab.ts
index 405dfd333be..874e4026035 100644
--- a/packages/compass-collection/src/modules/collection-tab.ts
+++ b/packages/compass-collection/src/modules/collection-tab.ts
@@ -1,5 +1,5 @@
 import type { Reducer, AnyAction, Action } from 'redux';
-import { analyzeDocuments, type Schema } from 'mongodb-schema';
+import { analyzeDocuments } from 'mongodb-schema';
 import type { CollectionMetadata } from 'mongodb-collection-model';
 import type { ThunkAction } from 'redux-thunk';
@@ -19,8 +19,10 @@ import {
   SCHEMA_ANALYSIS_STATE_INITIAL,
   type SchemaAnalysisError,
   type SchemaAnalysisState,
+  type FieldInfo,
 } from '../schema-analysis-types';
 import { calculateSchemaDepth } from '../calculate-schema-depth';
+import { processSchema } from '../transform-schema-to-field-info';
 import type { Document, MongoError } from 'mongodb';

 const DEFAULT_SAMPLE_SIZE = 100;
@@ -106,7 +108,7 @@ interface SchemaAnalysisStartedAction {

 interface SchemaAnalysisFinishedAction {
   type: CollectionActions.SchemaAnalysisFinished;
-  schema: Schema;
+  processedSchema: Record<string, FieldInfo>;
   sampleDocument: Document;
   schemaMetadata: {
     maxNestingDepth: number;
@@ -201,7 +203,7 @@ const reducer: Reducer = (
       ...state,
       schemaAnalysis: {
         status: SCHEMA_ANALYSIS_STATE_COMPLETE,
-        schema: action.schema,
+        processedSchema: action.processedSchema,
         sampleDocument: action.sampleDocument,
         schemaMetadata: action.schemaMetadata,
       },
@@ -420,7 +422,9 @@ export const analyzeCollectionSchema = (): CollectionThunkAction<
       schema.fields = schema.fields.filter(
         ({ path }) => !isInternalFieldPath(path[0])
       );
-      // TODO: Transform schema to structure that will be used by the LLM.
+
+      // Transform schema to structure that will be used by the LLM
+      const processedSchema = processSchema(schema);

       const maxNestingDepth = await calculateSchemaDepth(schema);
       const { database, collection } = toNS(namespace);
@@ -432,7 +436,7 @@ export const analyzeCollectionSchema = (): CollectionThunkAction<
       };
       dispatch({
         type: CollectionActions.SchemaAnalysisFinished,
-        schema,
+        processedSchema,
         sampleDocument: sampleDocuments[0],
         schemaMetadata,
       });
diff --git a/packages/compass-collection/src/schema-analysis-types.ts b/packages/compass-collection/src/schema-analysis-types.ts
index 0d63615f77a..995493d2f6f 100644
--- a/packages/compass-collection/src/schema-analysis-types.ts
+++ b/packages/compass-collection/src/schema-analysis-types.ts
@@ -1,5 +1,4 @@
 import type { Document } from 'mongodb';
-import { type Schema } from 'mongodb-schema';

 export const SCHEMA_ANALYSIS_STATE_INITIAL = 'initial';
 export const SCHEMA_ANALYSIS_STATE_ANALYZING = 'analyzing';
@@ -30,9 +29,16 @@ export type SchemaAnalysisErrorState = {
   error: SchemaAnalysisError;
 };

+export interface FieldInfo {
+  type: string; // MongoDB type (e.g. String, Double, Array, Document)
+  sample_values?: unknown[]; // Primitive sample values (flattened for arrays)
+  array_sample_values?: unknown[]; // Sample values of the top-level array object
+  probability?: number; // 0.0 - 1.0 field frequency
+}
+
 export type SchemaAnalysisCompletedState = {
   status: typeof SCHEMA_ANALYSIS_STATE_COMPLETE;
-  schema: Schema;
+  processedSchema: Record<string, FieldInfo>;
   sampleDocument: Document;
   schemaMetadata: {
     maxNestingDepth: number;
diff --git a/packages/compass-collection/src/transform-schema-to-field-info.spec.ts b/packages/compass-collection/src/transform-schema-to-field-info.spec.ts
new file mode 100644
index 00000000000..61df740252b
--- /dev/null
+++ b/packages/compass-collection/src/transform-schema-to-field-info.spec.ts
@@ -0,0 +1,868 @@
+import { expect } from 'chai';
+import { processSchema } from './transform-schema-to-field-info';
+import type {
+  Schema,
+  SchemaField,
+  SchemaType,
+  ArraySchemaType,
+  DocumentSchemaType,
+} from 'mongodb-schema';
+
+describe('processSchema', function () {
+  it('selects most probable type when multiple types exist', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'mixed',
+          path: ['mixed'],
+          probability: 1.0,
+          types: [
+            {
+              name: 'String',
+              bsonType: 'String',
+              path: ['mixed'],
+              count: 8,
+              probability: 0.8,
+              values: ['text'],
+            } as SchemaType,
+            {
+              name: 'Number',
+              bsonType: 'Number',
+              path: ['mixed'],
+              count: 2,
+              probability: 0.2,
+              values: [42],
+            } as SchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 10,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      mixed: {
+        type: 'String', // Should pick the most probable type
+        sample_values: ['text'],
+        probability: 1.0,
+      },
+    });
+  });
+
+  it('filters out undefined types', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'optional',
+          path: ['optional'],
+          probability: 0.5,
+          types: [
+            {
+              name: 'String',
+              bsonType: 'String',
+              path: ['optional'],
+              count: 1,
+              probability: 0.5,
+              values: ['value'],
+            } as SchemaType,
+            {
+              name: 'Undefined',
+              bsonType: 'Undefined',
+              path: ['optional'],
+              count: 1,
+              probability: 0.5,
+            } as SchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 2,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      optional: {
+        type: 'String',
+        sample_values: ['value'],
+        probability: 0.5,
+      },
+    });
+  });
+
+  it('handles fields with no types', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'empty',
+          path: ['empty'],
+          count: 0,
+          type: [],
+          hasDuplicates: false,
+          probability: 0.0,
+          types: [],
+        } as SchemaField,
+      ],
+      count: 1,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({});
+  });
+
+  it('handles empty schema', function () {
+    const schema: Schema = {
+      fields: [],
+      count: 0,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({});
+  });
+
+  it('limits sample values to 10', function () {
+    const manyValues = Array.from({ length: 20 }, (_, i) => `value${i}`);
+
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'field',
+          path: ['field'],
+          probability: 1.0,
+          types: [
+            {
+              name: 'String',
+              bsonType: 'String',
+              path: ['field'],
+              count: 20,
+              probability: 1.0,
+              values: manyValues,
+            } as SchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 1,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result.field.sample_values).to.have.length(10);
+    expect(result.field.sample_values).to.deep.equal(manyValues.slice(0, 10));
+  });
+
+  it('transforms simple primitive field', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'name',
+          path: ['name'],
+          probability: 1.0,
+          types: [
+            {
+              name: 'String',
+              bsonType: 'String',
+              path: ['name'],
+              count: 3,
+              probability: 1.0,
+              values: ['John', 'Jane', 'Bob'],
+            } as SchemaType,
+          ],
+        } as SchemaField,
+        {
+          name: 'age',
+          path: ['age'],
+          probability: 0.9,
+          types: [
+            {
+              name: 'Number',
+              bsonType: 'Number',
+              path: ['age'],
+              count: 3,
+              probability: 1.0,
+              values: [25, 30, 35],
+            } as SchemaType,
+          ],
+        } as SchemaField,
+        {
+          name: 'isActive',
+          path: ['isActive'],
+          probability: 0.8,
+          types: [
+            {
+              name: 'Boolean',
+              bsonType: 'Boolean',
+              path: ['isActive'],
+              count: 3,
+              probability: 1.0,
+              values: [true, false, true],
+            } as SchemaType,
+          ],
+        } as SchemaField,
+        {
+          name: 'createdAt',
+          path: ['createdAt'],
+          probability: 0.7,
+          types: [
+            {
+              name: 'Date',
+              bsonType: 'Date',
+              path: ['createdAt'],
+              count: 2,
+              probability: 1.0,
+              values: [new Date('2023-01-01'), new Date('2023-06-15')],
+            } as SchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 3,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      name: {
+        type: 'String',
+        sample_values: ['John', 'Jane', 'Bob'],
+        probability: 1.0,
+      },
+      age: {
+        type: 'Number',
+        sample_values: [25, 30, 35],
+        probability: 0.9,
+      },
+      isActive: {
+        type: 'Boolean',
+        sample_values: [true, false, true],
+        probability: 0.8,
+      },
+      createdAt: {
+        type: 'Date',
+        sample_values: [new Date('2023-01-01'), new Date('2023-06-15')],
+        probability: 0.7,
+      },
+    });
+  });
+
+  it('transforms nested document field', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'user',
+          path: ['user'],
+          probability: 1.0,
+          types: [
+            {
+              name: 'Document',
+              bsonType: 'Document',
+              path: ['user'],
+              count: 2,
+              probability: 1.0,
+              fields: [
+                {
+                  name: 'name',
+                  path: ['user', 'name'],
+                  probability: 1.0,
+                  types: [
+                    {
+                      name: 'String',
+                      bsonType: 'String',
+                      path: ['user', 'name'],
+                      count: 1,
+                      probability: 1.0,
+                      values: ['John'],
+                    } as SchemaType,
+                  ],
+                } as SchemaField,
+                {
+                  name: 'age',
+                  path: ['user', 'age'],
+                  probability: 0.8,
+                  types: [
+                    {
+                      name: 'Number',
+                      bsonType: 'Number',
+                      path: ['user', 'age'],
+                      count: 2,
+                      probability: 1.0,
+                      values: [25, 30],
+                    } as SchemaType,
+                  ],
+                } as SchemaField,
+              ],
+            } as DocumentSchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 2,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      'user.name': {
+        type: 'String',
+        sample_values: ['John'],
+        probability: 1.0,
+      },
+      'user.age': {
+        type: 'Number',
+        sample_values: [25, 30],
+        probability: 0.8,
+      },
+    });
+  });
+
+  it('transforms array field', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'tags',
+          path: ['tags'],
+          probability: 1.0,
+          types: [
+            {
+              name: 'Array',
+              bsonType: 'Array',
+              path: ['tags'],
+              count: 2,
+              probability: 1.0,
+              values: [['red', 'blue'], ['green']],
+              lengths: [2, 1],
+              averageLength: 1.5,
+              totalCount: 3,
+              types: [
+                {
+                  name: 'String',
+                  bsonType: 'String',
+                  path: ['tags'],
+                  count: 3,
+                  probability: 1.0,
+                  values: ['red', 'blue', 'green'],
+                } as SchemaType,
+              ],
+            } as ArraySchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 2,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      'tags[]': {
+        type: 'String',
+        sample_values: ['red', 'blue', 'green'],
+        array_sample_values: [['red', 'blue'], ['green']],
+        probability: 1.0,
+      },
+    });
+  });
+
+  it('handles deeply nested objects (documents)', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'level1',
+          path: ['level1'],
+          probability: 1.0,
+          types: [
+            {
+              name: 'Document',
+              bsonType: 'Document',
+              path: ['level1'],
+              count: 1,
+              probability: 1.0,
+              fields: [
+                {
+                  name: 'level2',
+                  path: ['level1', 'level2'],
+                  probability: 1.0,
+                  types: [
+                    {
+                      name: 'Document',
+                      bsonType: 'Document',
+                      path: ['level1', 'level2'],
+                      count: 1,
+                      probability: 1.0,
+                      fields: [
+                        {
+                          name: 'value',
+                          path: ['level1', 'level2', 'value'],
+                          probability: 1.0,
+                          types: [
+                            {
+                              name: 'String',
+                              bsonType: 'String',
+                              path: ['level1', 'level2', 'value'],
+                              count: 1,
+                              probability: 1.0,
+                              values: ['deep'],
+                            } as SchemaType,
+                          ],
+                        } as SchemaField,
+                      ],
+                    } as DocumentSchemaType,
+                  ],
+                } as SchemaField,
+              ],
+            } as DocumentSchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 1,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      'level1.level2.value': {
+        type: 'String',
+        sample_values: ['deep'],
+        probability: 1.0,
+      },
+    });
+  });
+
+  it('handles arrays of documents', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'items',
+          path: ['items'],
+          probability: 1.0,
+          types: [
+            {
+              name: 'Array',
+              bsonType: 'Array',
+              path: ['items'],
+              count: 1,
+              probability: 1.0,
+              lengths: [2],
+              averageLength: 2,
+              totalCount: 2,
+              values: [
+                [
+                  { id: 1, cost: 10.5 },
+                  { id: 2, cost: 25.0 },
+                ],
+              ],
+              types: [
+                {
+                  name: 'Document',
+                  bsonType: 'Document',
+                  path: ['items'],
+                  count: 2,
+                  probability: 1.0,
+                  fields: [
+                    {
+                      name: 'id',
+                      path: ['items', 'id'],
+                      probability: 1.0,
+                      types: [
+                        {
+                          name: 'Number',
+                          bsonType: 'Number',
+                          path: ['items', 'id'],
+                          count: 2,
+                          probability: 1.0,
+                          values: [1, 2],
+                        } as SchemaType,
+                      ],
+                    } as SchemaField,
+                    {
+                      name: 'cost',
+                      path: ['items', 'cost'],
+                      probability: 1.0,
+                      types: [
+                        {
+                          name: 'Double',
+                          bsonType: 'Double',
+                          path: ['items', 'cost'],
+                          count: 2,
+                          probability: 1.0,
+                          values: [10.5, 25.0],
+                        } as SchemaType,
+                      ],
+                    } as SchemaField,
+                  ],
+                } as DocumentSchemaType,
+              ],
+            } as ArraySchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 1,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      'items[].id': {
+        type: 'Number',
+        sample_values: [1, 2],
+        array_sample_values: [
+          [
+            { id: 1, cost: 10.5 },
+            { id: 2, cost: 25.0 },
+          ],
+        ],
+        probability: 1.0,
+      },
+      'items[].cost': {
+        type: 'Double',
+        sample_values: [10.5, 25.0],
+        array_sample_values: [
+          [
+            { id: 1, cost: 10.5 },
+            { id: 2, cost: 25.0 },
+          ],
+        ],
+        probability: 1.0,
+      },
+    });
+  });
+
+  it('handles triple nested arrays (3D matrix)', function () {
+    // cube: [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'cube',
+          path: ['cube'],
+          count: 1,
+          type: ['Array'],
+          probability: 1.0,
+          hasDuplicates: false,
+          types: [
+            {
+              name: 'Array',
+              bsonType: 'Array',
+              path: ['cube'],
+              count: 1,
+              probability: 1.0,
+              lengths: [2],
+              averageLength: 2,
+              totalCount: 2,
+              values: [
+                [
+                  [
+                    [1, 2],
+                    [3, 4],
+                  ],
+                  [
+                    [5, 6],
+                    [7, 8],
+                  ],
+                ],
+              ],
+              types: [
+                {
+                  name: 'Array',
+                  bsonType: 'Array',
+                  path: ['cube'],
+                  count: 2,
+                  probability: 1.0,
+                  lengths: [2],
+                  averageLength: 2,
+                  totalCount: 4,
+                  values: [
+                    [
+                      [1, 2],
+                      [3, 4],
+                    ],
+                    [
+                      [5, 6],
+                      [7, 8],
+                    ],
+                  ],
+                  types: [
+                    {
+                      name: 'Array',
+                      bsonType: 'Array',
+                      path: ['cube'],
+                      count: 4,
+                      probability: 1.0,
+                      lengths: [2],
+                      averageLength: 2,
+                      totalCount: 8,
+                      values: [
+                        [1, 2],
+                        [3, 4],
+                        [5, 6],
+                        [7, 8],
+                      ],
+                      types: [
+                        {
+                          name: 'Number',
+                          bsonType: 'Number',
+                          path: ['cube'],
+                          count: 8,
+                          probability: 1.0,
+                          values: [1, 2, 3, 4, 5, 6, 7, 8],
+                        } as SchemaType,
+                      ],
+                    } as ArraySchemaType,
+                  ],
+                } as ArraySchemaType,
+              ],
+            } as ArraySchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 1,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      'cube[][][]': {
+        type: 'Number',
+        sample_values: [1, 2, 3, 4, 5, 6, 7, 8],
+        array_sample_values: [
+          [
+            [
+              [1, 2],
+              [3, 4],
+            ],
+            [
+              [5, 6],
+              [7, 8],
+            ],
+          ],
+        ],
+        probability: 1.0,
+      },
+    });
+  });
+
+  it('handles arrays of arrays of documents', function () {
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'matrix',
+          path: ['matrix'],
+          count: 1,
+          type: ['Array'],
+          probability: 1.0,
+          hasDuplicates: false,
+          types: [
+            {
+              name: 'Array',
+              bsonType: 'Array',
+              path: ['matrix'],
+              count: 1,
+              probability: 1.0,
+              lengths: [2],
+              averageLength: 2,
+              totalCount: 2,
+              values: [[[{ x: 1, y: 2 }], [{ x: 3, y: 4 }]]],
+              types: [
+                {
+                  name: 'Array',
+                  bsonType: 'Array',
+                  path: ['matrix'],
+                  count: 2,
+                  probability: 1.0,
+                  lengths: [1],
+                  averageLength: 1,
+                  totalCount: 2,
+                  values: [[{ x: 1, y: 2 }], [{ x: 3, y: 4 }]],
+                  types: [
+                    {
+                      name: 'Document',
+                      bsonType: 'Document',
+                      path: ['matrix'],
+                      count: 2,
+                      probability: 1.0,
+                      fields: [
+                        {
+                          name: 'x',
+                          path: ['matrix', 'x'],
+                          count: 2,
+                          type: ['Number'],
+                          probability: 1.0,
+                          hasDuplicates: false,
+                          types: [
+                            {
+                              name: 'Number',
+                              bsonType: 'Number',
+                              path: ['matrix', 'x'],
+                              count: 2,
+                              probability: 1.0,
+                              values: [1, 3],
+                            } as SchemaType,
+                          ],
+                        } as SchemaField,
+                        {
+                          name: 'y',
+                          path: ['matrix', 'y'],
+                          count: 2,
+                          type: ['Number'],
+                          probability: 1.0,
+                          hasDuplicates: false,
+                          types: [
+                            {
+                              name: 'Number',
+                              bsonType: 'Number',
+                              path: ['matrix', 'y'],
+                              count: 2,
+                              probability: 1.0,
+                              values: [2, 4],
+                            } as SchemaType,
+                          ],
+                        } as SchemaField,
+                      ],
+                    } as DocumentSchemaType,
+                  ],
+                } as ArraySchemaType,
+              ],
+            } as ArraySchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 1,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      'matrix[][].x': {
+        type: 'Number',
+        sample_values: [1, 3],
+        array_sample_values: [[[{ x: 1, y: 2 }], [{ x: 3, y: 4 }]]],
+        probability: 1.0,
+      },
+      'matrix[][].y': {
+        type: 'Number',
+        sample_values: [2, 4],
+        array_sample_values: [[[{ x: 1, y: 2 }], [{ x: 3, y: 4 }]]],
+        probability: 1.0,
+      },
+    });
+  });
+
+  it('handles array of documents with nested arrays', function () {
+    // teams: [{ name: "Team A", members: ["Alice", "Bob"] }, { name: "Team B", members: ["Charlie"] }]
+    const schema: Schema = {
+      fields: [
+        {
+          name: 'teams',
+          path: ['teams'],
+          count: 1,
+          type: ['Array'],
+          probability: 1.0,
+          hasDuplicates: false,
+          types: [
+            {
+              name: 'Array',
+              bsonType: 'Array',
+              path: ['teams'],
+              count: 1,
+              probability: 1.0,
+              lengths: [2],
+              averageLength: 2,
+              totalCount: 2,
+              values: [
+                [
+                  { name: 'Team A', members: ['Alice', 'Bob'] },
+                  { name: 'Team B', members: ['Charlie'] },
+                ],
+              ],
+              types: [
+                {
+                  name: 'Document',
+                  bsonType: 'Document',
+                  path: ['teams'],
+                  count: 2,
+                  probability: 1.0,
+                  fields: [
+                    {
+                      name: 'name',
+                      path: ['teams', 'name'],
+                      count: 2,
+                      type: ['String'],
+                      probability: 1.0,
+                      hasDuplicates: false,
+                      types: [
+                        {
+                          name: 'String',
+                          bsonType: 'String',
+                          path: ['teams', 'name'],
+                          count: 2,
+                          probability: 1.0,
+                          values: ['Team A', 'Team B'],
+                        } as SchemaType,
+                      ],
+                    } as SchemaField,
+                    {
+                      name: 'members',
+                      path: ['teams', 'members'],
+                      count: 2,
+                      type: ['Array'],
+                      probability: 1.0,
+                      hasDuplicates: false,
+                      types: [
+                        {
+                          name: 'Array',
+                          bsonType: 'Array',
+                          path: ['teams', 'members'],
+                          count: 2,
+                          probability: 1.0,
+                          lengths: [2, 1],
+                          averageLength: 1.5,
+                          totalCount: 3,
+                          values: [['Alice', 'Bob'], ['Charlie']],
+                          types: [
+                            {
+                              name: 'String',
+                              bsonType: 'String',
+                              path: ['teams', 'members'],
+                              count: 3,
+                              probability: 1.0,
+                              values: ['Alice', 'Bob', 'Charlie'],
+                            } as SchemaType,
+                          ],
+                        } as ArraySchemaType,
+                      ],
+                    } as SchemaField,
+                  ],
+                } as DocumentSchemaType,
+              ],
+            } as ArraySchemaType,
+          ],
+        } as SchemaField,
+      ],
+      count: 1,
+    };
+
+    const result = processSchema(schema);
+
+    expect(result).to.deep.equal({
+      'teams[].name': {
+        type: 'String',
+        sample_values: ['Team A', 'Team B'],
+        array_sample_values: [
+          [
+            { name: 'Team A', members: ['Alice', 'Bob'] },
+            { name: 'Team B', members: ['Charlie'] },
+          ],
+        ],
+        probability: 1.0,
+      },
+      'teams[].members[]': {
+        type: 'String',
+        sample_values: ['Alice', 'Bob', 'Charlie'],
+        array_sample_values: [
+          [
+            { name: 'Team A', members: ['Alice', 'Bob'] },
+            { name: 'Team B', members: ['Charlie'] },
+          ],
+        ],
+        probability: 1.0,
+      },
+    });
+  });
+});
diff --git a/packages/compass-collection/src/transform-schema-to-field-info.ts b/packages/compass-collection/src/transform-schema-to-field-info.ts
new file mode 100644
index 00000000000..92e1454b5ce
--- /dev/null
+++ b/packages/compass-collection/src/transform-schema-to-field-info.ts
@@ -0,0 +1,154 @@
+import type {
+  Schema,
+  SchemaField,
+  SchemaType,
+  ArraySchemaType,
+  DocumentSchemaType,
+  PrimitiveSchemaType,
+} from 'mongodb-schema';
+import type { FieldInfo } from './schema-analysis-types';
+
+/**
+ * This module transforms mongodb-schema output into a flat, LLM-friendly format using
+ * dot notation for nested fields and bracket notation for arrays.
+ *
+ * Algorithm Overview:
+ * - Start with top-level fields.
+ * - For each field (processNamedField), process based on type (processType):
+ *   - Primitives: Create result entry
+ *   - Documents: Add parent field name to path using dot notation, recurse into nested fields (processNamedField)
+ *   - Arrays: Add [] to path, recurse into element type (processType)
+ *
+ * Notation examples:
+ * - Nested documents: user.profile.name (dot notation)
+ * - Array: users[] (bracket notation)
+ * - Nested arrays: matrix[][] (multiple brackets)
+ * - Nested array of documents fields: users[].name (brackets + dots)
+ */
+
+/**
+ * Transforms a raw mongodb-schema Schema into a flat Record<string, FieldInfo>
+ * using dot notation for nested fields and bracket notation for arrays.
+ */
+export function processSchema(schema: Schema): Record<string, FieldInfo> {
+  const result: Record<string, FieldInfo> = {};
+
+  if (!schema.fields) {
+    return result;
+  }
+
+  // Process each top-level field
+  for (const field of schema.fields) {
+    processNamedField(field, '', result);
+  }
+
+  return result;
+}
+
+/**
+ * Processes a schema field and its nested types
+ */
+function processNamedField(
+  field: SchemaField,
+  pathPrefix: string,
+  result: Record<string, FieldInfo>,
+  arraySampleValues?: unknown[]
+): void {
+  if (!field.types || field.types.length === 0) {
+    return;
+  }
+
+  // Use the most frequent type (excluding 'Undefined')
+  const primaryType = getMostFrequentType(field.types);
+  if (!primaryType) {
+    return;
+  }
+
+  const currentPath = pathPrefix ? `${pathPrefix}.${field.name}` : field.name;
+
+  // Process based on the type
+  processType(
+    primaryType,
+    currentPath,
+    result,
+    field.probability,
+    arraySampleValues
+  );
+}
+
+/**
+ * Processes a specific schema type
+ */
+function processType(
+  type: SchemaType,
+  currentPath: string,
+  result: Record<string, FieldInfo>,
+  fieldProbability?: number,
+  arraySampleValues?: unknown[]
+): void {
+  if (type.name === 'Array' || type.bsonType === 'Array') {
+    // Array: add [] to path and recurse into element type (while passing down array sample values)
+    const arrayType = type as ArraySchemaType;
+    const elementType = getMostFrequentType(arrayType.types || []);
+
+    if (!elementType) {
+      return;
+    }
+
+    const arrayPath = `${currentPath}[]`;
+    const sampleValues =
+      arraySampleValues || getSampleValues(arrayType).slice(0, 3); // Limit full-context array sample values to 3
+    processType(elementType, arrayPath, result, fieldProbability, sampleValues);
+  } else if (type.name === 'Document' || type.bsonType === 'Document') {
+    // Process nested document fields
+
+    const docType = type as DocumentSchemaType;
+    if (docType.fields) {
+      for (const nestedField of docType.fields) {
+        processNamedField(nestedField, currentPath, result, arraySampleValues);
+      }
+    }
+  } else {
+    // Primitive: create entry (with passed-down array sample values if we have them)
+    const fieldInfo: FieldInfo = {
+      type: type.name || type.bsonType || 'Mixed',
+      sample_values: getSampleValues(type),
+      probability:
+        fieldProbability || (type as PrimitiveSchemaType).probability || 1.0,
+    };
+
+    if (arraySampleValues !== undefined && arraySampleValues.length > 0) {
+      fieldInfo.array_sample_values = arraySampleValues;
+    }
+
+    result[currentPath] = fieldInfo;
+  }
+}
+
+/**
+ * Gets the most probable type from a list of types, excluding 'Undefined'
+ */
+function getMostFrequentType(types: SchemaType[]): SchemaType | null {
+  if (!types || types.length === 0) {
+    return null;
+  }
+
+  // Filter out undefined types and sort by probability
+  const validTypes = types
+    .filter((type) => type.name !== 'Undefined')
+    .sort((a, b) => (b.probability || 0) - (a.probability || 0));
+
+  return validTypes[0] || null;
+}
+
+/**
+ * Extracts sample values from a schema type, limiting to 10 items
+ */
+function getSampleValues(type: SchemaType): unknown[] {
+  // Only PrimitiveSchemaType and ArraySchemaType have values
+  if ('values' in type && type.values && type.values.length > 0) {
+    return type.values.slice(0, 10);
+  }
+
+  return [];
+}
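
Reviewer note: a minimal usage sketch of the new transform (not part of the patch). The Schema literal is hand-built and mirrors the fixtures in the spec above; the field name and sample strings ('Ada', 'Grace') are illustrative only.

import type { Schema, SchemaField, SchemaType } from 'mongodb-schema';
import { processSchema } from './transform-schema-to-field-info';

// Single primitive field `name` sampled from two documents.
const schema: Schema = {
  fields: [
    {
      name: 'name',
      path: ['name'],
      probability: 1.0,
      types: [
        {
          name: 'String',
          bsonType: 'String',
          path: ['name'],
          count: 2,
          probability: 1.0,
          values: ['Ada', 'Grace'],
        } as SchemaType,
      ],
    } as SchemaField,
  ],
  count: 2,
};

processSchema(schema);
// => { name: { type: 'String', sample_values: ['Ada', 'Grace'], probability: 1 } }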