Skip to content

Commit f81571e

Browse files
committed
WIP
1 parent 9fedf3f commit f81571e

File tree

3 files changed

+276
-32
lines changed

3 files changed

+276
-32
lines changed

packages/compass-collection/src/schema-analysis-types.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ export type SchemaAnalysisErrorState = {
3232
export interface FieldInfo {
3333
type: string; // MongoDB type (eg. String, Double, Array, Document)
3434
sample_values: unknown[]; // Real sample values (empty array if none)
35-
isArray?: boolean; // For arrays. Denotes that the type field refers to the type of the array elements
3635
probability?: number; // 0.0 - 1.0 field frequency
3736
}
3837

packages/compass-collection/src/transform-schema-to-field-info.spec.ts

Lines changed: 209 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,9 @@ describe('processSchema', function () {
149149
const result = processSchema(schema);
150150

151151
expect(result).to.deep.equal({
152-
tags: {
152+
'tags[]': {
153153
type: 'String',
154154
sample_values: [['red', 'blue'], ['green']],
155-
isArray: true,
156155
probability: 1.0,
157156
},
158157
});
@@ -327,13 +326,12 @@ describe('processSchema', function () {
327326
const result = processSchema(schema);
328327

329328
expect(result).to.deep.equal({
330-
items: {
329+
'items[]': {
331330
type: 'Document',
332331
sample_values: [], // no sample values for Documents
333-
isArray: true,
334332
probability: 1.0,
335333
},
336-
'items.id': {
334+
'items[].id': {
337335
type: 'Number',
338336
sample_values: [1, 2],
339337
probability: 1.0,
@@ -392,6 +390,211 @@ describe('processSchema', function () {
392390
expect(result.field.sample_values).to.deep.equal(manyValues.slice(0, 10));
393391
});
394392

393+
it('handles arrays of arrays of documents', function () {
  // Shape under test: Array -> Array -> Document with numeric fields x and y.
  // The fixture is assembled bottom-up so each nesting level is readable.

  // Builds one numeric field that lives inside the `matrix` documents.
  const numberField = (name: string, values: number[]): SchemaField =>
    ({
      name,
      path: ['matrix', name],
      count: 2,
      type: ['Number'],
      probability: 1.0,
      hasDuplicates: false,
      types: [
        {
          name: 'Number',
          bsonType: 'Number',
          path: ['matrix', name],
          count: 2,
          probability: 1.0,
          values,
        } as SchemaType,
      ],
    } as SchemaField);

  // Innermost level: the documents holding x and y.
  const pointDocument = {
    name: 'Document',
    bsonType: 'Document',
    path: ['matrix'],
    count: 2,
    probability: 1.0,
    fields: [numberField('x', [1, 3]), numberField('y', [2, 4])],
  } as DocumentSchemaType;

  // Middle level: the inner arrays, each containing documents.
  const innerArray = {
    name: 'Array',
    bsonType: 'Array',
    path: ['matrix'],
    count: 2,
    probability: 1.0,
    lengths: [1],
    averageLength: 1,
    totalCount: 2,
    types: [pointDocument],
  } as ArraySchemaType;

  // Outer level: the array stored directly in the `matrix` field.
  const outerArray = {
    name: 'Array',
    bsonType: 'Array',
    path: ['matrix'],
    count: 1,
    probability: 1.0,
    lengths: [2],
    averageLength: 2,
    totalCount: 2,
    types: [innerArray],
  } as ArraySchemaType;

  const schema: Schema = {
    fields: [
      {
        name: 'matrix',
        path: ['matrix'],
        count: 1,
        type: ['Array'],
        probability: 1.0,
        hasDuplicates: false,
        types: [outerArray],
      } as SchemaField,
    ],
    count: 1,
  };

  const fieldInfo = processSchema(schema);

  // One `[]` per array level, then dotted paths for the document's fields.
  expect(fieldInfo).to.deep.equal({
    'matrix[][]': {
      type: 'Document',
      sample_values: [],
      probability: 1.0,
    },
    'matrix[][].x': {
      type: 'Number',
      sample_values: [1, 3],
      probability: 1.0,
    },
    'matrix[][].y': {
      type: 'Number',
      sample_values: [2, 4],
      probability: 1.0,
    },
  });
});
499+
500+
it('handles deeply nested arrays (infinite recursion)', function () {
  // Test case: deepMatrix: [[[{ value: 42 }]]]
  // Array -> Array -> Array -> Document -> value field

  // The single numeric field inside the innermost document.
  const valueField = {
    name: 'value',
    path: ['deepMatrix', 'value'],
    count: 1,
    type: ['Number'],
    probability: 1.0,
    hasDuplicates: false,
    types: [
      {
        name: 'Number',
        bsonType: 'Number',
        path: ['deepMatrix', 'value'],
        count: 1,
        probability: 1.0,
        values: [42],
      } as SchemaType,
    ],
  } as SchemaField;

  const innermostDocument = {
    name: 'Document',
    bsonType: 'Document',
    path: ['deepMatrix'],
    count: 1,
    probability: 1.0,
    fields: [valueField],
  } as DocumentSchemaType;

  // All three array levels carry identical statistics, so a single wrapper
  // applied three times reproduces the nesting.
  const wrapInArray = (inner: SchemaType): ArraySchemaType =>
    ({
      name: 'Array',
      bsonType: 'Array',
      path: ['deepMatrix'],
      count: 1,
      probability: 1.0,
      lengths: [1],
      averageLength: 1,
      totalCount: 1,
      types: [inner],
    } as ArraySchemaType);

  const tripleNestedArray = wrapInArray(
    wrapInArray(wrapInArray(innermostDocument))
  );

  const schema: Schema = {
    fields: [
      {
        name: 'deepMatrix',
        path: ['deepMatrix'],
        count: 1,
        type: ['Array'],
        probability: 1.0,
        hasDuplicates: false,
        types: [tripleNestedArray],
      } as SchemaField,
    ],
    count: 1,
  };

  const fieldInfo = processSchema(schema);

  // Three array levels yield three `[]` suffixes on the path.
  expect(fieldInfo).to.deep.equal({
    'deepMatrix[][][]': {
      type: 'Document',
      sample_values: [],
      probability: 1.0,
    },
    'deepMatrix[][][].value': {
      type: 'Number',
      sample_values: [42],
      probability: 1.0,
    },
  });
});
597+
395598
it('selects most probable type when multiple types exist', function () {
396599
const schema: Schema = {
397600
fields: [
@@ -496,13 +699,12 @@ describe('processSchema', function () {
496699
const result = processSchema(schema);
497700

498701
expect(result).to.deep.equal({
499-
coordinates: {
702+
'coordinates[]': {
500703
type: 'Double',
501704
sample_values: [
502705
[-18.568, -66.281],
503706
[93.074, 37.075],
504707
],
505-
isArray: true,
506708
probability: 1.0,
507709
},
508710
_id: {

packages/compass-collection/src/transform-schema-to-field-info.ts

Lines changed: 67 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,63 @@ import type { FieldInfo } from './schema-analysis-types';
1919
export function processSchema(schema: Schema): Record<string, FieldInfo> {
2020
const result: Record<string, FieldInfo> = {};
2121

22+
// Helper to recursively process one element type (array element or nested
// document) and record the resulting entries in `result`.
//
// - Array: appends `[]` to `fieldPath` and recurses into the most frequent
//   inner element type; nothing is recorded for the array level itself.
// - Document: records an entry for `fieldPath` only when reached through an
//   array (`arrayType` provided), then hands every nested field to
//   `processField` with `fieldPath` as the prefix.
// - Any other type: records an entry for `fieldPath` only when reached through
//   an array.
//
// `arrayType` is the array whose elements are being described; its samples are
// read via `extractSampleValues`. `fieldProbability` is the probability of the
// owning field and falls back to 1.0 only when it is not provided.
function processElementType(
  elementType: SchemaType,
  fieldPath: string,
  arrayType?: ArraySchemaType,
  fieldProbability?: number
): void {
  const isDocument =
    elementType.name === 'Document' || elementType.bsonType === 'Document';
  // Document takes precedence over Array, matching the original branch order.
  const isArray =
    !isDocument &&
    (elementType.name === 'Array' || elementType.bsonType === 'Array');

  if (isArray) {
    // Process nested arrays recursively with bracket notation
    const nestedArrayType = elementType as ArraySchemaType;
    const innerElementType = getMostFrequentType(nestedArrayType.types || []);

    if (innerElementType) {
      // Add [] to the field path for nested arrays
      processElementType(
        innerElementType,
        `${fieldPath}[]`,
        nestedArrayType,
        fieldProbability
      );
    }
    return;
  }

  // Leaf entry (document or primitive) - only for elements of an array
  if (arrayType) {
    result[fieldPath] = {
      type: elementType.name || elementType.bsonType || 'Mixed',
      sample_values: extractSampleValues(arrayType),
      // `??` instead of `||`: a probability of 0 is a valid value and must not
      // be silently replaced by the 1.0 default.
      probability: fieldProbability ?? 1.0,
    };
  }

  if (isDocument) {
    // Process document fields
    const docType = elementType as DocumentSchemaType;
    if (docType.fields) {
      docType.fields.forEach((nestedField) => {
        processField(nestedField, fieldPath);
      });
    }
  }
}
78+
2279
function processField(field: SchemaField, pathPrefix = ''): void {
2380
const fieldPath = pathPrefix ? `${pathPrefix}.${field.name}` : field.name;
2481

@@ -33,25 +90,16 @@ export function processSchema(schema: Schema): Record<string, FieldInfo> {
3390
const elementType = getMostFrequentType(arrayType.types || []);
3491

3592
if (elementType) {
36-
result[fieldPath] = {
37-
type: elementType.name || elementType.bsonType || 'Mixed',
38-
sample_values: extractSampleValues(arrayType),
39-
isArray: true,
40-
probability: field.probability,
41-
};
93+
// Create bracket notation path for array elements
94+
const arrayElementPath = `${fieldPath}[]`;
4295

43-
// Process nested fields if the array contains documents
44-
if (
45-
elementType.name === 'Document' ||
46-
elementType.bsonType === 'Document'
47-
) {
48-
const docType = elementType as DocumentSchemaType;
49-
if (docType.fields) {
50-
docType.fields.forEach((nestedField) => {
51-
processField(nestedField, fieldPath);
52-
});
53-
}
54-
}
96+
// Recursively process nested structures
97+
processElementType(
98+
elementType,
99+
arrayElementPath,
100+
arrayType,
101+
field.probability
102+
);
55103
}
56104
}
57105
// Handle documents (nested objects)
@@ -63,12 +111,7 @@ export function processSchema(schema: Schema): Record<string, FieldInfo> {
63111
// We can infer its presence from its children.
64112

65113
// Process nested fields (children):
66-
const docType = primaryType as DocumentSchemaType;
67-
if (docType.fields) {
68-
docType.fields.forEach((nestedField) => {
69-
processField(nestedField, fieldPath);
70-
});
71-
}
114+
processElementType(primaryType, fieldPath);
72115
}
73116
// Handle primitive types
74117
else {

0 commit comments

Comments
 (0)