diff --git a/src/schema-analyzer.ts b/src/schema-analyzer.ts
index d06a1fc..631cefa 100644
--- a/src/schema-analyzer.ts
+++ b/src/schema-analyzer.ts
@@ -167,6 +167,9 @@ type AllSchemaParseOptions = {
   storeValues: boolean;
   signal?: AbortSignal;
   storedValuesLengthLimit: number;
+  /** Complexity limit:
+   * The analysis will be aborted if the threshold is exceeded. */
+  distinctFieldsAbortThreshold?: number;
 };
 
 export type SchemaParseOptions = Partial<AllSchemaParseOptions>;
@@ -469,6 +472,7 @@ export class SchemaAnalyzer {
   semanticTypes: SemanticTypeMap;
   options: AllSchemaParseOptions;
   documentsAnalyzed = 0;
+  fieldsCount = 0;
   schemaAnalysisRoot: SchemaAnalysisRoot = {
     fields: Object.create(null),
     count: 0
@@ -508,6 +512,14 @@ export class SchemaAnalyzer {
     }
   }
 
+  increaseFieldCount() {
+    if (!this.options.distinctFieldsAbortThreshold) return;
+    this.fieldsCount++;
+    if (this.fieldsCount > this.options.distinctFieldsAbortThreshold) {
+      throw new Error(`Schema analysis aborted: Fields count above ${this.options.distinctFieldsAbortThreshold}`);
+    }
+  }
+
   getSemanticType(value: BSONValue, path: string[]) {
     // Pass value to semantic type detectors, return first match or undefined.
     const returnValue = Object.entries(this.semanticTypes)
@@ -580,6 +592,7 @@ export class SchemaAnalyzer {
         count: 0,
         types: Object.create(null)
       };
+      this.increaseFieldCount();
     }
 
     const field = schema[fieldName];
diff --git a/test/bloated.test.ts b/test/bloated.test.ts
index 0793ba4..8c96793 100644
--- a/test/bloated.test.ts
+++ b/test/bloated.test.ts
@@ -14,40 +14,100 @@ function generateRandomString(length: number) {
 }
 
 describe('bloated documents', function() {
-  it('really long string is cropped', async function() {
-    const documents = [{
-      str: generateRandomString(20000)
-    }];
-    const schema = await getSchema(documents);
-    const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
-    assert.ok(stringLength <= 10000);
-  });
+  describe('sizeable sample values', function() {
+    it('really long string is cropped', async function() {
+      const documents = [{
+        str: generateRandomString(20000)
+      }];
+      const schema = await getSchema(documents);
+      const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
+      assert.ok(stringLength <= 10000);
+    });
 
-  it('really long code is cropped', async function() {
-    const documents = [{
-      code: new Code(generateRandomString(20000))
-    }];
-    const schema = await getSchema(documents);
-    const codeLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Code).code.length;
-    assert.ok(codeLength <= 10000);
-  });
+    it('really long code is cropped', async function() {
+      const documents = [{
+        code: new Code(generateRandomString(20000))
+      }];
+      const schema = await getSchema(documents);
+      const codeLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Code).code.length;
+      assert.ok(codeLength <= 10000);
+    });
+
+    it('really long binary is cropped', async function() {
+      const documents = [{
+        binData: new Binary(Buffer.from(generateRandomString(20000)), 2)
+      }];
+      const schema = await getSchema(documents);
+      const binary = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Binary);
+      assert.ok(binary.length() <= 10000);
+      assert.strictEqual(binary.sub_type, 2);
+    });
 
-  it('really long binary is cropped', async function() {
-    const documents = [{
-      binData: new Binary(Buffer.from(generateRandomString(20000)), 2)
-    }];
-    const schema = await getSchema(documents);
-    const binary = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Binary);
-    assert.ok(binary.length() <= 10000);
-    assert.strictEqual(binary.sub_type, 2);
+    it('the limit is configurable', async function() {
+      const documents = [{
+        str: generateRandomString(20000)
+      }];
+      const schema = await getSchema(documents, { storedValuesLengthLimit: 5 });
+      const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
+      assert.ok(stringLength === 5);
+    });
   });
 
-  it('the limit is configurable', async function() {
-    const documents = [{
-      str: generateRandomString(20000)
-    }];
-    const schema = await getSchema(documents, { storedValuesLengthLimit: 5 });
-    const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
-    assert.ok(stringLength === 5);
+  describe('high complexity', function() {
+    it('aborts after reaching the given limit', async function() {
+      const documents = [{
+        field1: 'abc',
+        field2: 'bca',
+        field3: 'cba',
+        field4: 'cab',
+        field5: 'bac'
+      }];
+      try {
+        await getSchema(documents, { distinctFieldsAbortThreshold: 4 });
+        assert.fail('Analysis did not throw');
+      } catch (error) {
+        assert.strictEqual((error as Error).message, 'Schema analysis aborted: Fields count above 4');
+      }
+    });
+
+    it('aborts after reaching the given limit - nested', async function() {
+      const documents = [{
+        field1: {
+          field2: {
+            field3: 'abc',
+            field4: 'bca'
+          },
+          field5: 'cab'
+        }
+      }];
+      try {
+        await getSchema(documents, { distinctFieldsAbortThreshold: 4 });
+        assert.fail('Analysis did not throw');
+      } catch (error) {
+        assert.strictEqual((error as Error).message, 'Schema analysis aborted: Fields count above 4');
+      }
+    });
+
+    it('does not count the same field in different documents', async function() {
+      const documents = [{
+        field1: {
+          field2: {
+            field3: 'abc'
+          }
+        }
+      }, {
+        field1: {
+          field2: {
+            field3: 'bca'
+          }
+        }
+      }];
+      try {
+        await getSchema(documents, { distinctFieldsAbortThreshold: 4 });
+        assert.ok('Analysis finished');
+      } catch (error) {
+        assert.fail('Analysis aborted unexpectedly');
+      }
+    });
   });
 });
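
For reviewers trying the change locally, a minimal usage sketch of the new option, mirroring the tests above. The package entry point and import path are assumptions (the test file's imports are outside this diff); only the option name, its semantics, and the error message are taken from the changes themselves.

// Usage sketch (illustrative, not part of the diff): exercising the new
// distinctFieldsAbortThreshold option against a document with many distinct fields.
import assert from 'assert';
import { getSchema } from 'mongodb-schema'; // assumed entry point exposing getSchema

async function demo() {
  // Five distinct fields with a threshold of 4: the analysis should abort.
  const documents = [{ a: 1, b: 2, c: 3, d: 4, e: 5 }];
  try {
    await getSchema(documents, { distinctFieldsAbortThreshold: 4 });
    assert.fail('Analysis did not throw');
  } catch (error) {
    // Error message format comes from SchemaAnalyzer.increaseFieldCount() above.
    assert.strictEqual(
      (error as Error).message,
      'Schema analysis aborted: Fields count above 4'
    );
  }
}

demo().catch(console.error);

Because increaseFieldCount() is only called when a field name is seen for the first time, repeated occurrences of the same path across documents do not advance the counter, which is what the "does not count the same field in different documents" test exercises.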