Skip to content

feat: optional field count threshold #231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/schema-analyzer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ type AllSchemaParseOptions = {
storeValues: boolean;
signal?: AbortSignal;
storedValuesLengthLimit: number;
distinctFieldsAbortThreshold?: number;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the naming. One thing we could do is add a description here if we feel the name doesn't fully encapsulate its usage and intention.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea!

};
export type SchemaParseOptions = Partial<AllSchemaParseOptions>;

Expand Down Expand Up @@ -469,6 +470,7 @@ export class SchemaAnalyzer {
semanticTypes: SemanticTypeMap;
options: AllSchemaParseOptions;
documentsAnalyzed = 0;
fieldsCount = 0;
schemaAnalysisRoot: SchemaAnalysisRoot = {
fields: Object.create(null),
count: 0
Expand Down Expand Up @@ -508,6 +510,14 @@ export class SchemaAnalyzer {
}
}

/**
 * Registers one more field against the optional
 * `distinctFieldsAbortThreshold` option.
 *
 * Does nothing (and leaves `fieldsCount` untouched) when the threshold is
 * falsy, i.e. not configured or set to 0.
 *
 * @throws Error when the running count becomes greater than the threshold.
 */
increaseFieldCount() {
  const { distinctFieldsAbortThreshold: limit } = this.options;
  if (!limit) return;

  this.fieldsCount += 1;
  if (this.fieldsCount <= limit) return;

  // The count has just gone past the configured limit: stop the analysis.
  throw new Error(`Schema analysis aborted: Fields count above ${limit}`);
}

getSemanticType(value: BSONValue, path: string[]) {
// Pass value to semantic type detectors, return first match or undefined.
const returnValue = Object.entries(this.semanticTypes)
Expand Down Expand Up @@ -580,6 +590,7 @@ export class SchemaAnalyzer {
count: 0,
types: Object.create(null)
};
this.increaseFieldCount();
}
const field = schema[fieldName];

Expand Down
122 changes: 91 additions & 31 deletions test/bloated.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,40 +14,100 @@ function generateRandomString(length: number) {
}

describe('bloated documents', function() {
it('really long string is cropped', async function() {
const documents = [{
str: generateRandomString(20000)
}];
const schema = await getSchema(documents);
const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
assert.ok(stringLength <= 10000);
});
describe('sizeable sample values', function() {
it('really long string is cropped', async function() {
const documents = [{
str: generateRandomString(20000)
}];
const schema = await getSchema(documents);
const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
assert.ok(stringLength <= 10000);
});

it('really long code is cropped', async function() {
const documents = [{
code: new Code(generateRandomString(20000))
}];
const schema = await getSchema(documents);
const codeLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Code).code.length;
assert.ok(codeLength <= 10000);
});
it('really long code is cropped', async function() {
const documents = [{
code: new Code(generateRandomString(20000))
}];
const schema = await getSchema(documents);
const codeLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Code).code.length;
assert.ok(codeLength <= 10000);
});

it('really long binary is cropped', async function() {
const documents = [{
binData: new Binary(Buffer.from(generateRandomString(20000)), 2)
}];
const schema = await getSchema(documents);
const binary = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Binary);
assert.ok(binary.length() <= 10000);
assert.strictEqual(binary.sub_type, 2);
});

it('really long binary is cropped', async function() {
const documents = [{
binData: new Binary(Buffer.from(generateRandomString(20000)), 2)
}];
const schema = await getSchema(documents);
const binary = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Binary);
assert.ok(binary.length() <= 10000);
assert.strictEqual(binary.sub_type, 2);
it('the limit is configurable', async function() {
const documents = [{
str: generateRandomString(20000)
}];
const schema = await getSchema(documents, { storedValuesLengthLimit: 5 });
const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
assert.ok(stringLength === 5);
});
});

it('the limit is configurable', async function() {
const documents = [{
str: generateRandomString(20000)
}];
const schema = await getSchema(documents, { storedValuesLengthLimit: 5 });
const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
assert.ok(stringLength === 5);
// Covers the `distinctFieldsAbortThreshold` option: analysis must abort once
// more distinct fields than the threshold have been seen, and must not abort
// otherwise.
describe('high complexity', function() {
  it('aborts after reaching the given limit', async function() {
    // Five distinct top-level fields against a threshold of 4.
    const documents = [{
      field1: 'abc',
      field2: 'bca',
      field3: 'cba',
      field4: 'cab',
      field5: 'bac'
    }];
    // Capture the error instead of calling assert.fail() inside the try
    // block: an assertion thrown there would be swallowed by the same catch
    // and reported as a misleading message mismatch.
    let caught: unknown;
    try {
      await getSchema(documents, { distinctFieldsAbortThreshold: 4 });
    } catch (error) {
      caught = error;
    }
    assert.ok(caught instanceof Error, 'Analysis did not throw');
    assert.strictEqual((caught as Error).message, 'Schema analysis aborted: Fields count above 4');
  });

  it('aborts after reaching the given limit - nested', async function() {
    // Five distinct fields spread over three nesting levels.
    const documents = [{
      field1: {
        field2: {
          field3: 'abc',
          field4: 'bca'
        },
        field5: 'cab'
      }
    }];
    let caught: unknown;
    try {
      await getSchema(documents, { distinctFieldsAbortThreshold: 4 });
    } catch (error) {
      caught = error;
    }
    assert.ok(caught instanceof Error, 'Analysis did not throw');
    assert.strictEqual((caught as Error).message, 'Schema analysis aborted: Fields count above 4');
  });

  it('does not count the same field in different documents', async function() {
    // Both documents share the same three field paths, so only three
    // distinct fields exist and the threshold of 4 must not be exceeded.
    const documents = [{
      field1: {
        field2: {
          field3: 'abc'
        }
      }
    }, {
      field1: {
        field2: {
          field3: 'bca'
        }
      }
    }];
    // No try/catch: if the analysis rejects, the test fails with the real
    // error rather than a generic replacement message.
    await getSchema(documents, { distinctFieldsAbortThreshold: 4 });
  });
});
});
Loading