diff --git a/src/schema-analyzer.ts b/src/schema-analyzer.ts
index 29817bf..f01cdf7 100644
--- a/src/schema-analyzer.ts
+++ b/src/schema-analyzer.ts
@@ -209,7 +209,6 @@ function getBSONType(value: any): SchemaBSONType {
   const bsonType = value?._bsontype
     ? value._bsontype
     : Object.prototype.toString.call(value).replace(/^\[object (\w+)\]$/, '$1');
-
   if (bsonType === 'Object') {
     // In the resulting schema we rename `Object` to `Document`.
     return 'Document';
@@ -324,10 +323,40 @@ function simplifiedSchema(fields: SchemaAnalysisFieldsMap): SimplifiedSchema {
   return finalizeDocumentFieldSchema(fields);
 }
 
-function cropStringAt10kCharacters(value: string) {
-  return value.charCodeAt(10000 - 1) === value.codePointAt(10000 - 1)
-    ? value.slice(0, 10000)
-    : value.slice(0, 10000 - 1);
+/**
+ * Crops `value` to at most `limit` UTF-16 code units without splitting a
+ * surrogate pair: if the code unit at index `limit - 1` starts a pair, it is
+ * dropped too, so the result may be one unit shorter than `limit`.
+ */
+function cropString(value: string, limit: number) {
+  if (limit < 1) return '';
+  return value.charCodeAt(limit - 1) === value.codePointAt(limit - 1)
+    ? value.slice(0, limit)
+    : value.slice(0, limit - 1);
+}
+
+/**
+ * Caps the size of a sampled value before it is stored: strings and `Code`
+ * sources are cropped to 10000 characters and `Binary` buffers to 10000
+ * bytes. Values of any other type are returned unchanged.
+ */
+function getCappedValue(bsonType: SchemaBSONType, value: BSONValue) {
+  if (bsonType === 'String') {
+    return cropString(value as string, 10000);
+  }
+  if (bsonType === 'Binary') {
+    value = value as Binary;
+    return value.buffer.length > 10000
+      ? new Binary(value.buffer.slice(0, 10000), value.sub_type)
+      : value;
+  }
+  if (bsonType === 'Code') {
+    value = value as Code;
+    return (value.code.length >= 10000)
+      ? new Code(cropString(value.code, 10000), value.scope)
+      : value;
+  }
+  return value;
 }
 
 function computeHasDuplicatesForType(type: SchemaAnalysisType, unique?: number) {
@@ -525,12 +554,12 @@ export class SchemaAnalyzer {
       } else if (this.options.storeValues && !isNullType(type)) {
         // When the `storeValues` option is enabled, store some example values.
         if (!type.values) {
-          type.values = bsonType === 'String'
+          type.values = ['String', 'Binary', 'Code'].includes(bsonType)
             ? Reservoir(100)
             : Reservoir(10000);
         }
         type.values.pushSome(
-          type.name === 'String' ? cropStringAt10kCharacters(value as string) : value
+          getCappedValue(type.bsonType, value)
         );
       }
     };
diff --git a/test/bloated.test.ts b/test/bloated.test.ts
new file mode 100644
index 0000000..17adfeb
--- /dev/null
+++ b/test/bloated.test.ts
@@ -0,0 +1,44 @@
+import assert from 'assert';
+import { Binary, Code } from 'bson';
+
+import type { PrimitiveSchemaType } from '../src/schema-analyzer';
+import getSchema from '../src';
+
+function generateRandomString(length: number) {
+  const chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
+  let result = '';
+  for (let i = 0; i < length; i++) {
+    result += chars.charAt(Math.floor(Math.random() * chars.length));
+  }
+  return result;
+}
+
+describe('bloated documents', function() {
+  it('really long string is cropped', async function() {
+    const documents = [{
+      str: generateRandomString(20000)
+    }];
+    const schema = await getSchema(documents);
+    const stringLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as string).length;
+    assert.ok(stringLength <= 10000);
+  });
+
+  it('really long code is cropped', async function() {
+    const documents = [{
+      code: new Code(generateRandomString(20000))
+    }];
+    const schema = await getSchema(documents);
+    const codeLength = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Code).code.length;
+    assert.ok(codeLength <= 10000);
+  });
+
+  it('really long binary is cropped', async function() {
+    const documents = [{
+      binData: new Binary(Buffer.from(generateRandomString(20000)), 2)
+    }];
+    const schema = await getSchema(documents);
+    const binary = ((schema.fields[0].types[0] as PrimitiveSchemaType).values[0] as Binary);
+    assert.ok(binary.length() <= 10000);
+    assert.strictEqual(binary.sub_type, 2);
+  });
+});