Skip to content

fix: crop bloated code and binary COMPASS-8902 #221

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions src/schema-analyzer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ function getBSONType(value: any): SchemaBSONType {
const bsonType = value?._bsontype
? value._bsontype
: Object.prototype.toString.call(value).replace(/^\[object (\w+)\]$/, '$1');

if (bsonType === 'Object') {
// In the resulting schema we rename `Object` to `Document`.
return 'Document';
Expand Down Expand Up @@ -324,10 +323,30 @@ function simplifiedSchema(fields: SchemaAnalysisFieldsMap): SimplifiedSchema {
return finalizeDocumentFieldSchema(fields);
}

function cropStringAt10kCharacters(value: string) {
return value.charCodeAt(10000 - 1) === value.codePointAt(10000 - 1)
? value.slice(0, 10000)
: value.slice(0, 10000 - 1);
/**
 * Crops `value` to at most `limit` UTF-16 code units without cutting a
 * surrogate pair in half.
 *
 * @param value - The string to crop.
 * @param limit - Maximum number of UTF-16 code units to keep. Values below 1
 *   yield an empty string.
 * @returns `value` unchanged when it already fits; otherwise its first `limit`
 *   code units, or `limit - 1` when the unit at index `limit - 1` is the first
 *   half of a surrogate pair (keeping it would leave a dangling high surrogate).
 */
function cropString(value: string, limit: number): string {
  if (limit < 1) return '';
  if (value.length <= limit) return value;
  // `charCodeAt` returns the single code unit at the index, while `codePointAt`
  // returns the full code point when a surrogate pair starts there. They differ
  // exactly when the last kept unit would be a high surrogate whose low
  // surrogate falls outside the limit. Both must look at the SAME index
  // (`limit - 1`), not a hard-coded position.
  return value.charCodeAt(limit - 1) === value.codePointAt(limit - 1)
    ? value.slice(0, limit)
    : value.slice(0, limit - 1);
}

/**
 * Returns a size-capped version of a sampled value.
 *
 * - `String`: cropped via `cropString` to 10000 characters.
 * - `Binary`: when the buffer holds more than 10000 entries, a new `Binary`
 *   with the first 10000 entries and the same `sub_type` is returned.
 * - `Code`: when the code string is 10000 characters or longer, a new `Code`
 *   with the cropped source and the same `scope` is returned.
 * - Any other type (and values within the limits) is returned as-is.
 *
 * @param bsonType - The BSON type name already determined for `value`.
 * @param value - The value to cap.
 * @returns The capped value, or the original value when no capping applies.
 */
function getCappedValue(bsonType: SchemaBSONType, value: BSONValue) {
  const maxLength = 10000;

  switch (bsonType) {
    case 'String':
      return cropString(value as string, maxLength);

    case 'Binary': {
      const binary = value as Binary;
      if (binary.buffer.length > maxLength) {
        return new Binary(binary.buffer.slice(0, maxLength), binary.sub_type);
      }
      return binary;
    }

    case 'Code': {
      const code = value as Code;
      // Note: the threshold here is inclusive (>=), unlike the Binary branch.
      if (code.code.length >= maxLength) {
        return new Code(cropString(code.code, maxLength), code.scope);
      }
      return code;
    }

    default:
      return value;
  }
}

function computeHasDuplicatesForType(type: SchemaAnalysisType, unique?: number) {
Expand Down Expand Up @@ -530,7 +549,7 @@ export class SchemaAnalyzer {
}

type.values.pushSome(
type.name === 'String' ? cropStringAt10kCharacters(value as string) : value
getCappedValue(type.bsonType, value)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few lines up, there's the line

          type.values = bsonType === 'String'
            ? Reservoir(100) : Reservoir(10000);

Do we want to do that for binary and code as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that makes sense, will add

);
}
};
Expand Down
44 changes: 44 additions & 0 deletions test/bloated.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import assert from 'assert';
import { Binary, Code } from 'bson';

import type { PrimitiveSchemaType } from '../src/schema-analyzer';
import getSchema from '../src';

/**
 * Builds a random string of `length` characters drawn from the ASCII letters
 * (upper and lower case) and digits. Uses `Math.random`, so the output is not
 * reproducible between calls.
 *
 * @param length - Number of characters to generate.
 * @returns The generated string (empty when `length` is 0 or negative).
 */
function generateRandomString(length: number) {
  const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
  const picked: string[] = [];
  for (let remaining = length; remaining > 0; remaining--) {
    const index = Math.floor(Math.random() * alphabet.length);
    picked.push(alphabet.charAt(index));
  }
  return picked.join('');
}

// Verifies that oversized String, Code and Binary sample values are capped
// at 10000 units when they end up in the analyzed schema.
describe('bloated documents', function() {
  it('really long string is cropped', async function() {
    // A 20000-character string must not be stored in full.
    const docs = [{
      str: generateRandomString(20000)
    }];
    const schema = await getSchema(docs);
    const stringType = schema.fields[0].types[0] as PrimitiveSchemaType;
    const sampled = stringType.values[0] as string;
    assert.ok(sampled.length <= 10000);
  });

  it('really long code is cropped', async function() {
    // Same check for the source text of a BSON Code value.
    const docs = [{
      code: new Code(generateRandomString(20000))
    }];
    const schema = await getSchema(docs);
    const codeType = schema.fields[0].types[0] as PrimitiveSchemaType;
    const sampled = codeType.values[0] as Code;
    assert.ok(sampled.code.length <= 10000);
  });

  it('really long binary is cropped', async function() {
    // Binary payloads are capped too, and the subtype must survive cropping.
    const docs = [{
      binData: new Binary(Buffer.from(generateRandomString(20000)), 2)
    }];
    const schema = await getSchema(docs);
    const binaryType = schema.fields[0].types[0] as PrimitiveSchemaType;
    const sampled = binaryType.values[0] as Binary;
    assert.ok(sampled.length() <= 10000);
    assert.strictEqual(sampled.sub_type, 2);
  });
});
Loading