Skip to content

Commit 96841a6

Browse files
fotiniAlvanakiEvergreen Agent
authored andcommitted
SERVER-82197 Handle collation in KeyString in case of Object/array
1 parent 1d78b31 commit 96841a6

File tree

2 files changed

+197
-85
lines changed

2 files changed

+197
-85
lines changed
Lines changed: 148 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,148 @@
1-
/**
2-
* Tests $group execution with increased spilling and a non-simple collation.
3-
*
4-
* @tags: [requires_sbe]
5-
*/
6-
7-
import {assertArrayEq} from "jstests/aggregation/extras/utils.js";
8-
9-
const conn = MongoRunner.runMongod();
10-
const db = conn.getDB('test');
11-
const coll = db.group_pushdown_with_collation;
12-
coll.drop();
13-
for (let i = 0; i < 1000; i++) {
14-
if (i % 3 === 0) {
15-
assert.commandWorked(coll.insert({x: 'a'}));
16-
} else {
17-
assert.commandWorked(coll.insert({x: 'A'}));
18-
}
19-
}
20-
21-
// Test that accumulators respect the collation when the group operation spills to disk.
22-
assert.commandWorked(db.adminCommand(
23-
{setParameter: 1, internalQuerySlotBasedExecutionHashAggForceIncreasedSpilling: true}));
24-
const caseInsensitive = {
25-
collation: {locale: "en_US", strength: 2}
26-
};
27-
let results =
28-
coll.aggregate([{$group: {_id: null, result: {$addToSet: "$x"}}}], caseInsensitive).toArray();
29-
assert.eq(1, results.length, results);
30-
assert.eq({_id: null, result: ["a"]}, results[0]);
31-
32-
// Test that comparisons of the group key respect the collation when the group operation spills to
33-
// disk.
34-
coll.drop();
35-
for (let i = 0; i < 1000; i++) {
36-
if (i % 5 === 0) {
37-
if (i % 10 === 0) {
38-
assert.commandWorked(coll.insert({x: 'b', y: 'D'}));
39-
} else {
40-
assert.commandWorked(coll.insert({x: 'b', y: 'e'}));
41-
}
42-
} else {
43-
assert.commandWorked(coll.insert({x: 'B', y: 'd'}));
44-
}
45-
}
46-
47-
results =
48-
coll.aggregate(
49-
[{
50-
$group:
51-
{_id: {X: "$x", Y: "$y"}, val: {$first: {$toLower: "$y"}}, count: {$count: {}}}
52-
}],
53-
caseInsensitive)
54-
.toArray();
55-
assertArrayEq({
56-
actual: results,
57-
expected: [{val: "d", count: 900}, {val: "e", count: 100}],
58-
fieldsToSkip: ["_id"]
59-
});
60-
61-
// Re-issue the query with the simple collation and check that the grouping becomes case-sensitive.
62-
results =
63-
coll.aggregate([{
64-
$group: {_id: {X: "$x", Y: "$y"}, val: {$first: {$toLower: "$y"}}, count: {$count: {}}}
65-
}])
66-
.toArray();
67-
assertArrayEq({
68-
actual: results,
69-
expected: [{val: "d", count: 800}, {val: "d", count: 100}, {val: "e", count: 100}],
70-
fieldsToSkip: ["_id"]
71-
});
72-
73-
MongoRunner.stopMongod(conn);
1+
/**
2+
* Tests $group execution with increased spilling and a non-simple collation.
3+
*
4+
* @tags: [requires_sbe]
5+
*/
6+
7+
import {assertArrayEq} from "jstests/aggregation/extras/utils.js";
8+
9+
const conn = MongoRunner.runMongod();
10+
const db = conn.getDB('test');
11+
12+
const coll = db.group_pushdown_with_collation;
13+
coll.drop();
14+
for (let i = 0; i < 1000; i++) {
15+
if (i % 3 === 0) {
16+
assert.commandWorked(coll.insert({x: 'a'}));
17+
} else {
18+
assert.commandWorked(coll.insert({x: 'A'}));
19+
}
20+
}
21+
22+
// Test that accumulators respect the collation when the group operation spills to disk.
23+
assert.commandWorked(db.adminCommand(
24+
{setParameter: 1, internalQuerySlotBasedExecutionHashAggForceIncreasedSpilling: true}));
25+
const caseInsensitive = {
26+
collation: {locale: "en_US", strength: 2}
27+
};
28+
let results =
29+
coll.aggregate([{$group: {_id: null, result: {$addToSet: "$x"}}}], caseInsensitive).toArray();
30+
assert.eq(1, results.length, results);
31+
assert.eq({_id: null, result: ["a"]}, results[0]);
32+
33+
// Test that comparisons of the group key respect the collation when the group operation spills to
34+
// disk.
35+
coll.drop();
36+
for (let i = 0; i < 1000; i++) {
37+
if (i % 5 === 0) {
38+
if (i % 10 === 0) {
39+
assert.commandWorked(coll.insert({x: 'b', y: 'D'}));
40+
} else {
41+
assert.commandWorked(coll.insert({x: 'b', y: 'e'}));
42+
}
43+
} else {
44+
assert.commandWorked(coll.insert({x: 'B', y: 'd'}));
45+
}
46+
}
47+
48+
results =
49+
coll.aggregate(
50+
[{
51+
$group:
52+
{_id: {X: "$x", Y: "$y"}, val: {$first: {$toLower: "$y"}}, count: {$count: {}}}
53+
}],
54+
caseInsensitive)
55+
.toArray();
56+
assertArrayEq({
57+
actual: results,
58+
expected: [{val: "d", count: 900}, {val: "e", count: 100}],
59+
fieldsToSkip: ["_id"]
60+
});
61+
62+
// Re-issue the query with the simple collation and check that the grouping becomes case-sensitive.
63+
results =
64+
coll.aggregate([{
65+
$group: {_id: {X: "$x", Y: "$y"}, val: {$first: {$toLower: "$y"}}, count: {$count: {}}}
66+
}])
67+
.toArray();
68+
assertArrayEq({
69+
actual: results,
70+
expected: [{val: "d", count: 800}, {val: "d", count: 100}, {val: "e", count: 100}],
71+
fieldsToSkip: ["_id"]
72+
});
73+
74+
// Test that comparisons of the group key respect the collation when the group operation spills to
75+
// disk and the key is an array.
76+
coll.drop();
77+
assert.commandWorked(coll.insertMany([
78+
{"_id": 0, key: ["A", "b"]},
79+
{"_id": 1, key: ["A", "B"]},
80+
{"_id": 2, key: ["B"]},
81+
{"_id": 3, key: ["b"]},
82+
{"_id": 4, key: ["a", "B"]},
83+
{"_id": 5, key: ["a", "b"]},
84+
]));
85+
86+
results = coll.aggregate([{$group: {_id: "$key", count: {$count: {}}}}], caseInsensitive).toArray();
87+
assertArrayEq({
88+
actual: results,
89+
expected: [{_id: ["A", "b"], count: 4}, {_id: ["B"], count: 2}],
90+
});
91+
92+
// Re-issue the query with the simple collation and check that the grouping becomes case-sensitive.
93+
results = coll.aggregate([{$group: {_id: "$key", count: {$count: {}}}}]).toArray();
94+
95+
assertArrayEq({
96+
actual: results,
97+
expected: [
98+
{_id: ["A", "b"], count: 1},
99+
{_id: ["A", "B"], count: 1},
100+
{_id: ["B"], count: 1},
101+
{_id: ["b"], count: 1},
102+
{_id: ["a", "B"], count: 1},
103+
{_id: ["a", "b"], count: 1}
104+
],
105+
});
106+
107+
// Test that comparisons of the group key respect the collation when the group operation spills to
108+
// disk and the key is an object.
109+
coll.drop();
110+
for (let i = 0; i < 1000; i++) {
111+
if (i % 5 === 0) {
112+
assert.commandWorked(coll.insert({x: {val: 'A'}}));
113+
} else {
114+
assert.commandWorked(coll.insert({x: {val: 'a'}}));
115+
}
116+
}
117+
for (let i = 0; i < 1000; i++) {
118+
if (i % 5 === 0) {
119+
assert.commandWorked(coll.insert({x: {val: 'b'}}));
120+
} else {
121+
assert.commandWorked(coll.insert({x: {val: 'B'}}));
122+
}
123+
}
124+
125+
results = coll.aggregate([{$group: {_id: {X: "$x"}, firstX: {$first: "$x"}, count: {$count: {}}}}],
126+
caseInsensitive)
127+
.toArray();
128+
assertArrayEq({
129+
actual: results,
130+
expected: [{firstX: {val: "b"}, count: 1000}, {firstX: {val: "A"}, count: 1000}],
131+
fieldsToSkip: ["_id"]
132+
});
133+
134+
// Re-issue the query with the simple collation and check that the grouping becomes case-sensitive.
135+
results = coll.aggregate([{$group: {_id: {X: "$x"}, firstX: {$first: "$x"}, count: {$count: {}}}}])
136+
.toArray();
137+
assertArrayEq({
138+
actual: results,
139+
expected: [
140+
{firstX: {val: "b"}, count: 200},
141+
{firstX: {val: "B"}, count: 800},
142+
{firstX: {val: "a"}, count: 800},
143+
{firstX: {val: "A"}, count: 200}
144+
],
145+
fieldsToSkip: ["_id"]
146+
});
147+
148+
MongoRunner.stopMongod(conn);

src/mongo/db/exec/sbe/values/row.cpp

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,14 @@ static void serializeValue(BufBuilder& buf, TypeTags tag, Value val) {
441441
}
442442
}
443443

444+
/**
445+
* If non-null 'collator' is provided during serialization, then the encoding guarantees that values
446+
* which are equal up to the collation will encode to the same result, allowing for collation-aware
447+
* equality comparisons. However, the collator-aware encoded values are not always decodable and the
448+
* key string equality no longer implies equality of the original values. Groups store an original
449+
* copy of the group key alongside the encoded key string when there is a collation, so the key
450+
* string does not need to be decodable.
451+
*/
444452
static void serializeValueIntoKeyString(key_string::Builder& buf,
445453
TypeTags tag,
446454
Value val,
@@ -529,28 +537,45 @@ static void serializeValueIntoKeyString(key_string::Builder& buf,
529537
break;
530538
}
531539
case TypeTags::bsonSymbol: {
540+
// Note that the collation would have to apply to bsonSymbol values to match the
541+
// behavior of the Classic engine when spilling groups to disk. bsonSymbol is
542+
// deprecated, however, and SBE no longer provides strict correctness guarantees about
543+
// computations on bsonSymbol values.
532544
buf.appendBool(true);
533545
buf.appendSymbol(getStringOrSymbolView(tag, val));
534546
break;
535547
}
536548
case TypeTags::ArrayMultiSet:
537549
case TypeTags::ArraySet:
538550
case TypeTags::Array: {
539-
// TODO SERVER-61629: convert this to serialize the 'arr' directly instead of
540-
// constructing a BSONArray.
541-
BSONArrayBuilder builder;
542-
bson::convertToBsonArr(builder, value::ArrayEnumerator{tag, val});
543551
buf.appendBool(true);
544-
buf.appendArray(BSONArray(builder.done()));
552+
553+
if (collator) {
554+
// Using 'hashValue()' guarantees that two arrays whose elements are equal up to the
555+
// collation will translate to the same key string output.
556+
buf.appendNumberLong(hashValue(tag, val, collator));
557+
} else {
558+
// TODO SERVER-61629: convert this to serialize the 'arr' directly instead of
559+
// constructing a BSONArray.
560+
BSONArrayBuilder builder;
561+
bson::convertToBsonArr(builder, value::ArrayEnumerator{tag, val});
562+
buf.appendArray(BSONArray(builder.done()));
563+
}
545564
break;
546565
}
547566
case TypeTags::Object: {
548-
// TODO SERVER-61629: convert this to serialize the 'obj' directly instead of
549-
// constructing a BSONObj.
550-
BSONObjBuilder builder;
551-
bson::convertToBsonObj(builder, getObjectView(val));
552567
buf.appendBool(true);
553-
buf.appendObject(builder.done());
568+
if (collator) {
569+
// Using 'hashValue()' guarantees that two objects whose elements are equal up to
570+
// the collation will translate to the same key string output.
571+
buf.appendNumberLong(hashValue(tag, val, collator));
572+
} else {
573+
// TODO SERVER-61629: convert this to serialize the 'obj' directly instead of
574+
// constructing a BSONObj.
575+
BSONObjBuilder builder;
576+
bson::convertToBsonObj(builder, getObjectView(val));
577+
buf.appendObject(builder.done());
578+
}
554579
break;
555580
}
556581
case TypeTags::ObjectId: {
@@ -560,12 +585,24 @@ static void serializeValueIntoKeyString(key_string::Builder& buf,
560585
}
561586
case TypeTags::bsonObject: {
562587
buf.appendBool(true);
563-
buf.appendObject(BSONObj(getRawPointerView(val)));
588+
if (collator) {
589+
// Using 'hashValue()' guarantees that two objects whose elements are equal up to
590+
// the collation will translate to the same key string output.
591+
buf.appendNumberLong(hashValue(tag, val, collator));
592+
} else {
593+
buf.appendObject(BSONObj(getRawPointerView(val)));
594+
}
564595
break;
565596
}
566597
case TypeTags::bsonArray: {
567598
buf.appendBool(true);
568-
buf.appendArray(BSONArray(BSONObj(getRawPointerView(val))));
599+
if (collator) {
600+
// Using 'hashValue()' guarantees that two arrays whose elements are equal up to the
601+
// collation will translate to the same key string output.
602+
buf.appendNumberLong(hashValue(tag, val, collator));
603+
} else {
604+
buf.appendArray(BSONArray(BSONObj(getRawPointerView(val))));
605+
}
569606
break;
570607
}
571608
case TypeTags::bsonObjectId: {

0 commit comments

Comments
 (0)