Skip to content

Commit e3d36c2

Browse files
marksg07mdb-ad
authored and committed
MONGOCRYPT-811 Update encryptedTokens generation for text search (#1025)
1 parent 942dbd0 commit e3d36c2

File tree

5 files changed

+118
-47
lines changed

5 files changed

+118
-47
lines changed

src/mc-text-search-str-encode-private.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,8 @@ typedef struct {
3535
mc_substring_set_t *substring_set;
3636
// Encoded exact string.
3737
_mongocrypt_buffer_t exact;
38+
// Total number of tags over all the sets and the exact string.
39+
uint32_t msize;
3840
} mc_str_encode_sets_t;
3941

4042
// Run StrEncode with the given spec.

src/mc-text-search-str-encode.c

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@ static mc_affix_set_t *generate_prefix_or_suffix_tree(const mc_utf8_string_with_
3333
uint32_t unfolded_byte_len,
3434
uint32_t lb,
3535
uint32_t ub,
36-
bool is_prefix) {
36+
bool is_prefix,
37+
uint32_t *out_msize) {
3738
BSON_ASSERT_PARAM(base_str);
39+
BSON_ASSERT_PARAM(out_msize);
3840
// We encrypt (unfolded string + 5 bytes of extra BSON info) with a 16-byte block cipher.
3941
uint32_t encrypted_len = 16 * (uint32_t)((unfolded_byte_len + OVERHEAD_BYTES + 15) / 16);
4042
// Max len of a string that has this encrypted len.
@@ -69,23 +71,24 @@ static mc_affix_set_t *generate_prefix_or_suffix_tree(const mc_utf8_string_with_
6971
n_inserted++;
7072
}
7173
BSON_ASSERT(n_inserted == set_size);
74+
*out_msize += msize;
7275
return set;
7376
}
7477

7578
static mc_affix_set_t *generate_suffix_tree(const mc_utf8_string_with_bad_char_t *base_str,
7679
uint32_t unfolded_byte_len,
77-
const mc_FLE2SuffixInsertSpec_t *spec) {
78-
BSON_ASSERT_PARAM(base_str);
80+
const mc_FLE2SuffixInsertSpec_t *spec,
81+
uint32_t *out_msize) {
7982
BSON_ASSERT_PARAM(spec);
80-
return generate_prefix_or_suffix_tree(base_str, unfolded_byte_len, spec->lb, spec->ub, false);
83+
return generate_prefix_or_suffix_tree(base_str, unfolded_byte_len, spec->lb, spec->ub, false, out_msize);
8184
}
8285

8386
static mc_affix_set_t *generate_prefix_tree(const mc_utf8_string_with_bad_char_t *base_str,
8487
uint32_t unfolded_byte_len,
85-
const mc_FLE2PrefixInsertSpec_t *spec) {
86-
BSON_ASSERT_PARAM(base_str);
88+
const mc_FLE2PrefixInsertSpec_t *spec,
89+
uint32_t *out_msize) {
8790
BSON_ASSERT_PARAM(spec);
88-
return generate_prefix_or_suffix_tree(base_str, unfolded_byte_len, spec->lb, spec->ub, true);
91+
return generate_prefix_or_suffix_tree(base_str, unfolded_byte_len, spec->lb, spec->ub, true, out_msize);
8992
}
9093

9194
static uint32_t calc_number_of_substrings(uint32_t strlen, uint32_t lb, uint32_t ub) {
@@ -104,9 +107,11 @@ static uint32_t calc_number_of_substrings(uint32_t strlen, uint32_t lb, uint32_t
104107

105108
static mc_substring_set_t *generate_substring_tree(const mc_utf8_string_with_bad_char_t *base_str,
106109
uint32_t unfolded_byte_len,
107-
const mc_FLE2SubstringInsertSpec_t *spec) {
110+
const mc_FLE2SubstringInsertSpec_t *spec,
111+
uint32_t *out_msize) {
108112
BSON_ASSERT_PARAM(base_str);
109113
BSON_ASSERT_PARAM(spec);
114+
BSON_ASSERT_PARAM(out_msize);
110115
// We encrypt (unfolded string + 5 bytes of extra BSON info) with a 16-byte block cipher.
111116
uint32_t encrypted_len = 16 * (uint32_t)((unfolded_byte_len + OVERHEAD_BYTES + 15) / 16);
112117
// Max len of a string that has this encrypted len.
@@ -164,6 +169,7 @@ static mc_substring_set_t *generate_substring_tree(const mc_utf8_string_with_bad
164169
BSON_ASSERT(msize > n_real_substrings);
165170
mc_substring_set_increment_fake_string(set, msize - n_real_substrings);
166171
}
172+
*out_msize += msize;
167173
return set;
168174
}
169175

@@ -215,11 +221,13 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
215221
mc_str_encode_sets_t *sets = bson_malloc0(sizeof(mc_str_encode_sets_t));
216222
// Base string is the folded string plus the 0xFF character
217223
sets->base_string = base_string;
224+
// Initialize msize at 1 for the exact string, and grow it for each encoding.
225+
sets->msize = 1;
218226
if (spec->suffix.set) {
219-
sets->suffix_set = generate_suffix_tree(sets->base_string, spec->len, &spec->suffix.value);
227+
sets->suffix_set = generate_suffix_tree(sets->base_string, spec->len, &spec->suffix.value, &sets->msize);
220228
}
221229
if (spec->prefix.set) {
222-
sets->prefix_set = generate_prefix_tree(sets->base_string, spec->len, &spec->prefix.value);
230+
sets->prefix_set = generate_prefix_tree(sets->base_string, spec->len, &spec->prefix.value, &sets->msize);
223231
}
224232
if (spec->substr.set) {
225233
uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length(spec->v, spec->len);
@@ -231,7 +239,7 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
231239
mc_str_encode_sets_destroy(sets);
232240
return NULL;
233241
}
234-
sets->substring_set = generate_substring_tree(sets->base_string, spec->len, &spec->substr.value);
242+
sets->substring_set = generate_substring_tree(sets->base_string, spec->len, &spec->substr.value, &sets->msize);
235243
}
236244
// Exact string is always equal to the base string up until the bad character
237245
_mongocrypt_buffer_from_data(&sets->exact, sets->base_string->buf.data, (uint32_t)sets->base_string->buf.len - 1);

src/mongocrypt-marking.c

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -512,14 +512,21 @@ static bool _fle2_placeholder_aes_aead_encrypt(_mongocrypt_key_broker_t *kb,
512512
// ECCDerivedFromDataTokenAndContentionFactor)
513513
// FLE V2: p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor)
514514
// Range V2: p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor || isLeaf)
515+
// Text search: p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor || msize)
516+
struct encrypted_token_metadata {
517+
mc_optional_bool_t is_leaf; // isLeaf for Range V2, none for all other cases
518+
mc_optional_uint32_t msize; // msize for text search, none for all other cases
519+
};
520+
515521
static bool _fle2_derive_encrypted_token(_mongocrypt_crypto_t *crypto,
516522
_mongocrypt_buffer_t *out,
517-
bool concatentate_leaf,
518523
const mc_CollectionsLevel1Token_t *collectionsLevel1Token,
519524
const _mongocrypt_buffer_t *escDerivedToken,
520525
const _mongocrypt_buffer_t *eccDerivedToken,
521-
mc_optional_bool_t is_leaf,
526+
struct encrypted_token_metadata token_metadata,
522527
mongocrypt_status_t *status) {
528+
// isLeaf and msize should never both be set.
529+
BSON_ASSERT(!token_metadata.is_leaf.set || !token_metadata.msize.set);
523530
mc_ECOCToken_t *ecocToken = mc_ECOCToken_new(crypto, collectionsLevel1Token, status);
524531
if (!ecocToken) {
525532
return false;
@@ -531,10 +538,10 @@ static bool _fle2_derive_encrypted_token(_mongocrypt_crypto_t *crypto,
531538
const _mongocrypt_buffer_t *p = &tmp;
532539
if (!eccDerivedToken) {
533540
// FLE2v2
534-
if (concatentate_leaf && is_leaf.set) {
541+
if (token_metadata.is_leaf.set) {
535542
// Range V2; concat isLeaf
536543
_mongocrypt_buffer_t isLeafBuf;
537-
if (!_mongocrypt_buffer_copy_from_data_and_size(&isLeafBuf, (uint8_t[]){is_leaf.value}, 1)) {
544+
if (!_mongocrypt_buffer_copy_from_data_and_size(&isLeafBuf, (uint8_t[]){token_metadata.is_leaf.value}, 1)) {
538545
CLIENT_ERR("failed to create is_leaf buffer");
539546
goto fail;
540547
}
@@ -544,6 +551,21 @@ static bool _fle2_derive_encrypted_token(_mongocrypt_crypto_t *crypto,
544551
goto fail;
545552
}
546553
_mongocrypt_buffer_cleanup(&isLeafBuf);
554+
} else if (token_metadata.msize.set) {
555+
// Text search; concat msize
556+
_mongocrypt_buffer_t msizeBuf;
557+
// msize is a 3-byte value, so copy the 3 least significant bytes into the buffer in little-endian order.
558+
uint32_t le_msize = BSON_UINT32_TO_LE(token_metadata.msize.value);
559+
if (!_mongocrypt_buffer_copy_from_data_and_size(&msizeBuf, (uint8_t *)&le_msize, 3)) {
560+
CLIENT_ERR("failed to create msize buffer");
561+
goto fail;
562+
}
563+
if (!_mongocrypt_buffer_concat(&tmp, (_mongocrypt_buffer_t[]){*escDerivedToken, msizeBuf}, 2)) {
564+
CLIENT_ERR("failed to allocate buffer");
565+
_mongocrypt_buffer_cleanup(&msizeBuf);
566+
goto fail;
567+
}
568+
_mongocrypt_buffer_cleanup(&msizeBuf);
547569
} else {
548570
p = escDerivedToken;
549571
}
@@ -753,16 +775,23 @@ static bool _mongocrypt_fle2_placeholder_to_insert_update_common(_mongocrypt_key
753775

754776
// p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor)
755777
// Or in Range V2, when using range: p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor || 0x00)
756-
if (!_fle2_derive_encrypted_token(
757-
crypto,
758-
&out->encryptedTokens,
759-
true,
760-
common->collectionsLevel1Token,
761-
&out->escDerivedToken,
762-
NULL, // unused in v2
763-
// If this is a range insert, we append isLeaf to the encryptedTokens. Otherwise, we don't.
764-
placeholder->algorithm == MONGOCRYPT_FLE2_ALGORITHM_RANGE ? OPT_BOOL(false) : (mc_optional_bool_t){0},
765-
status)) {
778+
// Or in Text Search, when using msize: p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor ||
779+
// 0x000000)
780+
struct encrypted_token_metadata et_meta = {{0}};
781+
if (placeholder->algorithm == MONGOCRYPT_FLE2_ALGORITHM_RANGE) {
782+
// For range, we append isLeaf to the encryptedTokens.
783+
et_meta.is_leaf = OPT_BOOL(false);
784+
} else if (placeholder->algorithm == MONGOCRYPT_FLE2_ALGORITHM_TEXT_SEARCH) {
785+
// For text search, we append msize to the encryptedTokens.
786+
et_meta.msize = OPT_U32(0);
787+
}
788+
if (!_fle2_derive_encrypted_token(crypto,
789+
&out->encryptedTokens,
790+
common->collectionsLevel1Token,
791+
&out->escDerivedToken,
792+
NULL, // unused in v2
793+
et_meta,
794+
status)) {
766795
goto fail;
767796
}
768797

@@ -1021,11 +1050,10 @@ static bool _mongocrypt_fle2_placeholder_to_insert_update_ciphertextForRange(_mo
10211050
// Or in Range V2: p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor || isLeaf)
10221051
if (!_fle2_derive_encrypted_token(kb->crypt->crypto,
10231052
&etc.encryptedTokens,
1024-
true,
10251053
edge_tokens.collectionsLevel1Token,
10261054
&etc.escDerivedToken,
10271055
NULL, // ecc unused in FLE2v2
1028-
OPT_BOOL(is_leaf),
1056+
(struct encrypted_token_metadata){.is_leaf = OPT_BOOL(is_leaf)},
10291057
status)) {
10301058
goto fail_loop;
10311059
}
@@ -1087,6 +1115,7 @@ static bool _mongocrypt_fle2_placeholder_to_insert_update_ciphertextForRange(_mo
10871115
mc_Text##Type##TokenSet_t *out, \
10881116
const _mongocrypt_buffer_t *value, \
10891117
int64_t contentionFactor, \
1118+
uint32_t msize, \
10901119
const mc_CollectionsLevel1Token_t *collLevel1Token, \
10911120
const mc_ServerTokenDerivationLevel1Token_t *serverLevel1Token, \
10921121
mongocrypt_status_t *status) { \
@@ -1124,11 +1153,10 @@ static bool _mongocrypt_fle2_placeholder_to_insert_update_ciphertextForRange(_mo
11241153
} \
11251154
if (!_fle2_derive_encrypted_token(kb->crypt->crypto, \
11261155
&out->encryptedTokens, \
1127-
false, \
11281156
collLevel1Token, \
11291157
&out->escDerivedToken, \
11301158
NULL, \
1131-
(mc_optional_bool_t){0}, \
1159+
(struct encrypted_token_metadata){.msize = OPT_U32(msize)}, \
11321160
status)) { \
11331161
return false; \
11341162
} \
@@ -1230,6 +1258,8 @@ static bool _fle2_generate_TextSearchTokenSets(_mongocrypt_key_broker_t *kb,
12301258
&tsts->exact,
12311259
&asBsonValue,
12321260
contentionFactor,
1261+
// For the exact token, report total msize of the token set.
1262+
encodeSets->msize,
12331263
common.collectionsLevel1Token,
12341264
common.serverTokenDerivationLevel1Token,
12351265
status)) {
@@ -1258,10 +1288,12 @@ static bool _fle2_generate_TextSearchTokenSets(_mongocrypt_key_broker_t *kb,
12581288
_mongocrypt_buffer_init(&asBsonValue);
12591289
_mongocrypt_buffer_copy_from_string_as_bson_value(&asBsonValue, substring, (int)bytelen);
12601290

1291+
// For substring, prefix, and suffix tokens, report 0 as the msize.
12611292
if (!_fle2_generate_TextSubstringTokenSet(kb,
12621293
&tset,
12631294
&asBsonValue,
12641295
contentionFactor,
1296+
0 /* msize */,
12651297
common.collectionsLevel1Token,
12661298
common.serverTokenDerivationLevel1Token,
12671299
status)) {
@@ -1302,6 +1334,7 @@ static bool _fle2_generate_TextSearchTokenSets(_mongocrypt_key_broker_t *kb,
13021334
&tset,
13031335
&asBsonValue,
13041336
contentionFactor,
1337+
0 /* msize */,
13051338
common.collectionsLevel1Token,
13061339
common.serverTokenDerivationLevel1Token,
13071340
status)) {
@@ -1342,6 +1375,7 @@ static bool _fle2_generate_TextSearchTokenSets(_mongocrypt_key_broker_t *kb,
13421375
&tset,
13431376
&asBsonValue,
13441377
contentionFactor,
1378+
0 /* msize */,
13451379
common.collectionsLevel1Token,
13461380
common.serverTokenDerivationLevel1Token,
13471381
status)) {
@@ -1543,16 +1577,15 @@ static bool _mongocrypt_fle2_placeholder_to_insert_update_ciphertextForTextSearc
15431577
_mongocrypt_buffer_copy_to(&payload.edcDerivedToken, &payload.escDerivedToken);
15441578
_mongocrypt_buffer_copy_to(&payload.edcDerivedToken, &payload.serverDerivedFromDataToken);
15451579

1546-
// p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor)
1580+
// p := EncryptCTR(ECOCToken, ESCDerivedFromDataTokenAndContentionFactor || 0x000000)
15471581
// Since p is never used for text search, this just sets p to a bogus ciphertext of
15481582
// the correct length.
15491583
if (!_fle2_derive_encrypted_token(kb->crypt->crypto,
15501584
&payload.encryptedTokens,
1551-
false,
15521585
common.collectionsLevel1Token,
15531586
&payload.escDerivedToken, // bogus
15541587
NULL, // unused in FLE2v2
1555-
(mc_optional_bool_t){0},
1588+
(struct encrypted_token_metadata){.msize = OPT_U32(0)},
15561589
status)) {
15571590
goto fail;
15581591
}

test/test-mc-text-search-str-encode.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,12 @@ static void test_nofold_suffix_prefix_case(_mongocrypt_tester_t *tester,
9090
if (lb > padded_len) {
9191
ASSERT(sets->suffix_set == NULL);
9292
ASSERT(sets->prefix_set == NULL);
93+
ASSERT_CMPUINT32(sets->msize, ==, 1 /* for exact string */);
9394
goto CONTINUE;
9495
}
9596

97+
ASSERT_CMPUINT32(sets->msize, ==, n_affixes + 1 /* for exact string */);
98+
9699
TEST_PRINTF("Expecting: n_real_affixes: %u, n_affixes: %u, n_padding: %u\n",
97100
n_real_affixes,
98101
n_affixes,
@@ -263,11 +266,14 @@ static void test_nofold_substring_case(_mongocrypt_tester_t *tester,
263266

264267
if (lb > padded_len) {
265268
ASSERT(sets->substring_set == NULL);
269+
ASSERT_CMPUINT32(sets->msize, ==, 1 /* for exact string */);
266270
goto cleanup;
267271
} else {
268272
ASSERT(sets->substring_set != NULL);
269273
}
270274

275+
ASSERT_CMPUINT32(sets->msize, ==, n_substrings + 1 /* for exact string */);
276+
271277
uint32_t n_real_substrings = calc_unique_substrings(sets->base_string, lb, ub);
272278
uint32_t n_padding = n_substrings - n_real_substrings;
273279

@@ -1161,6 +1167,8 @@ static void _test_text_search_str_encode_multiple(_mongocrypt_tester_t *tester)
11611167
ASSERT_CMPUINT32(sets->exact.len, ==, 9);
11621168
ASSERT_CMPINT(0, ==, memcmp(sets->exact.data, str, 9));
11631169

1170+
ASSERT_CMPUINT32(sets->msize, ==, 1 + 3 + 5 + 3); /* exact + substring + suffix + prefix */
1171+
11641172
mc_str_encode_sets_destroy(sets);
11651173
}
11661174

0 commit comments

Comments
 (0)