24
24
25
25
// 16MiB - maximum length in bytes of a string to be encoded.
26
26
#define MAX_ENCODE_BYTE_LEN 16777216
27
+ // Number of bytes which are added to the base string before encryption.
28
+ #define OVERHEAD_BYTES 5
27
29
28
30
static mc_affix_set_t * generate_prefix_or_suffix_tree (const mc_utf8_string_with_bad_char_t * base_str ,
29
- uint32_t unfolded_codepoint_len ,
31
+ uint32_t unfolded_byte_len ,
30
32
uint32_t lb ,
31
33
uint32_t ub ,
32
34
bool is_prefix ) {
33
35
BSON_ASSERT_PARAM (base_str );
34
- // 16 * ceil(unfolded codepoint len / 16)
35
- uint32_t cbclen = 16 * (uint32_t )((unfolded_codepoint_len + 15 ) / 16 );
36
- if (cbclen < lb ) {
36
+ // We encrypt (unfolded string + 5 bytes of extra BSON info) with a 16-byte block cipher.
37
+ uint32_t encrypted_len = 16 * (uint32_t )((unfolded_byte_len + OVERHEAD_BYTES + 15 ) / 16 );
38
+ // Max len of a string that has this encrypted len.
39
+ uint32_t padded_len = encrypted_len - OVERHEAD_BYTES ;
40
+ if (padded_len < lb ) {
37
41
// No valid substrings, return empty tree
38
42
return NULL ;
39
43
}
40
44
41
45
// Total number of substrings
42
- uint32_t msize = BSON_MIN (cbclen , ub ) - lb + 1 ;
46
+ uint32_t msize = BSON_MIN (padded_len , ub ) - lb + 1 ;
43
47
uint32_t folded_codepoint_len = base_str -> codepoint_len - 1 ; // remove one codepoint for 0xFF
44
48
uint32_t real_max_len = BSON_MIN (folded_codepoint_len , ub );
45
49
// Number of actual substrings, excluding padding
@@ -67,19 +71,19 @@ static mc_affix_set_t *generate_prefix_or_suffix_tree(const mc_utf8_string_with_
67
71
}
68
72
69
73
static mc_affix_set_t * generate_suffix_tree (const mc_utf8_string_with_bad_char_t * base_str ,
70
- uint32_t unfolded_codepoint_len ,
74
+ uint32_t unfolded_byte_len ,
71
75
const mc_FLE2SuffixInsertSpec_t * spec ) {
72
76
BSON_ASSERT_PARAM (base_str );
73
77
BSON_ASSERT_PARAM (spec );
74
- return generate_prefix_or_suffix_tree (base_str , unfolded_codepoint_len , spec -> lb , spec -> ub , false);
78
+ return generate_prefix_or_suffix_tree (base_str , unfolded_byte_len , spec -> lb , spec -> ub , false);
75
79
}
76
80
77
81
static mc_affix_set_t * generate_prefix_tree (const mc_utf8_string_with_bad_char_t * base_str ,
78
- uint32_t unfolded_codepoint_len ,
82
+ uint32_t unfolded_byte_len ,
79
83
const mc_FLE2PrefixInsertSpec_t * spec ) {
80
84
BSON_ASSERT_PARAM (base_str );
81
85
BSON_ASSERT_PARAM (spec );
82
- return generate_prefix_or_suffix_tree (base_str , unfolded_codepoint_len , spec -> lb , spec -> ub , true);
86
+ return generate_prefix_or_suffix_tree (base_str , unfolded_byte_len , spec -> lb , spec -> ub , true);
83
87
}
84
88
85
89
static uint32_t calc_number_of_substrings (uint32_t strlen , uint32_t lb , uint32_t ub ) {
@@ -97,13 +101,15 @@ static uint32_t calc_number_of_substrings(uint32_t strlen, uint32_t lb, uint32_t
97
101
}
98
102
99
103
static mc_substring_set_t * generate_substring_tree (const mc_utf8_string_with_bad_char_t * base_str ,
100
- uint32_t unfolded_codepoint_len ,
104
+ uint32_t unfolded_byte_len ,
101
105
const mc_FLE2SubstringInsertSpec_t * spec ) {
102
106
BSON_ASSERT_PARAM (base_str );
103
107
BSON_ASSERT_PARAM (spec );
104
- // 16 * ceil(unfolded len / 16)
105
- uint32_t cbclen = 16 * (uint32_t )((unfolded_codepoint_len + 15 ) / 16 );
106
- if (unfolded_codepoint_len > spec -> mlen || cbclen < spec -> lb ) {
108
+ // We encrypt (unfolded string + 5 bytes of extra BSON info) with a 16-byte block cipher.
109
+ uint32_t encrypted_len = 16 * (uint32_t )((unfolded_byte_len + OVERHEAD_BYTES + 15 ) / 16 );
110
+ // Max len of a string that has this encrypted len.
111
+ uint32_t padded_len = encrypted_len - OVERHEAD_BYTES ;
112
+ if (padded_len < spec -> lb ) {
107
113
// No valid substrings, return empty tree
108
114
return NULL ;
109
115
}
@@ -112,30 +118,30 @@ static mc_substring_set_t *generate_substring_tree(const mc_utf8_string_with_bad
112
118
// justifies why that calculation and this calculation are equivalent.
113
119
// At this point, it is established that:
114
120
// beta <= mlen
115
- // lb <= cbclen
121
+ // lb <= padded_len
116
122
// lb <= ub <= mlen
117
123
//
118
124
// So, the following formula for msize in the OST paper:
119
125
// maxkgram_1 = sum_(j=lb, ub, (mlen - j + 1))
120
- // maxkgram_2 = sum_(j=lb, min(ub, cbclen ), (cbclen - j + 1))
126
+ // maxkgram_2 = sum_(j=lb, min(ub, padded_len ), (padded_len - j + 1))
121
127
// msize = min(maxkgram_1, maxkgram_2)
122
128
// can be simplified to:
123
- // msize = sum_(j=lb, min(ub, cbclen ), (min(mlen, cbclen ) - j + 1))
129
+ // msize = sum_(j=lb, min(ub, padded_len ), (min(mlen, padded_len ) - j + 1))
124
130
//
125
- // because if cbclen <= ub, then it follows that cbclen <= ub <= mlen, and so
131
+ // because if padded_len <= ub, then it follows that padded_len <= ub <= mlen, and so
126
132
// maxkgram_1 = sum_(j=lb, ub, (mlen - j + 1)) # as above
127
- // maxkgram_2 = sum_(j=lb, cbclen , (cbclen - j + 1)) # less or equal to maxkgram_1
133
+ // maxkgram_2 = sum_(j=lb, padded_len , (padded_len - j + 1)) # less or equal to maxkgram_1
128
134
// msize = maxkgram_2
129
- // and if cbclen > ub, then it follows that:
135
+ // and if padded_len > ub, then it follows that:
130
136
// maxkgram_1 = sum_(j=lb, ub, (mlen - j + 1)) # as above
131
- // maxkgram_2 = sum_(j=lb, ub, (cbclen - j + 1)) # same sum bounds as maxkgram_1
132
- // msize = sum_(j=lb, ub, (min(mlen, cbclen ) - j + 1))
137
+ // maxkgram_2 = sum_(j=lb, ub, (padded_len - j + 1)) # same sum bounds as maxkgram_1
138
+ // msize = sum_(j=lb, ub, (min(mlen, padded_len ) - j + 1))
133
139
// in both cases, msize can be rewritten as:
134
- // msize = sum_(j=lb, min(ub, cbclen ), (min(mlen, cbclen ) - j + 1))
140
+ // msize = sum_(j=lb, min(ub, padded_len ), (min(mlen, padded_len ) - j + 1))
135
141
136
142
uint32_t folded_codepoint_len = base_str -> codepoint_len - 1 ;
137
- // If mlen < cbclen , we only need to pad to mlen
138
- uint32_t padded_len = BSON_MIN (spec -> mlen , cbclen );
143
+ // If mlen < padded_len , we only need to pad to mlen
144
+ padded_len = BSON_MIN (spec -> mlen , padded_len );
139
145
// Total number of substrings -- i.e. the number of valid substrings IF the string spanned the full padded length
140
146
uint32_t msize = calc_number_of_substrings (padded_len , spec -> lb , spec -> ub );
141
147
uint32_t n_real_substrings = 0 ;
@@ -185,11 +191,6 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
185
191
CLIENT_ERR ("StrEncode: String passed in was not valid UTF-8" );
186
192
return NULL ;
187
193
}
188
- uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
189
- if (unfolded_codepoint_len == 0 ) {
190
- // Empty string: We set unfolded length to 1 so that we generate fake tokens.
191
- unfolded_codepoint_len = 1 ;
192
- }
193
194
194
195
mc_utf8_string_with_bad_char_t * base_string ;
195
196
if (spec -> casef || spec -> diacf ) {
@@ -213,12 +214,13 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
213
214
// Base string is the folded string plus the 0xFF character
214
215
sets -> base_string = base_string ;
215
216
if (spec -> suffix .set ) {
216
- sets -> suffix_set = generate_suffix_tree (sets -> base_string , unfolded_codepoint_len , & spec -> suffix .value );
217
+ sets -> suffix_set = generate_suffix_tree (sets -> base_string , spec -> len , & spec -> suffix .value );
217
218
}
218
219
if (spec -> prefix .set ) {
219
- sets -> prefix_set = generate_prefix_tree (sets -> base_string , unfolded_codepoint_len , & spec -> prefix .value );
220
+ sets -> prefix_set = generate_prefix_tree (sets -> base_string , spec -> len , & spec -> prefix .value );
220
221
}
221
222
if (spec -> substr .set ) {
223
+ uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
222
224
if (unfolded_codepoint_len > spec -> substr .value .mlen ) {
223
225
CLIENT_ERR ("StrEncode: String passed in was longer than the maximum length for substring indexing -- "
224
226
"String len: %u, max len: %u" ,
@@ -227,7 +229,7 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
227
229
mc_str_encode_sets_destroy (sets );
228
230
return NULL ;
229
231
}
230
- sets -> substring_set = generate_substring_tree (sets -> base_string , unfolded_codepoint_len , & spec -> substr .value );
232
+ sets -> substring_set = generate_substring_tree (sets -> base_string , spec -> len , & spec -> substr .value );
231
233
}
232
234
// Exact string is always equal to the base string up until the bad character
233
235
_mongocrypt_buffer_from_data (& sets -> exact , sets -> base_string -> buf .data , (uint32_t )sets -> base_string -> buf .len - 1 );
0 commit comments