18
18
#include "mc-text-search-str-encode-private.h"
19
19
#include "mongocrypt-buffer-private.h"
20
20
#include "mongocrypt.h"
21
+ #include "unicode/fold.h"
21
22
#include <bson/bson.h>
22
23
#include <stdint.h>
23
24
@@ -170,23 +171,47 @@ static uint32_t mc_get_utf8_codepoint_length(const char *buf, uint32_t len) {
170
171
return codepoint_len ;
171
172
}
172
173
173
- // TODO MONGOCRYPT-759 This helper only exists to test folded len != unfolded len; make the test actually use folding
174
- mc_str_encode_sets_t * mc_text_search_str_encode_helper (const mc_FLE2TextSearchInsertSpec_t * spec ,
175
- uint32_t unfolded_codepoint_len ,
176
- mongocrypt_status_t * status ) {
174
+ mc_str_encode_sets_t * mc_text_search_str_encode (const mc_FLE2TextSearchInsertSpec_t * spec ,
175
+ mongocrypt_status_t * status ) {
177
176
BSON_ASSERT_PARAM (spec );
177
+ if (spec -> len > MAX_ENCODE_BYTE_LEN ) {
178
+ CLIENT_ERR ("StrEncode: String passed in was too long: String was %u bytes, but max is %u bytes" ,
179
+ spec -> len ,
180
+ MAX_ENCODE_BYTE_LEN );
181
+ return NULL ;
182
+ }
178
183
179
184
if (!bson_utf8_validate (spec -> v , spec -> len , false /* allow_null */ )) {
180
185
CLIENT_ERR ("StrEncode: String passed in was not valid UTF-8" );
181
186
return NULL ;
182
187
}
188
+ uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
189
+ if (unfolded_codepoint_len == 0 ) {
190
+ // Empty string: We set unfolded length to 1 so that we generate fake tokens.
191
+ unfolded_codepoint_len = 1 ;
192
+ }
183
193
184
- const char * folded_str = spec -> v ;
185
- uint32_t folded_str_bytes_len = spec -> len ;
194
+ mc_utf8_string_with_bad_char_t * base_string ;
195
+ if (spec -> casef || spec -> diacf ) {
196
+ char * folded_str ;
197
+ size_t folded_str_bytes_len ;
198
+ if (!unicode_fold (spec -> v ,
199
+ spec -> len ,
200
+ (spec -> casef * kUnicodeFoldToLower ) | (spec -> diacf * kUnicodeFoldRemoveDiacritics ),
201
+ & folded_str ,
202
+ & folded_str_bytes_len ,
203
+ status )) {
204
+ return NULL ;
205
+ }
206
+ base_string = mc_utf8_string_with_bad_char_from_buffer (folded_str , (uint32_t )folded_str_bytes_len );
207
+ bson_free (folded_str );
208
+ } else {
209
+ base_string = mc_utf8_string_with_bad_char_from_buffer (spec -> v , spec -> len );
210
+ }
186
211
187
212
mc_str_encode_sets_t * sets = bson_malloc0 (sizeof (mc_str_encode_sets_t ));
188
213
// Base string is the input string (case/diacritic-folded only when requested) plus the 0xFF character
189
- sets -> base_string = mc_utf8_string_with_bad_char_from_buffer ( folded_str , folded_str_bytes_len ) ;
214
+ sets -> base_string = base_string ;
190
215
if (spec -> suffix .set ) {
191
216
sets -> suffix_set = generate_suffix_tree (sets -> base_string , unfolded_codepoint_len , & spec -> suffix .value );
192
217
}
@@ -204,33 +229,11 @@ mc_str_encode_sets_t *mc_text_search_str_encode_helper(const mc_FLE2TextSearchIn
204
229
}
205
230
sets -> substring_set = generate_substring_tree (sets -> base_string , unfolded_codepoint_len , & spec -> substr .value );
206
231
}
207
- // Exact string is always the first len characters of the base string
208
- _mongocrypt_buffer_from_data (& sets -> exact , sets -> base_string -> buf .data , folded_str_bytes_len );
232
+ // Exact string is always equal to the base string up until the bad character
233
+ _mongocrypt_buffer_from_data (& sets -> exact , sets -> base_string -> buf .data , ( uint32_t ) sets -> base_string -> buf . len - 1 );
209
234
return sets ;
210
235
}
211
236
212
- mc_str_encode_sets_t * mc_text_search_str_encode (const mc_FLE2TextSearchInsertSpec_t * spec ,
213
- mongocrypt_status_t * status ) {
214
- BSON_ASSERT_PARAM (spec );
215
- if (spec -> len > MAX_ENCODE_BYTE_LEN ) {
216
- CLIENT_ERR ("StrEncode: String passed in was too long: String was %u bytes, but max is %u bytes" ,
217
- spec -> len ,
218
- MAX_ENCODE_BYTE_LEN );
219
- return NULL ;
220
- }
221
- // TODO MONGOCRYPT-759 Implement and use CFold
222
- if (!bson_utf8_validate (spec -> v , spec -> len , false /* allow_null */ )) {
223
- CLIENT_ERR ("StrEncode: String passed in was not valid UTF-8" );
224
- return NULL ;
225
- }
226
- uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
227
- if (unfolded_codepoint_len == 0 ) {
228
- // Empty string: We set unfolded length to 1 so that we generate fake tokens.
229
- unfolded_codepoint_len = 1 ;
230
- }
231
- return mc_text_search_str_encode_helper (spec , unfolded_codepoint_len , status );
232
- }
233
-
234
237
void mc_str_encode_sets_destroy (mc_str_encode_sets_t * sets ) {
235
238
if (!sets ) {
236
239
return ;
0 commit comments