Skip to content

Commit facf082

Browse files
authored
MONGOCRYPT-759 Implement CFold (#941)
1 parent 6b00083 commit facf082

11 files changed

+5525
-346
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -153,6 +153,9 @@ set (MONGOCRYPT_SOURCES
153153
src/os_posix/os_mutex.c
154154
src/os_win/os_dll.c
155155
src/os_posix/os_dll.c
156+
src/unicode/case-fold-map.c
157+
src/unicode/diacritic-fold-map.c
158+
src/unicode/fold.c
156159
)
157160

158161
# If MONGOCRYPT_CRYPTO is not set, choose a system default.
@@ -514,6 +517,7 @@ set (TEST_MONGOCRYPT_SOURCES
514517
test/test-mongocrypt-util.c
515518
test/test-mongocrypt.c
516519
test/test-named-kms-providers.c
520+
test/test-unicode-fold.c
517521
)
518522

519523
# Define test-mongocrypt

src/mc-text-search-str-encode-private.h

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -40,11 +40,6 @@ typedef struct {
4040
// Run StrEncode with the given spec.
4141
mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec, mongocrypt_status_t *status);
4242

43-
// TODO MONGOCRYPT-759 This helper only exists to test folded_len != unfolded_len; make the test actually use folding
44-
mc_str_encode_sets_t *mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec,
45-
uint32_t unfolded_len,
46-
mongocrypt_status_t *status);
47-
4843
void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets);
4944

5045
#endif /* MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H */

src/mc-text-search-str-encode.c

Lines changed: 34 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
#include "mc-text-search-str-encode-private.h"
1919
#include "mongocrypt-buffer-private.h"
2020
#include "mongocrypt.h"
21+
#include "unicode/fold.h"
2122
#include <bson/bson.h>
2223
#include <stdint.h>
2324

@@ -170,23 +171,47 @@ static uint32_t mc_get_utf8_codepoint_length(const char *buf, uint32_t len) {
170171
return codepoint_len;
171172
}
172173

173-
// TODO MONGOCRYPT-759 This helper only exists to test folded len != unfolded len; make the test actually use folding
174-
mc_str_encode_sets_t *mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec,
175-
uint32_t unfolded_codepoint_len,
176-
mongocrypt_status_t *status) {
174+
mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec,
175+
mongocrypt_status_t *status) {
177176
BSON_ASSERT_PARAM(spec);
177+
if (spec->len > MAX_ENCODE_BYTE_LEN) {
178+
CLIENT_ERR("StrEncode: String passed in was too long: String was %u bytes, but max is %u bytes",
179+
spec->len,
180+
MAX_ENCODE_BYTE_LEN);
181+
return NULL;
182+
}
178183

179184
if (!bson_utf8_validate(spec->v, spec->len, false /* allow_null */)) {
180185
CLIENT_ERR("StrEncode: String passed in was not valid UTF-8");
181186
return NULL;
182187
}
188+
uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length(spec->v, spec->len);
189+
if (unfolded_codepoint_len == 0) {
190+
// Empty string: We set unfolded length to 1 so that we generate fake tokens.
191+
unfolded_codepoint_len = 1;
192+
}
183193

184-
const char *folded_str = spec->v;
185-
uint32_t folded_str_bytes_len = spec->len;
194+
mc_utf8_string_with_bad_char_t *base_string;
195+
if (spec->casef || spec->diacf) {
196+
char *folded_str;
197+
size_t folded_str_bytes_len;
198+
if (!unicode_fold(spec->v,
199+
spec->len,
200+
(spec->casef * kUnicodeFoldToLower) | (spec->diacf * kUnicodeFoldRemoveDiacritics),
201+
&folded_str,
202+
&folded_str_bytes_len,
203+
status)) {
204+
return NULL;
205+
}
206+
base_string = mc_utf8_string_with_bad_char_from_buffer(folded_str, (uint32_t)folded_str_bytes_len);
207+
bson_free(folded_str);
208+
} else {
209+
base_string = mc_utf8_string_with_bad_char_from_buffer(spec->v, spec->len);
210+
}
186211

187212
mc_str_encode_sets_t *sets = bson_malloc0(sizeof(mc_str_encode_sets_t));
188213
// Base string is the folded string plus the 0xFF character
189-
sets->base_string = mc_utf8_string_with_bad_char_from_buffer(folded_str, folded_str_bytes_len);
214+
sets->base_string = base_string;
190215
if (spec->suffix.set) {
191216
sets->suffix_set = generate_suffix_tree(sets->base_string, unfolded_codepoint_len, &spec->suffix.value);
192217
}
@@ -204,33 +229,11 @@ mc_str_encode_sets_t *mc_text_search_str_encode_helper(const mc_FLE2TextSearchIn
204229
}
205230
sets->substring_set = generate_substring_tree(sets->base_string, unfolded_codepoint_len, &spec->substr.value);
206231
}
207-
// Exact string is always the first len characters of the base string
208-
_mongocrypt_buffer_from_data(&sets->exact, sets->base_string->buf.data, folded_str_bytes_len);
232+
// Exact string is always equal to the base string up until the bad character
233+
_mongocrypt_buffer_from_data(&sets->exact, sets->base_string->buf.data, (uint32_t)sets->base_string->buf.len - 1);
209234
return sets;
210235
}
211236

212-
mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec,
213-
mongocrypt_status_t *status) {
214-
BSON_ASSERT_PARAM(spec);
215-
if (spec->len > MAX_ENCODE_BYTE_LEN) {
216-
CLIENT_ERR("StrEncode: String passed in was too long: String was %u bytes, but max is %u bytes",
217-
spec->len,
218-
MAX_ENCODE_BYTE_LEN);
219-
return NULL;
220-
}
221-
// TODO MONGOCRYPT-759 Implement and use CFold
222-
if (!bson_utf8_validate(spec->v, spec->len, false /* allow_null */)) {
223-
CLIENT_ERR("StrEncode: String passed in was not valid UTF-8");
224-
return NULL;
225-
}
226-
uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length(spec->v, spec->len);
227-
if (unfolded_codepoint_len == 0) {
228-
// Empty string: We set unfolded length to 1 so that we generate fake tokens.
229-
unfolded_codepoint_len = 1;
230-
}
231-
return mc_text_search_str_encode_helper(spec, unfolded_codepoint_len, status);
232-
}
233-
234237
void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets) {
235238
if (!sets) {
236239
return;

0 commit comments

Comments
 (0)