diff --git a/builtin-functions/kphp-light/stdlib/server-functions.txt b/builtin-functions/kphp-light/stdlib/server-functions.txt index 5a62421a11..e424eec82c 100644 --- a/builtin-functions/kphp-light/stdlib/server-functions.txt +++ b/builtin-functions/kphp-light/stdlib/server-functions.txt @@ -69,6 +69,8 @@ function setlocale ($category ::: int, $locale ::: string) ::: string | false; function memory_get_detailed_stats() ::: int[]; +function prepare_search_query ($query ::: string) ::: string; + // ===== UNSUPPORTED ===== /** @kphp-extern-func-info stub */ @@ -130,7 +132,3 @@ function flush() ::: void; define('PHP_QUERY_RFC1738', 1); define('PHP_QUERY_RFC3986', 2); - -/** @kphp-extern-func-info stub generation-required */ -function prepare_search_query ($query ::: string) ::: string; - diff --git a/common/unicode/unicode-utils.cpp b/common/unicode/unicode-utils.cpp index 646997ab86..ebeb7b05ed 100644 --- a/common/unicode/unicode-utils.cpp +++ b/common/unicode/unicode-utils.cpp @@ -93,11 +93,10 @@ int prepare_search_string(int* input) { return output - input; } -#define MAX_NAME_SIZE 65536 -static char prep_buf[4 * MAX_NAME_SIZE + 4]; -int prep_ibuf[MAX_NAME_SIZE + 4]; -static int prep_ibuf_res[MAX_NAME_SIZE + 4]; -static int* words_ibuf[MAX_NAME_SIZE + 4]; +static char prep_buf[MAX_NAME_BYTES_SIZE]; +int prep_ibuf[MAX_NAME_CODE_POINTS_SIZE]; +static int prep_ibuf_res[MAX_NAME_CODE_POINTS_SIZE]; +static int* words_ibuf[MAX_NAME_CODE_POINTS_SIZE]; int stricmp_void(const void* x, const void* y) { const int* s1 = *(const int**)x; diff --git a/common/unicode/unicode-utils.h b/common/unicode/unicode-utils.h index fbbbe516b5..fb214488c0 100644 --- a/common/unicode/unicode-utils.h +++ b/common/unicode/unicode-utils.h @@ -4,6 +4,12 @@ #pragma once +#include + +inline constexpr size_t MAX_NAME_SIZE = 65536; +inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4; +inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4; + int unicode_toupper(int 
code); int unicode_tolower(int code); const char* clean_str(const char* x); diff --git a/common/unicode/utf8-utils.cpp b/common/unicode/utf8-utils.cpp index fbf65bee80..16c6be0aaa 100644 --- a/common/unicode/utf8-utils.cpp +++ b/common/unicode/utf8-utils.cpp @@ -4,6 +4,7 @@ #include "common/unicode/utf8-utils.h" +#include #include #include #include @@ -991,164 +992,165 @@ int simplify_character(int c) { } } -const int _s_1__[] = {97, 0}; -const int _v_1__[] = {1072, 0}; -const int _s_2__[] = {98, 0}; -const int _v_2__[] = {1073, 0}; -const int _s_3__[] = {99, 0}; -const int _v_3__[] = {1082, 0}; -const int _s_4__[] = {99, 104, 0}; -const int _v_4__[] = {1095, 0}; -const int _s_5__[] = {100, 0}; -const int _v_5__[] = {1076, 0}; -const int _s_6__[] = {101, 0}; -const int _v_6__[] = {1077, 0}; -const int _s_7__[] = {101, 105, 0}; -const int _v_7__[] = {1077, 1081, 0}; -const int _s_8__[] = {101, 121, 0}; -const int _v_8__[] = {1077, 1081, 0}; -const int _s_9__[] = {102, 0}; -const int _v_9__[] = {1092, 0}; -const int _s_10__[] = {103, 0}; -const int _v_10__[] = {1075, 0}; -const int _s_11__[] = {104, 0}; -const int _v_11__[] = {1093, 0}; -const int _s_12__[] = {105, 0}; -const int _v_12__[] = {1080, 0}; -const int _s_13__[] = {105, 97, 0}; -const int _v_13__[] = {1080, 1103, 0}; -const int _s_14__[] = {105, 121, 0}; -const int _v_14__[] = {1080, 1081, 0}; -const int _s_15__[] = {106, 0}; -const int _v_15__[] = {1081, 0}; -const int _s_16__[] = {106, 111, 0}; -const int _v_16__[] = {1077, 0}; -const int _s_17__[] = {106, 117, 0}; -const int _v_17__[] = {1102, 0}; -const int _s_18__[] = {106, 97, 0}; -const int _v_18__[] = {1103, 0}; -const int _s_19__[] = {107, 0}; -const int _v_19__[] = {1082, 0}; -const int _s_20__[] = {107, 104, 0}; -const int _v_20__[] = {1093, 0}; -const int _s_21__[] = {108, 0}; -const int _v_21__[] = {1083, 0}; -const int _s_22__[] = {109, 0}; -const int _v_22__[] = {1084, 0}; -const int _s_23__[] = {110, 0}; -const int _v_23__[] = {1085, 0}; 
-const int _s_24__[] = {111, 0}; -const int _v_24__[] = {1086, 0}; -const int _s_25__[] = {112, 0}; -const int _v_25__[] = {1087, 0}; -const int _s_26__[] = {113, 0}; -const int _v_26__[] = {1082, 0}; -const int _s_27__[] = {114, 0}; -const int _v_27__[] = {1088, 0}; -const int _s_28__[] = {115, 0}; -const int _v_28__[] = {1089, 0}; -const int _s_29__[] = {115, 104, 0}; -const int _v_29__[] = {1096, 0}; -const int _s_30__[] = {115, 104, 99, 104, 0}; -const int _v_30__[] = {1097, 0}; -const int _s_31__[] = {115, 99, 104, 0}; -const int _v_31__[] = {1097, 0}; -const int _s_32__[] = {116, 0}; -const int _v_32__[] = {1090, 0}; -const int _s_33__[] = {116, 115, 0}; -const int _v_33__[] = {1094, 0}; -const int _s_34__[] = {117, 0}; -const int _v_34__[] = {1091, 0}; -const int _s_35__[] = {118, 0}; -const int _v_35__[] = {1074, 0}; -const int _s_36__[] = {119, 0}; -const int _v_36__[] = {1074, 0}; -const int _s_37__[] = {120, 0}; -const int _v_37__[] = {1082, 1089, 0}; -const int _s_38__[] = {121, 0}; -const int _v_38__[] = {1080, 0}; -const int _s_39__[] = {121, 111, 0}; -const int _v_39__[] = {1077, 0}; -const int _s_40__[] = {121, 117, 0}; -const int _v_40__[] = {1102, 0}; -const int _s_41__[] = {121, 97, 0}; -const int _v_41__[] = {1103, 0}; -const int _s_42__[] = {122, 0}; -const int _v_42__[] = {1079, 0}; -const int _s_43__[] = {122, 104, 0}; -const int _v_43__[] = {1078, 0}; -const int _s_44__[] = {1072, 0}; -const int _v_44__[] = {97, 0}; -const int _s_45__[] = {1073, 0}; -const int _v_45__[] = {98, 0}; -const int _s_46__[] = {1074, 0}; -const int _v_46__[] = {118, 0}; -const int _s_47__[] = {1075, 0}; -const int _v_47__[] = {103, 0}; -const int _s_48__[] = {1076, 0}; -const int _v_48__[] = {100, 0}; -const int _s_49__[] = {1077, 0}; -const int _v_49__[] = {101, 0}; -const int _s_50__[] = {1105, 0}; -const int _v_50__[] = {101, 0}; -const int _s_51__[] = {1078, 0}; -const int _v_51__[] = {122, 104, 0}; -const int _s_52__[] = {1079, 0}; -const int _v_52__[] = {122, 
0}; -const int _s_53__[] = {1080, 0}; -const int _v_53__[] = {105, 0}; -const int _s_54__[] = {1080, 1081, 0}; -const int _v_54__[] = {121, 0}; -const int _s_55__[] = {1080, 1103, 0}; -const int _v_55__[] = {105, 97, 0}; -const int _s_56__[] = {1081, 0}; -const int _v_56__[] = {121, 0}; -const int _s_57__[] = {1082, 0}; -const int _v_57__[] = {107, 0}; -const int _s_58__[] = {1082, 1089, 0}; -const int _v_58__[] = {120, 0}; -const int _s_59__[] = {1083, 0}; -const int _v_59__[] = {108, 0}; -const int _s_60__[] = {1084, 0}; -const int _v_60__[] = {109, 0}; -const int _s_61__[] = {1085, 0}; -const int _v_61__[] = {110, 0}; -const int _s_62__[] = {1086, 0}; -const int _v_62__[] = {111, 0}; -const int _s_63__[] = {1087, 0}; -const int _v_63__[] = {112, 0}; -const int _s_64__[] = {1088, 0}; -const int _v_64__[] = {114, 0}; -const int _s_65__[] = {1089, 0}; -const int _v_65__[] = {115, 0}; -const int _s_66__[] = {1090, 0}; -const int _v_66__[] = {116, 0}; -const int _s_67__[] = {1091, 0}; -const int _v_67__[] = {117, 0}; -const int _s_68__[] = {1092, 0}; -const int _v_68__[] = {102, 0}; -const int _s_69__[] = {1093, 0}; -const int _v_69__[] = {107, 104, 0}; -const int _s_70__[] = {1094, 0}; -const int _v_70__[] = {116, 115, 0}; -const int _s_71__[] = {1095, 0}; -const int _v_71__[] = {99, 104, 0}; -const int _s_72__[] = {1096, 0}; -const int _v_72__[] = {115, 104, 0}; -const int _s_73__[] = {1097, 0}; -const int _v_73__[] = {115, 104, 99, 104, 0}; -const int _s_74__[] = {1098, 0}; -const int _v_74__[] = {0}; -const int _s_75__[] = {1099, 0}; -const int _v_75__[] = {121, 0}; -const int _s_76__[] = {1100, 0}; -const int _v_76__[] = {0}; -const int _s_77__[] = {1101, 0}; -const int _v_77__[] = {101, 0}; -const int _s_78__[] = {1102, 0}; -const int _v_78__[] = {121, 117, 0}; -const int _s_79__[] = {1103, 0}; -const int _v_79__[] = {121, 97, 0}; +// TODO does constexpr std::array enough for safe use in runtime-light ? 
+constexpr std::array _s_1__{97, 0}; +constexpr std::array _v_1__{1072, 0}; +constexpr std::array _s_2__{98, 0}; +constexpr std::array _v_2__{1073, 0}; +constexpr std::array _s_3__{99, 0}; +constexpr std::array _v_3__{1082, 0}; +constexpr std::array _s_4__{99, 104, 0}; +constexpr std::array _v_4__{1095, 0}; +constexpr std::array _s_5__{100, 0}; +constexpr std::array _v_5__{1076, 0}; +constexpr std::array _s_6__{101, 0}; +constexpr std::array _v_6__{1077, 0}; +constexpr std::array _s_7__{101, 105, 0}; +constexpr std::array _v_7__{1077, 1081, 0}; +constexpr std::array _s_8__{101, 121, 0}; +constexpr std::array _v_8__{1077, 1081, 0}; +constexpr std::array _s_9__{102, 0}; +constexpr std::array _v_9__{1092, 0}; +constexpr std::array _s_10__{103, 0}; +constexpr std::array _v_10__{1075, 0}; +constexpr std::array _s_11__{104, 0}; +constexpr std::array _v_11__{1093, 0}; +constexpr std::array _s_12__{105, 0}; +constexpr std::array _v_12__{1080, 0}; +constexpr std::array _s_13__{105, 97, 0}; +constexpr std::array _v_13__{1080, 1103, 0}; +constexpr std::array _s_14__{105, 121, 0}; +constexpr std::array _v_14__{1080, 1081, 0}; +constexpr std::array _s_15__{106, 0}; +constexpr std::array _v_15__{1081, 0}; +constexpr std::array _s_16__{106, 111, 0}; +constexpr std::array _v_16__{1077, 0}; +constexpr std::array _s_17__{106, 117, 0}; +constexpr std::array _v_17__{1102, 0}; +constexpr std::array _s_18__{106, 97, 0}; +constexpr std::array _v_18__{1103, 0}; +constexpr std::array _s_19__{107, 0}; +constexpr std::array _v_19__{1082, 0}; +constexpr std::array _s_20__{107, 104, 0}; +constexpr std::array _v_20__{1093, 0}; +constexpr std::array _s_21__{108, 0}; +constexpr std::array _v_21__{1083, 0}; +constexpr std::array _s_22__{109, 0}; +constexpr std::array _v_22__{1084, 0}; +constexpr std::array _s_23__{110, 0}; +constexpr std::array _v_23__{1085, 0}; +constexpr std::array _s_24__{111, 0}; +constexpr std::array _v_24__{1086, 0}; +constexpr std::array _s_25__{112, 0}; +constexpr 
std::array _v_25__{1087, 0}; +constexpr std::array _s_26__{113, 0}; +constexpr std::array _v_26__{1082, 0}; +constexpr std::array _s_27__{114, 0}; +constexpr std::array _v_27__{1088, 0}; +constexpr std::array _s_28__{115, 0}; +constexpr std::array _v_28__{1089, 0}; +constexpr std::array _s_29__{115, 104, 0}; +constexpr std::array _v_29__{1096, 0}; +constexpr std::array _s_30__{115, 104, 99, 104, 0}; +constexpr std::array _v_30__{1097, 0}; +constexpr std::array _s_31__{115, 99, 104, 0}; +constexpr std::array _v_31__{1097, 0}; +constexpr std::array _s_32__{116, 0}; +constexpr std::array _v_32__{1090, 0}; +constexpr std::array _s_33__{116, 115, 0}; +constexpr std::array _v_33__{1094, 0}; +constexpr std::array _s_34__{117, 0}; +constexpr std::array _v_34__{1091, 0}; +constexpr std::array _s_35__{118, 0}; +constexpr std::array _v_35__{1074, 0}; +constexpr std::array _s_36__{119, 0}; +constexpr std::array _v_36__{1074, 0}; +constexpr std::array _s_37__{120, 0}; +constexpr std::array _v_37__{1082, 1089, 0}; +constexpr std::array _s_38__{121, 0}; +constexpr std::array _v_38__{1080, 0}; +constexpr std::array _s_39__{121, 111, 0}; +constexpr std::array _v_39__{1077, 0}; +constexpr std::array _s_40__{121, 117, 0}; +constexpr std::array _v_40__{1102, 0}; +constexpr std::array _s_41__{121, 97, 0}; +constexpr std::array _v_41__{1103, 0}; +constexpr std::array _s_42__{122, 0}; +constexpr std::array _v_42__{1079, 0}; +constexpr std::array _s_43__{122, 104, 0}; +constexpr std::array _v_43__{1078, 0}; +constexpr std::array _s_44__{1072, 0}; +constexpr std::array _v_44__{97, 0}; +constexpr std::array _s_45__{1073, 0}; +constexpr std::array _v_45__{98, 0}; +constexpr std::array _s_46__{1074, 0}; +constexpr std::array _v_46__{118, 0}; +constexpr std::array _s_47__{1075, 0}; +constexpr std::array _v_47__{103, 0}; +constexpr std::array _s_48__{1076, 0}; +constexpr std::array _v_48__{100, 0}; +constexpr std::array _s_49__{1077, 0}; +constexpr std::array _v_49__{101, 0}; +constexpr 
std::array _s_50__{1105, 0}; +constexpr std::array _v_50__{101, 0}; +constexpr std::array _s_51__{1078, 0}; +constexpr std::array _v_51__{122, 104, 0}; +constexpr std::array _s_52__{1079, 0}; +constexpr std::array _v_52__{122, 0}; +constexpr std::array _s_53__{1080, 0}; +constexpr std::array _v_53__{105, 0}; +constexpr std::array _s_54__{1080, 1081, 0}; +constexpr std::array _v_54__{121, 0}; +constexpr std::array _s_55__{1080, 1103, 0}; +constexpr std::array _v_55__{105, 97, 0}; +constexpr std::array _s_56__{1081, 0}; +constexpr std::array _v_56__{121, 0}; +constexpr std::array _s_57__{1082, 0}; +constexpr std::array _v_57__{107, 0}; +constexpr std::array _s_58__{1082, 1089, 0}; +constexpr std::array _v_58__{120, 0}; +constexpr std::array _s_59__{1083, 0}; +constexpr std::array _v_59__{108, 0}; +constexpr std::array _s_60__{1084, 0}; +constexpr std::array _v_60__{109, 0}; +constexpr std::array _s_61__{1085, 0}; +constexpr std::array _v_61__{110, 0}; +constexpr std::array _s_62__{1086, 0}; +constexpr std::array _v_62__{111, 0}; +constexpr std::array _s_63__{1087, 0}; +constexpr std::array _v_63__{112, 0}; +constexpr std::array _s_64__{1088, 0}; +constexpr std::array _v_64__{114, 0}; +constexpr std::array _s_65__{1089, 0}; +constexpr std::array _v_65__{115, 0}; +constexpr std::array _s_66__{1090, 0}; +constexpr std::array _v_66__{116, 0}; +constexpr std::array _s_67__{1091, 0}; +constexpr std::array _v_67__{117, 0}; +constexpr std::array _s_68__{1092, 0}; +constexpr std::array _v_68__{102, 0}; +constexpr std::array _s_69__{1093, 0}; +constexpr std::array _v_69__{107, 104, 0}; +constexpr std::array _s_70__{1094, 0}; +constexpr std::array _v_70__{116, 115, 0}; +constexpr std::array _s_71__{1095, 0}; +constexpr std::array _v_71__{99, 104, 0}; +constexpr std::array _s_72__{1096, 0}; +constexpr std::array _v_72__{115, 104, 0}; +constexpr std::array _s_73__{1097, 0}; +constexpr std::array _v_73__{115, 104, 99, 104, 0}; +constexpr std::array _s_74__{1098, 0}; +constexpr 
std::array _v_74__{0}; +constexpr std::array _s_75__{1099, 0}; +constexpr std::array _v_75__{121, 0}; +constexpr std::array _s_76__{1100, 0}; +constexpr std::array _v_76__{0}; +constexpr std::array _s_77__{1101, 0}; +constexpr std::array _v_77__{101, 0}; +constexpr std::array _s_78__{1102, 0}; +constexpr std::array _v_78__{121, 117, 0}; +constexpr std::array _s_79__{1103, 0}; +constexpr std::array _v_79__{121, 97, 0}; int translit_string_utf8_from_en_to_ru(int* input, int* output) { @@ -1158,8 +1160,8 @@ int translit_string_utf8_from_en_to_ru(int* input, int* output) { k++; \ } \ if (!s[k]) { \ - match_v = v; \ - match_s = s; \ + match_v = v.data(); \ + match_s = s.data(); \ } int i = 0, j = 0, k = 0; @@ -1340,8 +1342,8 @@ int translit_string_utf8_from_ru_to_en(int* input, int* output) { k++; \ } \ if (!s[k]) { \ - match_v = v; \ - match_s = s; \ + match_v = v.data(); \ + match_s = s.data(); \ } int i = 0, j = 0, k = 0; diff --git a/runtime-light/stdlib/stdlib.cmake b/runtime-light/stdlib/stdlib.cmake index 824e92cd33..30d9be0097 100644 --- a/runtime-light/stdlib/stdlib.cmake +++ b/runtime-light/stdlib/stdlib.cmake @@ -36,6 +36,7 @@ prepend( string/regex-functions.cpp string/regex-state.cpp string/string-state.cpp + string/string-functions.cpp system/system-functions.cpp system/system-state.cpp time/time-functions.cpp diff --git a/runtime-light/stdlib/string/string-functions.cpp b/runtime-light/stdlib/string/string-functions.cpp new file mode 100644 index 0000000000..958607b375 --- /dev/null +++ b/runtime-light/stdlib/string/string-functions.cpp @@ -0,0 +1,83 @@ +// Compiler for PHP (aka KPHP) +// Copyright (c) 2025 LLC «V Kontakte» +// Distributed under the GPL v3 License, see LICENSE.notice.txt + +#include "runtime-light/stdlib/string/string-functions.h" + +#include +#include +#include + +#include "auto/common/unicode-utils-auto.h" +#include "runtime-light/k2-platform/k2-api.h" + +namespace string_functions_impl_ { + +/* Search generated ranges for specified 
character */ +int32_t binary_search_ranges(int32_t code) noexcept { + if (code > MAX_UTF8_CODE_POINT) { + return 0; + } + + size_t l{0}; + size_t r{prepare_table_ranges_size}; + while (l < r) { + size_t m{((l + r + 2) >> 2) << 1}; + if (prepare_table_ranges[m] <= code) { + l = m; + } else { + r = m - 2; + } + } + + // prepare_table_ranges[l] - key + // prepare_table_ranges[l + 1] - value + int32_t t{prepare_table_ranges[l + 1]}; + if (t < 0) { + return code - prepare_table_ranges[l] + (~t); + } + if (t <= 0x10ffff) { + return t; + } + switch (t - 0x200000) { + case 0: + return (code & -2); + case 1: + return (code | 1); + case 2: + return ((code - 1) | 1); + default: + k2::exit(1); + } +} + +/* Prepares unicode 0-terminated string input for search, + leaving only digits and letters with diacritics. + Length of string can decrease. + Returns nothing: `code_points` is re-terminated with 0 and shrunk in place to the resulting length. */ +void prepare_search_string(std::span& code_points) noexcept { + size_t output_size{}; + for (size_t i{}; code_points[i] != 0; ++i) { + int32_t c{code_points[i]}; + int32_t new_c{}; + if (static_cast(c) < static_cast(TABLE_SIZE)) { + new_c = static_cast(prepare_table[c]); + } else { + new_c = binary_search_ranges(c); + } + if (new_c != 0) { + // we forbid 2 whitespaces after each other and starting whitespace + if (new_c != WHITESPACE || (output_size > 0 && code_points[output_size - 1] != WHITESPACE)) { + code_points[output_size++] = new_c; + } + } + } + if (output_size > 0 && code_points[output_size - 1] == WHITESPACE) { + // throw out terminating whitespace + --output_size; + } + code_points[output_size] = 0; + code_points = code_points.first(output_size); +} + +} // namespace string_functions_impl_ diff --git a/runtime-light/stdlib/string/string-functions.h b/runtime-light/stdlib/string/string-functions.h index 28b7ad35c6..0f1480ab5d 100644 --- a/runtime-light/stdlib/string/string-functions.h +++ b/runtime-light/stdlib/string/string-functions.h @@ -4,10 +4,169 @@ #pragma once +#include +#include #include 
+#include +#include +#include +#include "common/unicode/unicode-utils.h" +#include "common/unicode/utf8-utils.h" #include "runtime-common/core/runtime-core.h" +#include "runtime-common/stdlib/string/string-context.h" #include "runtime-light/k2-platform/k2-api.h" +#include "runtime-light/stdlib/diagnostics/logs.h" + +namespace string_functions_impl_ { + +inline constexpr size_t __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __WORD_INDICES_SPAN_SIZE_IN_BYTES = sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE; +inline constexpr size_t __RESULT_BYTES_SPAN_SIZE_IN_BYTES = sizeof(std::byte) * MAX_NAME_BYTES_SIZE; + +static_assert(__SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + __WORD_INDICES_SPAN_SIZE_IN_BYTES + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES + + __RESULT_BYTES_SPAN_SIZE_IN_BYTES < + StringLibContext::STATIC_BUFFER_LENGTH); + +inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0; +inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + __SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + __WORD_INDICES_SPAN_SIZE_IN_BYTES; +inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + __RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES; + +inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff}; + +inline constexpr int32_t WHITESPACE{static_cast(' ')}; +inline constexpr int32_t PLUS{static_cast('+')}; + +/* Search generated ranges for specified character */ +int32_t binary_search_ranges(int32_t code) noexcept; + +/* Prepares unicode 0-terminated string input for search, + leaving only digits and letters with diacritics. + Length of string can decrease. + Returns nothing: `code_points` is shrunk in place to the resulting length. 
*/ +void prepare_search_string(std::span& code_points) noexcept; + +inline std::span prepare_str_unicode(std::span code_points) noexcept { + prepare_search_string(code_points); + code_points[code_points.size()] = WHITESPACE; + + auto& string_lib_ctx{StringLibContext::get()}; + auto* word_indices_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), WORD_INDICES_SPAN_BEGIN))}; + // indices of first char of every word in `code_points`. + std::span word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE}; + size_t words_count{}; + size_t i{}; + // looking for the beginnings of the words + while (i < code_points.size()) { + word_start_indices[words_count++] = i; + while (i < code_points.size() && code_points[i] != WHITESPACE) { + ++i; + } + ++i; + } + word_start_indices = word_start_indices.first(words_count); + + auto word_less_cmp{[&code_points](size_t x, size_t y) noexcept -> bool { + while (code_points[x] != WHITESPACE && code_points[x] == code_points[y]) { + ++x; + ++y; + } + if (code_points[x] == WHITESPACE) { + return code_points[y] != WHITESPACE; + } + if (code_points[y] == WHITESPACE) { + return false; + } + return code_points[x] < code_points[y]; + }}; + + std::sort(word_start_indices.begin(), word_start_indices.end(), word_less_cmp); + + size_t uniq_words_count{}; + for (i = 0; i < words_count; ++i) { + // drop duplicates + if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) { + word_start_indices[uniq_words_count++] = word_start_indices[i]; + } else { + word_start_indices[uniq_words_count - 1] = word_start_indices[i]; + } + } + + auto* result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_CODE_POINTS_SPAN_BEGIN))}; + std::span result{result_begin, MAX_NAME_CODE_POINTS_SIZE}; + size_t result_size{}; + // output words with '+' separator + for (i = 0; i < uniq_words_count; ++i) { + size_t ind{word_start_indices[i]}; + while (code_points[ind] != WHITESPACE) { 
+ result[result_size++] = code_points[ind++]; + } + result[result_size++] = PLUS; + } + result[result_size++] = 0; + + kphp::log::assertion(result_size < MAX_NAME_SIZE); + result = result.first(result_size); + return result; +} + +inline std::span clean_str_unicode(std::span source_code_points) noexcept { + std::span prepared_code_points{prepare_str_unicode(source_code_points)}; + + auto& string_lib_ctx{StringLibContext::get()}; + auto* utf8_result_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), RESULT_BYTES_SPAN_BEGIN))}; + std::span utf8_result{utf8_result_begin, MAX_NAME_BYTES_SIZE}; + auto length{static_cast(put_string_utf8(prepared_code_points.data(), reinterpret_cast(utf8_result.data())))}; + kphp::log::assertion(length < utf8_result.size()); + utf8_result = utf8_result.first(length); + + size_t i{}; + size_t result_size{}; + while (i < utf8_result.size()) { + char* c{reinterpret_cast(std::addressof(utf8_result[i]))}; + bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) || + !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) || + (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') && c[4] == '+') || + !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) || + !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) || + !std::strncmp(c, "8233+", 5)}; + do { + if (!skip) { + utf8_result[result_size] = utf8_result[i]; + ++result_size; + } + } while (utf8_result[i++] != static_cast('+')); + } + utf8_result_begin[result_size] = static_cast(0); // result_size may equal length, i.e. one past the shrunk span; the raw buffer still has room (length < MAX_NAME_BYTES_SIZE asserted above) + + return utf8_result.first(result_size); // callers use size() as the string length, so drop the stale tail left behind by skipped words +} + +inline std::span prepare_search_query_impl(std::span x) noexcept { + if (x.empty() || x.size() >= MAX_NAME_SIZE) { + return x; + } + + auto& string_lib_ctx{StringLibContext::get()}; + auto* 
source_code_points_begin{reinterpret_cast(std::next(string_lib_ctx.static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN))}; + std::span source_code_points{ + source_code_points_begin, + MAX_NAME_CODE_POINTS_SIZE, + }; + + html_string_to_utf8(reinterpret_cast(x.data()), source_code_points.data()); + return clean_str_unicode(source_code_points); +} + +} // namespace string_functions_impl_ + +inline string f$prepare_search_query(const string& query) noexcept { + std::span s{ + string_functions_impl_::prepare_search_query_impl({reinterpret_cast(query.c_str()), static_cast(query.size())})}; + return {reinterpret_cast(s.data()), static_cast(s.size())}; +} inline Optional f$setlocale(int64_t category, const string& locale) noexcept { const int32_t i32category{static_cast(category)}; diff --git a/tests/python/tests/prepare_search_query/__init__.py b/tests/python/tests/prepare_search_query/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/python/tests/prepare_search_query/data/example1 b/tests/python/tests/prepare_search_query/data/example1 new file mode 100644 index 0000000000..c59e725370 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example1 @@ -0,0 +1,17 @@ + abacaba dAbAcAbA АбАсАБа йфяывчАСПМИРТОЬЛЩЗБДЮЭ. 
+ К4ЙГЩ ЩГ рщг №кКЙ РШ зй021к 01293г0129 г + + ++_+ +_ +_ +__ ++_ Щ+!"_ №+!_" №+!"_ №+_ "Щ+_ "Щ + + йк й3 к2 + + + +7 88 76кн 68е79 н8г9 ншп + + test test + + test test +test +test +test test TeSt tEsT diff --git a/tests/python/tests/prepare_search_query/data/example10 b/tests/python/tests/prepare_search_query/data/example10 new file mode 100644 index 0000000000..a964b269f1 Binary files /dev/null and b/tests/python/tests/prepare_search_query/data/example10 differ diff --git a/tests/python/tests/prepare_search_query/data/example10_prepared b/tests/python/tests/prepare_search_query/data/example10_prepared new file mode 100644 index 0000000000..7311741531 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example10_prepared @@ -0,0 +1 @@ +0wp+0ۏń䢎+2mj媸+4ȁ뮨戯+7+9ߎ+a+aնݑŭ+a꾙n+a􇁃6v+b+bwtr+c赢θb+d+dj+e+eʔdj􅊵mb+fǽkʀ+g+hq+hyb酢+i+ifԗ0ꋷ+isɞe+iւz+iפá+jn+kf+ne󷆤p7㮐ӷae+n綍ǥwl+o+oye+p+rji+rs+s+sd+sr+t+u+w+x挨sq+y+yrsao+yy+zl值񦍛+ņ+ȯ+ɇ+ɋf2ʄa+ρ+ϟα+є+ӈoį+ӡ᫉v+աx+׭vd+ل蒍+ٮ駗+ޛ+ᤁ+ᵯǘ+嫢򙫓+漞ӟ+꽂+뙚դb򺝆o++d++𰛤է+񱷝5+󿀆𥜧+ diff --git a/tests/python/tests/prepare_search_query/data/example1_prepared b/tests/python/tests/prepare_search_query/data/example1_prepared new file mode 100644 index 0000000000..1dc52ae97e --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example1_prepared @@ -0,0 +1 @@ +01293г0129+68е79+7+76кн+88+abacaba+dabacaba+test+абасаба+г+зй021к+й3+йк+йфяывчаспмиртоьлщзбдюэ+к2+к4йгщ+ккй+н8г9+ншп+рш+рщг+щ+щг+ diff --git a/tests/python/tests/prepare_search_query/data/example2 b/tests/python/tests/prepare_search_query/data/example2 new file mode 100644 index 0000000000..9bda8c35c2 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example2 @@ -0,0 +1 @@ +Y diff --git a/tests/python/tests/prepare_search_query/data/example2_prepared b/tests/python/tests/prepare_search_query/data/example2_prepared new file mode 100644 index 0000000000..469527404f --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example2_prepared @@ -0,0 +1 @@ +y+ diff 
--git a/tests/python/tests/prepare_search_query/data/example3 b/tests/python/tests/prepare_search_query/data/example3 new file mode 100644 index 0000000000..6178079822 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example3 @@ -0,0 +1 @@ +b diff --git a/tests/python/tests/prepare_search_query/data/example3_prepared b/tests/python/tests/prepare_search_query/data/example3_prepared new file mode 100644 index 0000000000..071dc66971 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example3_prepared @@ -0,0 +1 @@ +b+ diff --git a/tests/python/tests/prepare_search_query/data/example4 b/tests/python/tests/prepare_search_query/data/example4 new file mode 100644 index 0000000000..36774f9fe7 --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example4 @@ -0,0 +1 @@ +⚞žPuRZC[ diff --git a/tests/python/tests/prepare_search_query/data/example4_prepared b/tests/python/tests/prepare_search_query/data/example4_prepared new file mode 100644 index 0000000000..f31ecc781f --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example4_prepared @@ -0,0 +1 @@ +urzc+žp+ diff --git a/tests/python/tests/prepare_search_query/data/example5 b/tests/python/tests/prepare_search_query/data/example5 new file mode 100644 index 0000000000..8dd45ae465 Binary files /dev/null and b/tests/python/tests/prepare_search_query/data/example5 differ diff --git a/tests/python/tests/prepare_search_query/data/example5_prepared b/tests/python/tests/prepare_search_query/data/example5_prepared new file mode 100644 index 0000000000..2daa175e5d --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example5_prepared @@ -0,0 +1 @@ +8յ+c+і􃿊aen7+ᖦ+ diff --git a/tests/python/tests/prepare_search_query/data/example6 b/tests/python/tests/prepare_search_query/data/example6 new file mode 100644 index 0000000000..95cc2606dc --- /dev/null +++ b/tests/python/tests/prepare_search_query/data/example6 @@ -0,0 +1 @@ +׬qԻė^#xܵ칈T8y+䣳 V,ڦAڍ< $res); + echo 
json_encode($resp); +} + +main(); diff --git a/tests/python/tests/prepare_search_query/test_prepare_search_query.py b/tests/python/tests/prepare_search_query/test_prepare_search_query.py new file mode 100644 index 0000000000..8b3dd04242 --- /dev/null +++ b/tests/python/tests/prepare_search_query/test_prepare_search_query.py @@ -0,0 +1,25 @@ +import json +import os +from python.lib.testcase import WebServerAutoTestCase + +directory_path = os.path.join(os.path.dirname(__file__), "data") +prepared_suffix = "_prepared" + + +class TestPrepareSearchQuery(WebServerAutoTestCase): + def test_prepare_search_query(self): + for file in os.listdir(directory_path): + if not os.path.basename(file).endswith(prepared_suffix): + with open(os.path.join(directory_path, file), "r") as query_file: + with open(os.path.join(directory_path, file + prepared_suffix), "r") as prepared_query_file: + query = query_file.read() + expected_prepared_query = prepared_query_file.read() + if len(expected_prepared_query) > 0 and expected_prepared_query[-1] == '\n': + expected_prepared_query = expected_prepared_query[:-1] + + headers = {"Content-Type": "text/plain; charset=utf-8"} + resp = self.web_server.http_post(headers=headers, data=query.encode("utf-8")) + + self.assertEqual(resp.status_code, 200) + result = json.loads(resp.text)["POST_BODY"] + self.assertEqual(result, expected_prepared_query)