Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions builtin-functions/kphp-light/stdlib/server-functions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ function setlocale ($category ::: int, $locale ::: string) ::: string | false;

function memory_get_detailed_stats() ::: int[];

function prepare_search_query ($query ::: string) ::: string;

// ===== UNSUPPORTED =====

/** @kphp-extern-func-info stub */
Expand Down Expand Up @@ -130,7 +132,3 @@ function flush() ::: void;
define('PHP_QUERY_RFC1738', 1);
define('PHP_QUERY_RFC3986', 2);


/** @kphp-extern-func-info stub generation-required */
function prepare_search_query ($query ::: string) ::: string;

9 changes: 4 additions & 5 deletions common/unicode/unicode-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,10 @@ int prepare_search_string(int* input) {
return output - input;
}

#define MAX_NAME_SIZE 65536
static char prep_buf[4 * MAX_NAME_SIZE + 4];
int prep_ibuf[MAX_NAME_SIZE + 4];
static int prep_ibuf_res[MAX_NAME_SIZE + 4];
static int* words_ibuf[MAX_NAME_SIZE + 4];
static char prep_buf[MAX_NAME_BYTES_SIZE];
int prep_ibuf[MAX_NAME_CODE_POINTS_SIZE];
static int prep_ibuf_res[MAX_NAME_CODE_POINTS_SIZE];
static int* words_ibuf[MAX_NAME_CODE_POINTS_SIZE];

int stricmp_void(const void* x, const void* y) {
const int* s1 = *(const int**)x;
Expand Down
6 changes: 6 additions & 0 deletions common/unicode/unicode-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

#pragma once

#include <cstddef>

// Base limit from which the scratch-buffer capacities below are derived.
inline constexpr size_t MAX_NAME_SIZE = 65536;
// Capacity, in bytes, of a byte buffer: four bytes per unit of MAX_NAME_SIZE plus 4 spare bytes.
// NOTE(review): the factor 4 presumably matches the longest UTF-8 encoding of one code point — confirm.
inline constexpr size_t MAX_NAME_BYTES_SIZE = 4 * MAX_NAME_SIZE + 4;
// Capacity, in elements, of a code-point buffer: MAX_NAME_SIZE plus 4 spare elements.
inline constexpr size_t MAX_NAME_CODE_POINTS_SIZE = MAX_NAME_SIZE + 4;

int unicode_toupper(int code);
int unicode_tolower(int code);
const char* clean_str(const char* x);
326 changes: 164 additions & 162 deletions common/unicode/utf8-utils.cpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions runtime-light/stdlib/stdlib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ prepend(
string/regex-functions.cpp
string/regex-state.cpp
string/string-state.cpp
string/string-functions.cpp
system/system-functions.cpp
system/system-state.cpp
time/time-functions.cpp
Expand Down
83 changes: 83 additions & 0 deletions runtime-light/stdlib/string/string-functions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Compiler for PHP (aka KPHP)
// Copyright (c) 2025 LLC «V Kontakte»
// Distributed under the GPL v3 License, see LICENSE.notice.txt

#include "runtime-light/stdlib/string/string-functions.h"

#include <cstddef>
#include <cstdint>
#include <span>

#include "auto/common/unicode-utils-auto.h"
#include "runtime-light/k2-platform/k2-api.h"

namespace string_functions_impl_ {

/* Looks `code` up in the generated `prepare_table_ranges` table.
   The table is read as consecutive (key, value) pairs: keys at even indices, values at odd ones.
   The pair with the greatest key that does not exceed `code` is selected and its value decides the result:
     - negative value         -> code - key + ~value
     - value <= 0x10ffff      -> the value itself
     - 0x200000 / +1 / +2     -> code with its lowest bit cleared / set / ((code - 1) | 1)
     - anything else          -> k2::exit(1)
   Codes above MAX_UTF8_CODE_POINT are mapped to 0. */
int32_t binary_search_ranges(int32_t code) noexcept {
  if (code > MAX_UTF8_CODE_POINT) {
    return 0;
  }

  size_t lo{0};
  size_t hi{prepare_table_ranges_size};
  while (lo < hi) {
    // rounding keeps the probe on an even index, i.e. always on a key
    const size_t mid{((lo + hi + 2) >> 2) << 1};
    if (code < prepare_table_ranges[mid]) {
      hi = mid - 2;
    } else {
      lo = mid;
    }
  }

  // prepare_table_ranges[lo]     - key
  // prepare_table_ranges[lo + 1] - value
  const int32_t value{prepare_table_ranges[lo + 1]};
  if (value < 0) {
    return code - prepare_table_ranges[lo] + (~value);
  }
  if (value <= 0x10ffff) {
    return value;
  }

  // values above the code point range encode an operation to apply to `code`
  const int32_t operation{value - 0x200000};
  if (operation == 0) {
    return (code & -2);
  }
  if (operation == 1) {
    return (code | 1);
  }
  if (operation == 2) {
    return ((code - 1) | 1);
  }
  k2::exit(1);
}

/* Rewrites a 0-terminated sequence of code points in place for searching.
   Every code point is mapped through `prepare_table` (small codes) or
   `binary_search_ranges` (everything else); code points mapped to 0 are dropped.
   Whitespace is never kept at the start, at the end, or twice in a row.
   On return `code_points` is narrowed to the prepared text and a 0 terminator
   is stored in the element right after it. Nothing is returned. */
void prepare_search_string(std::span<int32_t>& code_points) noexcept {
  size_t written{0};
  for (size_t read{0}; code_points[read] != 0; ++read) {
    const int32_t source{code_points[read]};
    // the unsigned comparison also sends negative values to the range search
    const bool in_table{static_cast<size_t>(source) < static_cast<size_t>(TABLE_SIZE)};
    const int32_t mapped{in_table ? static_cast<int32_t>(prepare_table[source]) : binary_search_ranges(source)};
    if (mapped == 0) {
      continue;
    }
    // a whitespace is redundant when it would open the text or follow another whitespace
    const bool redundant_whitespace{mapped == WHITESPACE && (written == 0 || code_points[written - 1] == WHITESPACE)};
    if (!redundant_whitespace) {
      code_points[written] = mapped;
      ++written;
    }
  }

  // a single trailing whitespace may remain; cut it off
  if (written > 0 && code_points[written - 1] == WHITESPACE) {
    --written;
  }
  code_points[written] = 0;
  code_points = code_points.first(written);
}

} // namespace string_functions_impl_
159 changes: 159 additions & 0 deletions runtime-light/stdlib/string/string-functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,169 @@

#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <span>

#include "common/unicode/unicode-utils.h"
#include "common/unicode/utf8-utils.h"
#include "runtime-common/core/runtime-core.h"
#include "runtime-common/stdlib/string/string-context.h"
#include "runtime-light/k2-platform/k2-api.h"
#include "runtime-light/stdlib/diagnostics/logs.h"

namespace string_functions_impl_ {

// Layout of the scratch areas carved out of StringLibContext's static buffer.
// The four areas are placed back to back in this order:
//   source code points | word start indices | result code points | result UTF-8 bytes
// NOTE: the size constants deliberately carry no leading "__": identifiers containing
// a double underscore are reserved to the C++ implementation.
inline constexpr size_t SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE;
inline constexpr size_t WORD_INDICES_SPAN_SIZE_IN_BYTES = sizeof(size_t) * MAX_NAME_CODE_POINTS_SIZE;
inline constexpr size_t RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES = sizeof(int32_t) * MAX_NAME_CODE_POINTS_SIZE;
inline constexpr size_t RESULT_BYTES_SPAN_SIZE_IN_BYTES = sizeof(std::byte) * MAX_NAME_BYTES_SIZE;

// all four areas together must fit into the static buffer
static_assert(SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES + WORD_INDICES_SPAN_SIZE_IN_BYTES + RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES +
                  RESULT_BYTES_SPAN_SIZE_IN_BYTES <
              StringLibContext::STATIC_BUFFER_LENGTH);

// byte offsets of each area from the beginning of the static buffer
inline constexpr size_t SOURCE_CODE_POINTS_SPAN_BEGIN = 0;
inline constexpr size_t WORD_INDICES_SPAN_BEGIN = SOURCE_CODE_POINTS_SPAN_BEGIN + SOURCE_CODE_POINTS_SPAN_SIZE_IN_BYTES;
inline constexpr size_t RESULT_CODE_POINTS_SPAN_BEGIN = WORD_INDICES_SPAN_BEGIN + WORD_INDICES_SPAN_SIZE_IN_BYTES;
inline constexpr size_t RESULT_BYTES_SPAN_BEGIN = RESULT_CODE_POINTS_SPAN_BEGIN + RESULT_CODE_POINTS_SPAN_SIZE_IN_BYTES;

// codes above this value are mapped to 0 by binary_search_ranges()
inline constexpr int32_t MAX_UTF8_CODE_POINT{0x10ffff};

// word separator inside prepared text
inline constexpr int32_t WHITESPACE{static_cast<int32_t>(' ')};
// terminator appended to every word of the final result
inline constexpr int32_t PLUS{static_cast<int32_t>('+')};

/* Search generated ranges for specified character.
   Returns the code point the generated range table maps `code` to,
   or 0 when `code` is greater than MAX_UTF8_CODE_POINT. */
int32_t binary_search_ranges(int32_t code) noexcept;

/* Prepares unicode 0-terminated string input for search,
leaving only digits and letters with diacritics.
Length of string can decrease.
The text is rewritten in place: on return `code_points` is narrowed to the
prepared text and a 0 terminator is stored in the element right after it.
Nothing is returned. */
void prepare_search_string(std::span<int32_t>& code_points) noexcept;

/* Turns a 0-terminated sequence of code points into a sorted list of unique words.
   Steps:
     1. the text is normalized in place by prepare_search_string();
     2. the start index of every whitespace-separated word is collected;
     3. the words are sorted and duplicates are dropped;
     4. the unique words are written out, each one followed by '+', then a 0 terminator.
   `code_points` must be a view over the whole writable buffer holding the text:
   the buffer is modified and one element past the prepared text is used for a sentinel.
   The returned span (its last element is the 0 terminator) points into the result area
   of StringLibContext's static buffer. The word index area of that buffer is used as scratch. */
inline std::span<int32_t> prepare_str_unicode(std::span<int32_t> code_points) noexcept {
  // prepare_search_string() narrows `code_points` to the prepared text, while the sentinel below
  // lives one element past that text; keep a view over the whole buffer so that every access
  // stays inside the bounds of the span it goes through
  const std::span<int32_t> buffer{code_points};
  prepare_search_string(code_points);
  const size_t length{code_points.size()};
  kphp::log::assertion(length < buffer.size());
  // replace the 0 terminator with a whitespace: every word, including the last one, now ends with WHITESPACE
  buffer[length] = WHITESPACE;

  auto& string_lib_ctx{StringLibContext::get()};
  auto* word_indices_begin{reinterpret_cast<size_t*>(std::next(string_lib_ctx.static_buf.get(), WORD_INDICES_SPAN_BEGIN))};
  // indices of first char of every word in `buffer`.
  std::span<size_t> word_start_indices{word_indices_begin, MAX_NAME_CODE_POINTS_SIZE};
  size_t words_count{};
  size_t i{};
  // looking for the beginnings of the words
  while (i < length) {
    word_start_indices[words_count++] = i;
    while (i < length && buffer[i] != WHITESPACE) {
      ++i;
    }
    // step over the separator
    ++i;
  }
  word_start_indices = word_start_indices.first(words_count);

  // strict "less" on two words given by their start indices; a word that is a prefix of another one goes first
  auto word_less_cmp{[buffer](size_t x, size_t y) noexcept -> bool {
    while (buffer[x] != WHITESPACE && buffer[x] == buffer[y]) {
      ++x;
      ++y;
    }
    if (buffer[x] == WHITESPACE) {
      return buffer[y] != WHITESPACE;
    }
    if (buffer[y] == WHITESPACE) {
      return false;
    }
    return buffer[x] < buffer[y];
  }};

  std::sort(word_start_indices.begin(), word_start_indices.end(), word_less_cmp);

  size_t uniq_words_count{};
  for (i = 0; i < words_count; ++i) {
    // drop duplicates: after sorting, a word differs from the previous kept one only if it is greater
    if (uniq_words_count == 0 || word_less_cmp(word_start_indices[uniq_words_count - 1], word_start_indices[i])) {
      word_start_indices[uniq_words_count++] = word_start_indices[i];
    } else {
      word_start_indices[uniq_words_count - 1] = word_start_indices[i];
    }
  }

  auto* result_begin{reinterpret_cast<int32_t*>(std::next(string_lib_ctx.static_buf.get(), RESULT_CODE_POINTS_SPAN_BEGIN))};
  std::span<int32_t> result{result_begin, MAX_NAME_CODE_POINTS_SIZE};
  size_t result_size{};
  // output words with '+' separator
  for (i = 0; i < uniq_words_count; ++i) {
    size_t ind{word_start_indices[i]};
    while (buffer[ind] != WHITESPACE) {
      result[result_size++] = buffer[ind++];
    }
    result[result_size++] = PLUS;
  }
  result[result_size++] = 0;

  kphp::log::assertion(result_size < MAX_NAME_SIZE);
  return result.first(result_size);
}

/* Builds the final search string from a 0-terminated sequence of code points.
   The code points are turned into a sorted list of unique '+'-terminated words by
   prepare_str_unicode(), encoded by put_string_utf8(), and then every word found
   in the skip list below is removed.
   Returns a view of exactly the remaining bytes; a 0 byte is stored right after them
   but is not part of the view. The view points into the result bytes area of
   StringLibContext's static buffer. */
inline std::span<const std::byte> clean_str_unicode(std::span<int32_t> source_code_points) noexcept {
  std::span<int32_t> prepared_code_points{prepare_str_unicode(source_code_points)};

  auto& string_lib_ctx{StringLibContext::get()};
  auto* utf8_result_begin{reinterpret_cast<std::byte*>(std::next(string_lib_ctx.static_buf.get(), RESULT_BYTES_SPAN_BEGIN))};
  // view over the whole area: the terminator written at the end must stay inside the span bounds
  const std::span<std::byte> utf8_buffer{utf8_result_begin, MAX_NAME_BYTES_SIZE};
  const auto length{static_cast<size_t>(put_string_utf8(prepared_code_points.data(), reinterpret_cast<char*>(utf8_buffer.data())))};
  kphp::log::assertion(length < utf8_buffer.size());

  size_t i{};
  size_t result_size{};
  // every iteration starts at the first byte of a word and consumes it together with its '+'
  while (i < length) {
    const char* c{reinterpret_cast<const char*>(std::addressof(utf8_buffer[i]))};
    // Words to throw away: a fixed list of short words and numbers, plus any four-digit
    // number of the form 19dd or 20dd.
    // NOTE(review): "amp", "gt", "lt", "quot" and the numeric items look like leftovers
    // of HTML entities — confirm.
    const bool skip{!std::strncmp(c, "amp+", 4) || !std::strncmp(c, "gt+", 3) || !std::strncmp(c, "lt+", 3) || !std::strncmp(c, "quot+", 5) ||
                    !std::strncmp(c, "ft+", 3) || !std::strncmp(c, "feat+", 5) ||
                    (((c[0] == '1' && c[1] == '9') || (c[0] == '2' && c[1] == '0')) && ('0' <= c[2] && c[2] <= '9') && ('0' <= c[3] && c[3] <= '9') &&
                     c[4] == '+') ||
                    !std::strncmp(c, "092+", 4) || !std::strncmp(c, "33+", 3) || !std::strncmp(c, "34+", 3) || !std::strncmp(c, "36+", 3) ||
                    !std::strncmp(c, "39+", 3) || !std::strncmp(c, "60+", 3) || !std::strncmp(c, "62+", 3) || !std::strncmp(c, "8232+", 5) ||
                    !std::strncmp(c, "8233+", 5)};
    do {
      if (!skip) {
        // compact in place: the write position never gets ahead of the read position
        utf8_buffer[result_size] = utf8_buffer[i];
        ++result_size;
      }
    } while (utf8_buffer[i++] != static_cast<std::byte>('+'));
  }
  utf8_buffer[result_size] = static_cast<std::byte>(0);

  // hand out only the bytes that survived the filtering, not the pre-filter length
  return utf8_buffer.first(result_size);
}

/* Entry point of the search query preparation working on raw bytes.
   An empty input, or one of MAX_NAME_SIZE bytes or more, is returned untouched.
   Otherwise the input is converted to code points by html_string_to_utf8() into the
   source area of StringLibContext's static buffer and passed on to clean_str_unicode(),
   whose result is returned.
   NOTE(review): `x.data()` is handed over as a plain `const char*` without a length,
   so the bytes are presumably expected to be 0-terminated — confirm against callers. */
inline std::span<const std::byte> prepare_search_query_impl(std::span<const std::byte> x) noexcept {
  const bool processable{!x.empty() && x.size() < MAX_NAME_SIZE};
  if (!processable) {
    return x;
  }

  auto* const scratch_begin{std::next(StringLibContext::get().static_buf.get(), SOURCE_CODE_POINTS_SPAN_BEGIN)};
  std::span<int32_t> decoded_code_points{reinterpret_cast<int32_t*>(scratch_begin), MAX_NAME_CODE_POINTS_SIZE};

  const auto* raw_query{reinterpret_cast<const char*>(x.data())};
  html_string_to_utf8(raw_query, decoded_code_points.data());
  return clean_str_unicode(decoded_code_points);
}

} // namespace string_functions_impl_

/* PHP-visible prepare_search_query(): passes the bytes of `query` to
   string_functions_impl_::prepare_search_query_impl() and copies the bytes
   it returns into a new string. */
inline string f$prepare_search_query(const string& query) noexcept {
  const auto* query_bytes{reinterpret_cast<const std::byte*>(query.c_str())};
  const std::span<const std::byte> input{query_bytes, static_cast<size_t>(query.size())};
  const std::span<const std::byte> prepared{string_functions_impl_::prepare_search_query_impl(input)};
  return {reinterpret_cast<const char*>(prepared.data()), static_cast<string::size_type>(prepared.size())};
}

inline Optional<string> f$setlocale(int64_t category, const string& locale) noexcept {
const int32_t i32category{static_cast<int32_t>(category)};
Expand Down
Empty file.
17 changes: 17 additions & 0 deletions tests/python/tests/prepare_search_query/data/example1
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
abacaba dAbAcAbA АбАсАБа йфяывчАСПМИРТОЬЛЩЗБДЮЭ.
К4ЙГЩ ЩГ рщг №кКЙ РШ зй021к 01293г0129 г

++_+ +_ +_ +__ ++_ Щ+!"_ №+!_" №+!"_ №+_ "Щ+_ "Щ

йк й3 к2



7 88 76кн 68е79 н8г9 ншп

test test

test test
test
test
test test TeSt tEsT
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0wp+0ۏń䢎+2mj媸+4ȁ뮨戯+7+9ߎ+a+aնݑŭ+a꾙n+a􇁃6v+b+bwtr+c赢θb+d+dj+e+eʔdj􅊵mb+fǽkʀ+g+hq+hyb酢+i+ifԗ0ꋷ+isɞe+iւz+iפá+jn+kf+ne󷆤p7㮐ӷae+n綍ǥwl+o+oye+p+rji+rs+s+sd+sr+t+u+w+x挨sq+y+yrsao+yy+zl值񦍛+ņ+ȯ+ɇ+ɋf2ʄa+ρ+ϟα+є+ӈoį+ӡ᫉v+աx+׭vd+ل蒍+ٮ駗+ޛ+ᤁ+ᵯǘ+嫢򙫓+漞ӟ+꽂+뙚դb򺝆o++d++𰛤է+񱷝5+󿀆𥜧+
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
01293г0129+68е79+7+76кн+88+abacaba+dabacaba+test+абасаба+г+зй021к+й3+йк+йфяывчаспмиртоьлщзбдюэ+к2+к4йгщ+ккй+н8г9+ншп+рш+рщг+щ+щг+
1 change: 1 addition & 0 deletions tests/python/tests/prepare_search_query/data/example2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Y
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
y+
1 change: 1 addition & 0 deletions tests/python/tests/prepare_search_query/data/example3
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
b
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
b+
1 change: 1 addition & 0 deletions tests/python/tests/prepare_search_query/data/example4
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
⚞žPuRZC[
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
urzc+žp+
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
8յ+c+і􃿊aen7+ᖦ+
1 change: 1 addition & 0 deletions tests/python/tests/prepare_search_query/data/example6
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
׬qԻė^#xܵ칈T8y+䣳 V,ڦAڍ<<u{ZӉ᱀I
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
u+v+x칈t8y+zӊ᱀i+׬qիė+ڦaڍ+䣳+
1 change: 1 addition & 0 deletions tests/python/tests/prepare_search_query/data/example7
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Y.j3ư; XD]ǩힵa7MBAe㭑tjri ]sɀnbo䪿Nζ̗ɾR8kà}掄≜XC\n묭[r˂ꏬKSD 䋼f;ֱ6'<'Ǔ$☜6UӗRSfQD§.w
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
6+6u+a+d+e㭑+j3+ksd+nζɾr8k+n묭+o+r+sɀnb+tjri+w+x+xc+y+à+ư+ǔ+ǩힵa7mb+ӗrsfqd+䋼f+䪿+掄+ꏬ++
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
exk+n2+բq+ֆо+즕tŏ+
2 changes: 2 additions & 0 deletions tests/python/tests/prepare_search_query/data/example9
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
l~'<Oɩ{$լdv~X
󝓌Ր<n諪jψ۠𒅓uؠOH@ß3lz鷣e`Ĝ 0ؤdGk駺
Vԯ*#C!AwO𳬷JU,ݟJ(.^'1ٸ⻐ODDz2˫;-ʁ퇊ڴv榉ʰκ'ǒʹ5)(캄
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0ؤdgk駺+1ٸ+awo𳬷ju+c+j+l+n諪jψ𒅓+oddz2+oɩ+uؠoh+vԯ+x+ß3lz鷣e+ĝ+ǒʹ5+ʁ퇊ڴv榉ʰκ+լdv+ݟ+캄+󝓌ր+
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
entry: script
components:
script:
image: KPHP
scope: Request
links: {}
10 changes: 10 additions & 0 deletions tests/python/tests/prepare_search_query/php/index.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?php

/**
 * Reads the raw request body, runs it through prepare_search_query()
 * and prints the result as JSON under the "POST_BODY" key.
 */
function main() {
  $request_body = file_get_contents('php://input');
  $prepared_query = prepare_search_query($request_body);
  echo json_encode(["POST_BODY" => $prepared_query]);
}

main();
Loading
Loading