Skip to content
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
01b4486
fix preg_replace_callback function
Shamzik Dec 10, 2025
a38a0e2
replace set_matches with template<bool is_offset_capture, bool is_unm…
Shamzik Dec 12, 2025
d797a2a
fixes
Shamzik Dec 12, 2025
c9afc9f
details namespace
Shamzik Dec 12, 2025
39518a5
revert template
Shamzik Dec 17, 2025
98d44d2
format
Shamzik Dec 17, 2025
4305ae1
move all into details namespace
Shamzik Dec 19, 2025
519a553
stash refactoring
Shamzik Dec 25, 2025
bbde33b
finalize refactoring
Shamzik Dec 29, 2025
e5bce5a
namespaces
Shamzik Dec 25, 2025
6c1017b
move valid_preg_replace_mixed into details namespace
Shamzik Dec 25, 2025
7b54150
inline reserve_buffer
Shamzik Dec 29, 2025
a2a222c
kphp namespace
Shamzik Dec 25, 2025
ddc9bd8
fix valid_regex_flags
Shamzik Dec 29, 2025
a49d222
add test
Shamzik Dec 29, 2025
1b8bc30
rc -> ret_code
Shamzik Dec 29, 2025
56ea4e4
format
Shamzik Dec 29, 2025
b7c8ac8
fixes
Shamzik Jan 12, 2026
cb78255
types
Shamzik Jan 12, 2026
cdfa1bd
string replacement
Shamzik Jan 12, 2026
84b643a
fixes
Shamzik Jan 12, 2026
e95b37f
fix count in preg_replace_callback
Shamzik Jan 12, 2026
a03504d
Merge branch 'master' into kshamazov/pcre2_functions
Shamzik Jan 16, 2026
7b89ef1
make compiled_regex to be public
Shamzik Jan 19, 2026
9b73a7e
include fix
Shamzik Jan 19, 2026
2be8393
rename usings
Shamzik Jan 19, 2026
26be506
fix
Shamzik Jan 19, 2026
1a69881
make group_name_iterator to be safe
Shamzik Jan 19, 2026
97e6c7e
index bytes
Shamzik Jan 19, 2026
a9b0e2c
make m_entry_size to be const
Shamzik Jan 19, 2026
54f16e0
name -> group_names
Shamzik Jan 19, 2026
00d8e0e
collect_group_names refactoring
Shamzik Jan 19, 2026
a5710fa
remove const from raw pcre2 types
Shamzik Jan 21, 2026
1fd041f
mv RegexInstanceState definition into header
Shamzik Jan 21, 2026
35a674d
add comments
Shamzik Jan 21, 2026
ecfb15d
make replacement to be optional
Shamzik Jan 21, 2026
d65e21d
merge assertions
Shamzik Jan 21, 2026
621f35a
details::offset_pair constants
Shamzik Jan 21, 2026
81c35d0
substitute refactoring
Shamzik Jan 21, 2026
1adcbad
check buffer.data()
Shamzik Jan 21, 2026
1ec2278
remove default argument value for get_range
Shamzik Jan 21, 2026
3eecfe2
rename m_base_options to m_user_options
Shamzik Jan 21, 2026
d7e4d0b
format
Shamzik Jan 21, 2026
424aba2
std::formatter
Shamzik Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime-light/stdlib/stdlib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ prepend(
rpc/rpc-tl-request.cpp
serialization/serialization-state.cpp
server/http-functions.cpp
string/pcre2-functions.cpp
string/regex-functions.cpp
string/regex-state.cpp
string/string-state.cpp
Expand Down
148 changes: 148 additions & 0 deletions runtime-light/stdlib/string/pcre2-functions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// Compiler for PHP (aka KPHP)
// Copyright (c) 2025 LLC «V Kontakte»
// Distributed under the GPL v3 License, see LICENSE.notice.txt

#include "runtime-light/stdlib/string/pcre2-functions.h"

namespace kphp::pcre2 {

namespace {

// Advances `offset` past any UTF-8 continuation bytes found in `subject` starting at `offset`
// and returns the resulting position. An `offset` at or beyond the end of `subject` is returned unchanged.
int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept {
  // Every non-leading byte of a multibyte UTF-8 rune looks like 0b10xxxxxx:
  // masking with 0xc0 keeps the two top bits, which then must be equal to 0x80.
  const auto is_continuation_byte{[](char byte) noexcept { return (static_cast<unsigned char>(byte) & 0xc0) == 0x80; }};

  for (; offset < subject.size() && is_continuation_byte(subject[offset]); ++offset) {
  }
  return offset;
}

} // namespace

// Compiles `pattern` with the given PCRE2 compile `options` and optional compile context `ctx`.
//
// Returns the compiled regex on success. On failure returns a compile_error that carries
// the PCRE2 error code together with the offset reported by pcre2_compile_8.
std::expected<regex, compile_error> regex::compile(std::string_view pattern, uint32_t options, pcre2_compile_context_8* ctx) noexcept {
  int32_t errorcode{};
  PCRE2_SIZE erroroffset{};

  // The 8-bit pointer alias is spelled out explicitly (as everywhere else in this file),
  // so the cast does not depend on the value of PCRE2_CODE_UNIT_WIDTH.
  pcre2_code_8* re{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR8>(pattern.data()), pattern.length(), options, std::addressof(errorcode),
                                   std::addressof(erroroffset), ctx)};

  if (re == nullptr) {
    return std::unexpected{compile_error{{.code = errorcode}, erroroffset}};
  }
  // NOTE(review): presumably regex takes ownership of the compiled code here -- confirm against the regex constructor in the header
  return regex{*re};
}

// Returns the range of named-group entries of the compiled pattern.
//
// The range is empty (both iterators built from nullptr) when the pattern has no named groups
// or when any of the pcre2_pattern_info_8 queries fails. Otherwise it spans `count` entries of
// `entry_size` bytes each, starting at the name table reported by PCRE2.
regex::group_name_range regex::names() const noexcept {
  uint32_t count{};
  uint32_t entry_size{};
  PCRE2_SPTR8 table{};

  // pcre2_pattern_info_8 yields zero on success. Short-circuit evaluation keeps the original query order
  // and skips the entry size / table queries when there are no names at all.
  // A null table is rejected explicitly: offsetting a null pointer below would be undefined behavior.
  const bool has_names{pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMECOUNT, std::addressof(count)) == 0 && count != 0 &&
                       pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMEENTRYSIZE, std::addressof(entry_size)) == 0 &&
                       pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMETABLE, std::addressof(table)) == 0 && table != nullptr};

  if (!has_names) {
    return {.b = group_name_iterator{nullptr, 0}, .e = group_name_iterator{nullptr, 0}};
  }

  return {.b = group_name_iterator{table, entry_size}, .e = group_name_iterator{std::next(table, static_cast<size_t>(count) * entry_size), entry_size}};
}

// Runs pcre2_substitute_8 for `subject` with `replacement`, writing the output into `buffer`.
//
// `buffer_len` is passed to PCRE2 by address, so it is both an input (buffer capacity) and an output.
// `match_options` are OR-ed into the fixed set of substitute options below.
// Returns the non-negative value produced by pcre2_substitute_8, or an error holding the negative PCRE2 code.
std::expected<size_t, error> regex::substitute_match(std::string_view subject, pcre2_match_data_8& data, std::string_view replacement, char* buffer,
                                                     size_t& buffer_len, uint32_t match_options, pcre2_match_context_8* ctx) const noexcept {
  // Fixed part of the options: see the pcre2_substitute documentation for the meaning of each flag.
  const uint32_t options{match_options | PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY | PCRE2_SUBSTITUTE_MATCHED |
                         PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_REPLACEMENT_ONLY};

  PCRE2_SPTR8 subject_units{reinterpret_cast<PCRE2_SPTR8>(subject.data())};
  PCRE2_SPTR8 replacement_units{reinterpret_cast<PCRE2_SPTR8>(replacement.data())};
  PCRE2_UCHAR8* output_units{reinterpret_cast<PCRE2_UCHAR8*>(buffer)};

  const auto status{pcre2_substitute_8(m_code.get(), subject_units, subject.length(), 0, options, std::addressof(data), ctx, replacement_units,
                                       replacement.length(), output_units, std::addressof(buffer_len))};

  if (status >= 0) {
    return static_cast<size_t>(status);
  }
  return std::unexpected<error>{{.code = status}};
}

std::optional<match_view::offset_range> match_view::get_range(size_t i) const noexcept {
if (i >= m_num_groups) {
return std::nullopt;
}

kphp::log::assertion(m_ovector_ptr);
// ovector is an array of offset pairs
PCRE2_SIZE start{m_ovector_ptr[2 * i]};
PCRE2_SIZE end{m_ovector_ptr[(2 * i) + 1]};

if (start == PCRE2_UNSET) {
return std::nullopt;
}
return offset_range{.start = start, .end = end};
}

// Sets up a matcher that searches `subject` with `re`, starting at byte offset `match_from`.
// `ctx`, `data` and `options` are stored and handed to pcre2_match_8 on every call of next().
// NOTE(review): whether `re` and `data` are held by reference or copied depends on the member
// declarations in the header -- confirm the lifetime requirements there.
matcher::matcher(const regex& re, std::string_view subject, size_t match_from, pcre2_match_context_8* ctx, pcre2_match_data_8& data, uint32_t options) noexcept
: m_re{re},
m_subject{subject},
m_ctx{ctx},
// the first match attempt starts here; next() moves this offset forward after every match
m_current_offset{match_from},
m_match_data{data},
// caller-supplied options, combined with the per-attempt options in next()
m_base_options{options},
// cached once so that next() can decide whether to skip UTF-8 continuation bytes
m_is_utf{re.is_utf()} {}

// Finds the next match in the subject, starting at the current offset.
//
// Returns:
//  * a match_view for the found match;
//  * std::nullopt when there are no more matches;
//  * an error holding the negative PCRE2 code when pcre2_match_8 fails for any reason other than "no match".
//
// After an empty match the following attempt is made at the same position with
// PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; if that attempt fails, the search resumes one unit
// further (one whole rune in UTF mode) with those flags cleared.
std::expected<std::optional<match_view>, error> matcher::next() noexcept {
  while (m_current_offset <= m_subject.length()) {
    const uint32_t attempt_options{m_base_options | m_match_options};
    PCRE2_SPTR8 subject_units{reinterpret_cast<PCRE2_SPTR8>(m_subject.data())};

    const auto status{
        pcre2_match_8(m_re.m_code.get(), subject_units, m_subject.length(), m_current_offset, attempt_options, std::addressof(m_match_data), m_ctx)};

    if (status == PCRE2_ERROR_NOMATCH) {
      if (m_match_options == 0) {
        // An ordinary attempt failed: nothing more to find
        return std::nullopt;
      }
      // The anchored non-empty retry failed: step over one unit and search again without the extra flags
      m_match_options = 0;
      ++m_current_offset;
      if (m_is_utf) {
        m_current_offset = skip_utf8_subsequent_bytes(m_current_offset, m_subject);
      }
      continue;
    }

    // From https://www.pcre.org/current/doc/html/pcre2_match.html
    // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set
    // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors.
    if (status < 0) [[unlikely]] {
      return std::unexpected{error{.code = status}};
    }

    m_last_success_options = attempt_options;

    // Zero means the ovector was too small, so every pair it can hold is considered filled
    const size_t groups_count{status == 0 ? static_cast<size_t>(pcre2_get_ovector_count_8(std::addressof(m_match_data))) : static_cast<size_t>(status)};

    const PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(std::addressof(m_match_data))};
    const size_t match_begin{ovector[0]};
    const size_t match_end{ovector[1]};

    // An empty match makes the next attempt look for a non-empty match at the very same position
    m_match_options = (match_begin == match_end) ? (PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED) : 0;
    m_current_offset = match_end;

    return match_view{m_subject, ovector, groups_count};
  }

  return std::nullopt;
}

} // namespace kphp::pcre2
Loading
Loading