diff --git a/runtime-light/coroutine/await-set.h b/runtime-light/coroutine/await-set.h index 3d3bdd76f7..9604097e28 100644 --- a/runtime-light/coroutine/await-set.h +++ b/runtime-light/coroutine/await-set.h @@ -48,7 +48,7 @@ class await_set { } auto next() noexcept { - return detail::await_set::await_set_awaitable{*m_await_broker.get()}; + return detail::await_set::await_set_awaitable{*m_await_broker}; } bool empty() const noexcept { diff --git a/runtime-light/stdlib/string/pcre2-functions.h b/runtime-light/stdlib/string/pcre2-functions.h new file mode 100644 index 0000000000..36a8f6bcf3 --- /dev/null +++ b/runtime-light/stdlib/string/pcre2-functions.h @@ -0,0 +1,365 @@ +// Compiler for PHP (aka KPHP) +// Copyright (c) 2025 LLC «V Kontakte» +// Distributed under the GPL v3 License, see LICENSE.notice.txt + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "runtime-light/stdlib/diagnostics/logs.h" +#include "runtime-light/stdlib/string/regex-include.h" + +namespace kphp::pcre2 { + +namespace details { + +inline int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept { + // all multibyte utf8 runes consist of subsequent bytes, + // these subsequent bytes start with 10 bit pattern + // 0xc0 selects the two most significant bits, then we compare it to 0x80 (0b10000000) + while (offset < subject.size() && ((static_cast(subject[offset])) & 0xc0) == 0x80) { + offset++; + } + return offset; +} + +} // namespace details + +using regex_general_context_t = std::unique_ptr; +using regex_compile_context_t = std::unique_ptr; +using regex_match_context_t = std::unique_ptr; +using regex_match_data_t = std::unique_ptr; +using regex_code_t = std::unique_ptr; + +struct error { + int32_t code{}; +}; + +struct compile_error : kphp::pcre2::error { + size_t offset{}; +}; + +struct group_name { + std::string_view name; + size_t index{}; +}; + +class group_name_iterator { + const PCRE2_UCHAR8* m_ptr{nullptr}; + size_t m_entry_size{}; + +public: + using iterator_category = std::forward_iterator_tag; + using value_type = kphp::pcre2::group_name; + using difference_type = std::ptrdiff_t; + using pointer = kphp::pcre2::group_name*; + using reference = kphp::pcre2::group_name; + + group_name_iterator() = delete; + group_name_iterator(const PCRE2_UCHAR8* current_entry, size_t entry_size) noexcept + : m_ptr{current_entry}, + m_entry_size{entry_size} {} + + kphp::pcre2::group_name operator*() const noexcept { + enum class index_bytes { upper, lower, count }; + + const auto index{static_cast(m_ptr[static_cast(index_bytes::upper)] << 8 | m_ptr[static_cast(index_bytes::lower)])}; + const auto* name_ptr{reinterpret_cast(std::next(m_ptr, static_cast(index_bytes::count)))}; + return {.name = std::string_view{name_ptr}, .index = index}; + } + + group_name_iterator& operator++() noexcept { + std::advance(m_ptr, m_entry_size); + return *this; + } + + group_name_iterator operator++(int) noexcept { // NOLINT + group_name_iterator tmp{*this}; + ++*this; + return tmp; + } + + bool operator==(const group_name_iterator& other) const noexcept { + return m_ptr == other.m_ptr; + } +}; + +class regex { + kphp::pcre2::regex_code_t m_code; + +public: + friend class match_view; + friend class matcher; + + static std::expected compile(std::string_view pattern, const kphp::pcre2::regex_compile_context_t& ctx, + uint32_t options = 0) noexcept { + int32_t errorcode{}; + PCRE2_SIZE erroroffset{}; + + kphp::pcre2::regex_code_t re{pcre2_compile_8(reinterpret_cast(pattern.data()), pattern.length(), options, std::addressof(errorcode), + std::addressof(erroroffset), ctx.get()), + pcre2_code_free_8}; + + if (!re) { + return std::unexpected{kphp::pcre2::compile_error{{.code = errorcode}, erroroffset}}; + } + return kphp::pcre2::regex{std::move(re)}; + } + + struct group_name_range { + kphp::pcre2::group_name_iterator b; + kphp::pcre2::group_name_iterator e; + + kphp::pcre2::group_name_iterator begin() const noexcept { + return b; + } + kphp::pcre2::group_name_iterator end() const noexcept { + return e; + } + + bool empty() const noexcept { + return b == e; + } + }; + + group_name_range names() const noexcept { + uint32_t count{}; + uint32_t entry_size{}; + PCRE2_SPTR8 table{}; + + kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMECOUNT, std::addressof(count)) == 0); + + if (count == 0) { + return {.b = group_name_iterator{nullptr, 0}, .e = group_name_iterator{nullptr, 0}}; + } + + kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMEENTRYSIZE, std::addressof(entry_size)) == 0); + kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMETABLE, std::addressof(table)) == 0); + + return {.b = group_name_iterator{table, entry_size}, .e = group_name_iterator{std::next(table, static_cast(count) * entry_size), entry_size}}; + } + + uint32_t capture_count() const noexcept { + uint32_t count{}; + kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_CAPTURECOUNT, std::addressof(count)) == 0); + return count; + } + + uint32_t name_count() const noexcept { + uint32_t count{}; + kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMECOUNT, std::addressof(count)) == 0); + return count; + } + + bool is_utf() const noexcept { + uint32_t compile_options{}; + kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_ARGOPTIONS, std::addressof(compile_options)) == 0); + return (compile_options & PCRE2_UTF) != 0; + } + +private: + explicit regex(kphp::pcre2::regex_code_t&& code) noexcept + : m_code{std::move(code)} {} +}; + +class match_view { + const kphp::pcre2::regex& m_re; + std::string_view m_subject; + const kphp::pcre2::regex_match_data_t& m_match_data; + uint32_t m_match_options{}; + size_t m_num_groups{}; + +public: + match_view(const regex& re, std::string_view subject, const kphp::pcre2::regex_match_data_t& match_data, uint32_t match_options, size_t num_groups) noexcept + : m_re{re}, + m_subject{subject}, + m_match_data{match_data}, + m_match_options{match_options}, + m_num_groups{num_groups} {} + + int32_t size() const noexcept { + return m_num_groups; + } + + struct offset_range { + size_t start{}; + size_t end{}; + }; + + std::optional get_group(size_t i) const noexcept { + if (auto range{get_range(i)}; range.has_value()) { + return m_subject.substr(range->start, range->end - range->start); + } + return std::nullopt; + } + + struct group_content { + std::string_view text; + size_t offset{}; + }; + + std::optional get_group_content(size_t i) const noexcept { + if (auto range{get_range(i)}; range.has_value()) { + return group_content{.text = m_subject.substr(range->start, range->end - range->start), .offset = range->start}; + } + return std::nullopt; + } + + size_t match_start() const noexcept { + return pcre2_get_ovector_pointer_8(m_match_data.get())[0]; + } + size_t match_end() const noexcept { + return pcre2_get_ovector_pointer_8(m_match_data.get())[1]; + } + + /** + * @param buffer_len Input: capacity of buffer. Output: actual length of result. + * @return expected: The number of replacements (should be 1). + */ + std::expected substitute(std::string_view replacement, char* buffer, size_t& buffer_len, + const kphp::pcre2::regex_match_context_t& ctx) const noexcept { + uint32_t substitute_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY | PCRE2_SUBSTITUTE_MATCHED | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | + PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | m_match_options}; + + auto ret_code{pcre2_substitute_8(m_re.m_code.get(), reinterpret_cast(m_subject.data()), m_subject.length(), 0, substitute_options, + m_match_data.get(), ctx.get(), reinterpret_cast(replacement.data()), replacement.length(), + reinterpret_cast(buffer), std::addressof(buffer_len))}; + + if (ret_code < 0) { + return std::unexpected{{.code = ret_code}}; + } + + return static_cast(ret_code); + } + +private: + std::optional get_range(size_t i = 0) const noexcept { + if (i >= m_num_groups) { + return std::nullopt; + } + + const auto* ovector_ptr{pcre2_get_ovector_pointer_8(m_match_data.get())}; + // ovector is an array of offset pairs + PCRE2_SIZE start{ovector_ptr[2 * i]}; + PCRE2_SIZE end{ovector_ptr[(2 * i) + 1]}; + + if (start == PCRE2_UNSET) { + return std::nullopt; + } + return offset_range{.start = start, .end = end}; + } +}; + +class matcher { + const kphp::pcre2::regex& m_re; + std::string_view m_subject; + const kphp::pcre2::regex_match_context_t& m_ctx; + PCRE2_SIZE m_current_offset{}; + const kphp::pcre2::regex_match_data_t& m_match_data; + uint32_t m_base_options{}; + uint32_t m_match_options{}; + bool m_is_utf{false}; + +public: + matcher(const kphp::pcre2::regex& re, std::string_view subject, size_t match_from, const kphp::pcre2::regex_match_context_t& ctx, + const kphp::pcre2::regex_match_data_t& data, uint32_t options = 0) noexcept + : m_re{re}, + m_subject{subject}, + m_ctx{ctx}, + m_current_offset{match_from}, + m_match_data{data}, + m_base_options{options}, + m_is_utf{re.is_utf()} {} + + std::expected, kphp::pcre2::error> next() noexcept { + while (m_current_offset <= m_subject.length()) { + uint32_t current_attempt_options{m_base_options | m_match_options}; + + auto ret_code{pcre2_match_8(m_re.m_code.get(), reinterpret_cast(m_subject.data()), m_subject.length(), m_current_offset, + current_attempt_options, m_match_data.get(), m_ctx.get())}; + + if (ret_code == PCRE2_ERROR_NOMATCH) { + if (m_match_options != 0) { + // If the anchored non-empty match failed, advance 1 unit and try again + m_match_options = 0; + m_current_offset++; + if (m_is_utf) { + m_current_offset = kphp::pcre2::details::skip_utf8_subsequent_bytes(m_current_offset, m_subject); + } + continue; + } + return std::nullopt; + } + + // From https://www.pcre.org/current/doc/html/pcre2_match.html + // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set + // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. + if (ret_code < 0) [[unlikely]] { + return std::unexpected{error{.code = ret_code}}; + } + + size_t matched_groups_count{}; + if (ret_code == 0) { + matched_groups_count = pcre2_get_ovector_count_8(m_match_data.get()); + } else { + matched_groups_count = static_cast(ret_code); + } + + const PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())}; + + size_t start{ovector[0]}; + size_t end{ovector[1]}; + + if (start == end) { + // Found an empty match; set flags to try finding a non-empty match at same position + m_match_options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } else { + m_match_options = 0; + } + m_current_offset = end; + + return kphp::pcre2::match_view{m_re, m_subject, m_match_data, current_attempt_options, matched_groups_count}; + } + + return std::nullopt; + } +}; + +} // namespace kphp::pcre2 + +namespace std { + +template<> +struct formatter { + static constexpr size_t ERROR_BUFFER_LENGTH{256}; + + template + constexpr auto parse(ParseContext& ctx) const noexcept { + return ctx.begin(); + } + + template + auto format(kphp::pcre2::error error, FmtContext& ctx) const noexcept { + std::array buffer{}; + auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; + if (ret_code < 0) [[unlikely]] { + switch (ret_code) { + case PCRE2_ERROR_BADDATA: + return format_to(ctx.out(), "unknown error ({})", error.code); + case PCRE2_ERROR_NOMEMORY: + return format_to(ctx.out(), "[truncated] {}", buffer.data()); + default: + kphp::log::error("unsupported regex error code: {}", ret_code); + } + } + return format_to(ctx.out(), "{}", buffer.data()); + } +}; + +} // namespace std diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 060fd8e0db..1c40ba3f0a 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -29,118 +29,9 @@ #include "runtime-light/stdlib/string/regex-include.h" #include "runtime-light/stdlib/string/regex-state.h" -namespace std { - -template<> -struct formatter { - static constexpr size_t ERROR_BUFFER_LENGTH{256}; - - template - constexpr auto parse(ParseContext& ctx) const noexcept { - return ctx.begin(); - } - - template - auto format(kphp::regex::details::pcre2_error error, FmtContext& ctx) const noexcept { - std::array buffer{}; - auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; - if (ret_code < 0) [[unlikely]] { - switch (ret_code) { - case PCRE2_ERROR_BADDATA: - return format_to(ctx.out(), "unknown error ({})", error.code); - case PCRE2_ERROR_NOMEMORY: - return format_to(ctx.out(), "[truncated] {}", buffer.data()); - default: - kphp::log::error("unsupported regex error code: {}", ret_code); - } - } - return format_to(ctx.out(), "{}", buffer.data()); - } -}; - -} // namespace std - namespace { -enum class trailing_unmatch : uint8_t { skip, include }; - using backref = std::string_view; -using regex_pcre2_group_names_t = kphp::stl::vector; - -struct RegexInfo final { - const string& regex; - std::string_view subject; - std::string_view replacement; - - // PCRE compile options of the regex - uint32_t compile_options{}; - // number of groups including entire match - uint32_t capture_count{}; - // compiled regex - pcre2_code_8* regex_code{nullptr}; - - // vector of group names - regex_pcre2_group_names_t group_names; - - int64_t match_count{}; - uint32_t match_options{PCRE2_NO_UTF_CHECK}; - - int64_t replace_count{}; - uint32_t replace_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY}; - // contains a string after replacements if replace_count > 0, nullopt otherwise - std::optional opt_replace_result; - - RegexInfo() = delete; - - RegexInfo(const string& regex_, std::string_view subject_, std::string_view replacement_) noexcept - : regex(regex_), - subject(subject_), - replacement(replacement_) {} -}; - -class pcre2_match_view { -public: - pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, size_t num_groups) noexcept - : m_subject_data{subject}, - m_ovector_ptr{ovector}, - m_num_groups{num_groups} {} - - int32_t size() const noexcept { - return m_num_groups; - } - - std::optional get_group(size_t i) const noexcept { - if (i >= m_num_groups) { - return std::nullopt; - } - - kphp::log::assertion(m_ovector_ptr); - // ovector is an array of offset pairs - PCRE2_SIZE start{m_ovector_ptr[2 * i]}; - PCRE2_SIZE end{m_ovector_ptr[(2 * i) + 1]}; - - if (start == PCRE2_UNSET) { - return std::nullopt; - } - - return m_subject_data.substr(start, end - start); - } - -private: - std::string_view m_subject_data; - const PCRE2_SIZE* m_ovector_ptr; - size_t m_num_groups; -}; - -template -requires((std::is_same_v && ...) && sizeof...(Args) > 0) -bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { - const bool valid{(flags & ~(supported_flags | ...)) == kphp::regex::PREG_NO_FLAGS}; - if (!valid) [[unlikely]] { - kphp::log::warning("invalid flags: {}", flags); - } - return valid; -} bool correct_offset(int64_t& offset, std::string_view subject) noexcept { if (offset < 0) [[unlikely]] { @@ -153,16 +44,6 @@ bool correct_offset(int64_t& offset, std::string_view subject) noexcept { return offset <= subject.size(); } -int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept { - // all multibyte utf8 runes consist of subsequent bytes, - // these subsequent bytes start with 10 bit pattern - // 0xc0 selects the two most significant bits, then we compare it to 0x80 (0b10000000) - while (offset < subject.size() && ((static_cast(subject[offset])) & 0xc0) == 0x80) { - offset++; - } - return offset; -} - std::optional try_get_backref(std::string_view preg_replacement) noexcept { if (preg_replacement.empty() || !std::isdigit(preg_replacement[0])) { return std::nullopt; @@ -296,32 +177,244 @@ class preg_replacement_parser { } }; -bool compile_regex(RegexInfo& regex_info) noexcept { - const vk::final_action finalizer{[®ex_info]() noexcept { - if (regex_info.regex_code != nullptr) [[likely]] { - pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_CAPTURECOUNT, std::addressof(regex_info.capture_count)); - ++regex_info.capture_count; // to also count entire match +array to_mixed_array(const kphp::regex::details::match_results_wrapper& wrapper) noexcept { + const bool numeric_only{wrapper.name_count() == 0}; + + array result_map{array_size{static_cast(wrapper.max_potential_size()), numeric_only}}; + for (auto [key, value] : wrapper) { + result_map.set_value(key, value); + } + return result_map; +} + +// *** importrant *** +// in case of a pattern order all_matches must already contain all groups as empty arrays before the first call to set_all_matches +void set_all_matches(const kphp::pcre2::regex& re, const kphp::stl::vector& group_names, + const kphp::pcre2::match_view& match_view, int64_t flags, std::optional> opt_all_matches) noexcept { + const auto is_pattern_order{!static_cast(flags & kphp::regex::PREG_SET_ORDER)}; + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + + // early return in case we don't actually need to set matches + if (!opt_all_matches.has_value()) { + return; + } + + auto last_unmatched_policy{is_pattern_order ? kphp::regex::details::trailing_unmatch::include : kphp::regex::details::trailing_unmatch::skip}; + mixed matches{to_mixed_array({match_view, group_names, re.capture_count(), re.name_count(), last_unmatched_policy, is_offset_capture, is_unmatched_as_null})}; + + mixed& all_matches{(*opt_all_matches).get()}; + if (is_pattern_order) [[likely]] { + for (const auto& it : std::as_const(matches)) { + all_matches[it.get_key()].push_back(it.get_value()); + } + } else { + all_matches.push_back(matches); + } +} + +std::optional replace_regex(kphp::regex::details::info& regex_info, const kphp::pcre2::regex& re, uint64_t limit) noexcept { + regex_info.replace_count = 0; + + if (limit == 0) { + return regex_info.subject; + } + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + return std::nullopt; + } + + auto& runtime_ctx{RuntimeContext::get()}; + PCRE2_SIZE buffer_length{ + std::max({regex_info.subject.size(), static_cast(RegexInstanceState::REPLACE_BUFFER_SIZE), runtime_ctx.static_SB.size()})}; + runtime_ctx.static_SB.clean().reserve(buffer_length); + + size_t last_pos{}; + string output_str{}; + + kphp::pcre2::matcher pcre2_matcher{ + re, {regex_info.subject.c_str(), regex_info.subject.size()}, {}, regex_state.match_context, regex_state.regex_pcre2_match_data, regex_info.match_options}; + while (regex_info.replace_count < limit) { + auto expected_opt_match_view{pcre2_matcher.next()}; + + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + auto& match_view{*opt_match_view}; + + output_str.append(std::next(regex_info.subject.c_str(), last_pos), match_view.match_start() - last_pos); + + PCRE2_SIZE replacement_length{buffer_length}; + auto sub_res{match_view.substitute({regex_info.replacement.c_str(), regex_info.replacement.size()}, runtime_ctx.static_SB.buffer(), replacement_length, + regex_state.match_context)}; + if (!sub_res.has_value() && sub_res.error().code == PCRE2_ERROR_NOMEMORY) [[unlikely]] { + runtime_ctx.static_SB.reserve(replacement_length); + buffer_length = replacement_length; + sub_res = match_view.substitute({regex_info.replacement.c_str(), regex_info.replacement.size()}, runtime_ctx.static_SB.buffer(), replacement_length, + regex_state.match_context); + } + if (!sub_res.has_value()) [[unlikely]] { + kphp::log::warning("pcre2_substitute error {}", sub_res.error()); + return std::nullopt; + } + + output_str.append(runtime_ctx.static_SB.buffer(), replacement_length); + + last_pos = match_view.match_end(); + ++regex_info.replace_count; + } + + output_str.append(std::next(regex_info.subject.c_str(), last_pos), regex_info.subject.size() - last_pos); + + return output_str; +} + +std::optional> split_regex(kphp::regex::details::info& regex_info, const kphp::pcre2::regex& re, int64_t limit, bool no_empty, bool delim_capture, + bool offset_capture) noexcept { + if (limit == 0) { + limit = kphp::regex::PREG_NOLIMIT; + } + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + return std::nullopt; + } + + array output{}; + + kphp::pcre2::matcher pcre2_matcher{ + re, {regex_info.subject.c_str(), regex_info.subject.size()}, {}, regex_state.match_context, regex_state.regex_pcre2_match_data, regex_info.match_options}; + size_t offset{}; + for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { + auto expected_opt_match_view{pcre2_matcher.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + kphp::pcre2::match_view match_view{*opt_match_view}; + + if (const auto size{match_view.match_start() - offset}; !no_empty || size != 0) { + string val{std::next(regex_info.subject.c_str(), offset), static_cast(size)}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + ++out_parts_count; + } + + if (delim_capture) { + for (size_t i{1}; i < match_view.size(); i++) { + auto opt_submatch{match_view.get_group(i)}; + auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; + const auto size{submatch_string_view.size()}; + if (!no_empty || size != 0) { + string val; + if (opt_submatch.has_value()) [[likely]] { + val = string{submatch_string_view.data(), static_cast(size)}; + } + + mixed output_val; + if (offset_capture) { + output_val = + array::create(std::move(val), opt_submatch + .transform([®ex_info](auto submatch_string_view) noexcept { + return static_cast(std::distance(regex_info.subject.c_str(), submatch_string_view.data())); + }) + .value_or(-1)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + } + } + } + + offset = match_view.match_end(); + } + + const auto size{regex_info.subject.size() - offset}; + if (!no_empty || size != 0) { + string val{std::next(regex_info.subject.c_str(), offset), static_cast(size)}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); } else { - regex_info.capture_count = 0; + output_val = std::move(val); } - }}; + output.emplace_back(std::move(output_val)); + } + + return output; +} + +} // namespace + +namespace kphp::regex { + +namespace details { + +match_results_wrapper::iterator::reference match_results_wrapper::iterator::operator*() const noexcept { + auto content_opt{m_parent.m_view.get_group_content(m_group_idx)}; + + mixed val_mixed; + + mixed unmatched_val{m_parent.m_is_unmatched_as_null ? mixed{} : mixed{string{}}}; + + if (m_parent.m_is_offset_capture) { + val_mixed = content_opt ? array::create(string{content_opt->text.data(), static_cast(content_opt->text.size())}, + static_cast(content_opt->offset)) + : array::create(unmatched_val, static_cast(-1)); + } else { + val_mixed = content_opt ? string{content_opt->text.data(), static_cast(content_opt->text.size())} : unmatched_val; + } + + mixed key_mixed; + if (m_yield_name) { + auto name{m_parent.m_group_names[m_group_idx].name}; + key_mixed = string{name.data(), static_cast(name.size())}; + } else { + key_mixed = static_cast(m_group_idx); + } + + return {key_mixed, val_mixed}; +} + +std::optional> compile_regex(info& regex_info) noexcept { auto& regex_state{RegexInstanceState::get()}; if (!regex_state.compile_context) [[unlikely]] { - return false; + return std::nullopt; } // check runtime cache if (auto opt_ref{regex_state.get_compiled_regex(regex_info.regex)}; opt_ref.has_value()) { const auto& [compile_options, regex_code]{opt_ref->get()}; regex_info.compile_options = compile_options; - regex_info.regex_code = regex_code.get(); - return true; + return regex_code; } if (regex_info.regex.empty()) { kphp::log::warning("empty regex"); - return false; + return std::nullopt; } char end_delim{}; @@ -360,7 +453,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { } default: { kphp::log::warning("wrong regex delimiter {}", start_delim); - return false; + return std::nullopt; } } @@ -426,17 +519,17 @@ bool compile_regex(RegexInfo& regex_info) noexcept { if (regex_body.empty()) { kphp::log::warning("no ending regex delimiter: {}", regex_info.regex.c_str()); - return false; + return std::nullopt; } // UTF-8 validation if (static_cast(compile_options & PCRE2_UTF)) { if (!mb_UTF8_check(regex_info.regex.c_str())) [[unlikely]] { kphp::log::warning("invalid UTF-8 pattern: {}", regex_info.regex.c_str()); - return false; + return std::nullopt; } - if (!mb_UTF8_check(regex_info.subject.data())) [[unlikely]] { - kphp::log::warning("invalid UTF-8 subject: {}", regex_info.subject); - return false; + if (!mb_UTF8_check(regex_info.subject.c_str())) [[unlikely]] { + kphp::log::warning("invalid UTF-8 subject: {}", regex_info.subject.c_str()); + return std::nullopt; } } @@ -445,453 +538,107 @@ bool compile_regex(RegexInfo& regex_info) noexcept { regex_info.compile_options = compile_options; // compile pcre2_code - int32_t error_number{}; - PCRE2_SIZE error_offset{}; - regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_body.data()), regex_body.size(), regex_info.compile_options, - std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get()), - pcre2_code_free_8}; - if (!regex_code) [[unlikely]] { - kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, kphp::regex::details::pcre2_error{.code = error_number}); - return false; + auto expected_re{pcre2::regex::compile(regex_body, regex_state.compile_context, regex_info.compile_options)}; + if (!expected_re.has_value()) [[unlikely]] { + const auto& err{expected_re.error()}; + kphp::log::warning("can't compile pcre2 regex due to error: {}", static_cast(err)); + return std::nullopt; } - regex_info.regex_code = regex_code.get(); + auto& re{*expected_re}; // add compiled code to runtime cache - regex_state.add_compiled_regex(regex_info.regex, compile_options, std::move(regex_code)); - - return true; + return regex_state.add_compiled_regex(regex_info.regex, compile_options, std::move(re))->get().regex_code; } -bool collect_group_names(RegexInfo& regex_info) noexcept { - if (regex_info.regex_code == nullptr) [[unlikely]] { - return false; - } - +kphp::stl::vector collect_group_names(const pcre2::regex& re) noexcept { + // vector of group names + kphp::stl::vector group_names; // initialize an array of strings to hold group names - regex_info.group_names.resize(regex_info.capture_count); - - uint32_t name_count{}; - pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_NAMECOUNT, std::addressof(name_count)); - if (name_count == 0) { - return true; - } - - PCRE2_SPTR8 name_table{}; - uint32_t name_entry_size{}; - pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_NAMETABLE, std::addressof(name_table)); - pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_NAMEENTRYSIZE, std::addressof(name_entry_size)); - - PCRE2_SPTR8 entry{name_table}; - for (auto i{0}; i < name_count; ++i) { - const auto group_number{static_cast((entry[0] << 8) | entry[1])}; - PCRE2_SPTR8 group_name{std::next(entry, 2)}; - regex_info.group_names[group_number] = reinterpret_cast(group_name); - std::advance(entry, name_entry_size); - } - - return true; -} - -class matcher { -public: - matcher(const RegexInfo& info, size_t match_from) noexcept - : m_regex_info{info}, - m_match_options{info.match_options}, - m_current_offset{match_from} { - kphp::log::assertion(info.regex_code != nullptr); - - const auto& regex_state{RegexInstanceState::get()}; - m_match_data = regex_state.regex_pcre2_match_data.get(); - kphp::log::assertion(m_match_data); - } - - std::expected, kphp::regex::details::pcre2_error> next() noexcept { - const auto& regex_state{RegexInstanceState::get()}; - kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); - - const auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; - - while (true) { - // Try to find match - int32_t ret_code{pcre2_match_8(m_regex_info.regex_code, reinterpret_cast(m_regex_info.subject.data()), m_regex_info.subject.size(), - m_current_offset, m_match_options, m_match_data, regex_state.match_context.get())}; - // From https://www.pcre.org/current/doc/html/pcre2_match.html - // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set - // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. - if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - return std::unexpected{kphp::regex::details::pcre2_error{.code = ret_code}}; - } - size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; - - if (match_count == 0) { - // If match is not found - if (m_match_options == m_regex_info.match_options || m_current_offset == m_regex_info.subject.size()) { - // Here we are sure that there are no more matches here - return std::nullopt; - } - // Here we know that we were looking for a non-empty and anchored match, - // and we're going to try searching from the next character with the default options. - ++m_current_offset; - m_current_offset = - static_cast(m_regex_info.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, m_regex_info.subject) : m_current_offset; - m_match_options = m_regex_info.match_options; - continue; - } - - // Match found - PCRE2_SIZE match_start{ovector[0]}; - PCRE2_SIZE match_end{ovector[1]}; - - m_current_offset = match_end; - if (match_end == match_start) { - // If an empty match is found, try searching for a non-empty attached match next time. - m_match_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; - } else { - // Else use default options - m_match_options = m_regex_info.match_options; - } - return pcre2_match_view{m_regex_info.subject, ovector, match_count}; - } - } - -private: - const RegexInfo& m_regex_info; - uint64_t m_match_options{}; - PCRE2_SIZE m_current_offset{}; - pcre2_match_data_8* m_match_data{nullptr}; -}; - -// returns the ending offset of the entire match -PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional> opt_matches, - trailing_unmatch last_unmatched_policy) noexcept { - if (regex_info.regex_code == nullptr || regex_info.match_count <= 0) [[unlikely]] { - return PCRE2_UNSET; - } - - const auto& regex_state{RegexInstanceState::get()}; - - // get the ouput vector from the match data - const auto* ovector{pcre2_get_ovector_pointer_8(regex_state.regex_pcre2_match_data.get())}; - const auto end_offset{ovector[1]}; - // early return in case we don't need to actually set matches - if (!opt_matches.has_value()) { - return end_offset; - } - - const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; - const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; - // calculate last matched group - int64_t last_matched_group{-1}; - for (auto i{0}; i < regex_info.match_count; ++i) { - if (ovector[static_cast(2 * i)] != PCRE2_UNSET) { - last_matched_group = i; - } - } - // retrieve the named groups count - uint32_t named_groups_count{}; - pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_NAMECOUNT, std::addressof(named_groups_count)); - - // reserve enough space for output - array output{array_size{static_cast(regex_info.group_names.size() + named_groups_count), named_groups_count == 0}}; - for (auto i{0}; i < regex_info.group_names.size(); ++i) { - // skip unmatched groups at the end unless unmatched_as_null is set - if (last_unmatched_policy == trailing_unmatch::skip && i > last_matched_group && !is_unmatched_as_null) [[unlikely]] { - break; - } - - const auto match_start_offset{ovector[static_cast(2 * i)]}; - const auto match_end_offset{ovector[static_cast(2 * i + 1)]}; - - mixed match_val; // NULL value - if (match_start_offset != PCRE2_UNSET) { // handle matched group - const auto match_size{match_end_offset - match_start_offset}; - match_val = string{std::next(regex_info.subject.data(), match_start_offset), static_cast(match_size)}; - } else if (!is_unmatched_as_null) { // handle unmatched group - match_val = string{}; - } - - mixed output_val; - if (is_offset_capture) { - output_val = array::create(std::move(match_val), static_cast(match_start_offset)); - } else { - output_val = std::move(match_val); - } - - if (regex_info.group_names[i] != nullptr) { - output.set_value(string{regex_info.group_names[i]}, output_val); - } - output.emplace_back(output_val); - } + group_names.resize(re.capture_count() + 1); - (*opt_matches).get() = std::move(output); - return end_offset; -} - -// returns the ending offset of the entire match -// *** importrant *** -// in case of a pattern order all_matches must already contain all groups as empty arrays before the first call to set_all_matches -PCRE2_SIZE set_all_matches(const RegexInfo& regex_info, int64_t flags, std::optional> opt_all_matches) noexcept { - const auto pattern_order{!static_cast(flags & kphp::regex::PREG_SET_ORDER)}; - - // early return in case we don't actually need to set matches - if (!opt_all_matches.has_value()) { - return set_matches(regex_info, flags, std::nullopt, pattern_order ? trailing_unmatch::include : trailing_unmatch::skip); + if (re.name_count() == 0) { + return group_names; } - mixed matches; - PCRE2_SIZE offset{set_matches(regex_info, flags, matches, pattern_order ? trailing_unmatch::include : trailing_unmatch::skip)}; - if (offset == PCRE2_UNSET) [[unlikely]] { - return offset; - } - - mixed& all_matches{(*opt_all_matches).get()}; - if (pattern_order) [[likely]] { - for (const auto& it : std::as_const(matches)) { - all_matches[it.get_key()].push_back(it.get_value()); - } - } else { - all_matches.push_back(matches); + for (auto group_name : re.names()) { + group_names[group_name.index] = group_name; } - return offset; + return group_names; } -bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { - regex_info.replace_count = 0; - if (regex_info.regex_code == nullptr) [[unlikely]] { - return false; - } - - const auto& regex_state{RegexInstanceState::get()}; - auto& runtime_ctx{RuntimeContext::get()}; - if (!regex_state.match_context) [[unlikely]] { - return false; - } - - const PCRE2_SIZE buffer_length{std::max({static_cast(regex_info.subject.size()), - static_cast(RegexInstanceState::REPLACE_BUFFER_SIZE), runtime_ctx.static_SB.size()})}; - runtime_ctx.static_SB.clean().reserve(buffer_length); - PCRE2_SIZE output_length{buffer_length}; - - // replace all occurences - if (limit == std::numeric_limits::max()) [[likely]] { - regex_info.replace_count = pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), 0, - regex_info.replace_options | PCRE2_SUBSTITUTE_GLOBAL, nullptr, regex_state.match_context.get(), - reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), - reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); - - if (regex_info.replace_count < 0) [[unlikely]] { - kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = static_cast(regex_info.replace_count)}); - return false; - } - } else { // replace only 'limit' times - size_t substitute_offset{}; - int64_t replacement_diff_acc{}; - PCRE2_SIZE length_after_replace{buffer_length}; - string str_after_replace{regex_info.subject.data(), static_cast(regex_info.subject.size())}; - - matcher pcre2_matcher{regex_info, {}}; - for (; regex_info.replace_count < limit; ++regex_info.replace_count) { - auto expected_opt_match_view{pcre2_matcher.next()}; - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); - return false; - } - auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - break; - } - - auto match_view{*opt_match_view}; - auto opt_entire_pattern_match{match_view.get_group(0)}; - if (!opt_entire_pattern_match.has_value()) [[unlikely]] { - return false; - } - auto entire_pattern_match_string_view{*opt_entire_pattern_match}; - const auto match_start_offset{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data())}; - const auto match_end_offset{match_start_offset + entire_pattern_match_string_view.size()}; - - length_after_replace = buffer_length; - if (auto replace_one_ret_code{pcre2_substitute_8( - regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), substitute_offset, - regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), - regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; - replace_one_ret_code != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = replace_one_ret_code}); - return false; - } - - replacement_diff_acc += regex_info.replacement.size() - (match_end_offset - match_start_offset); - substitute_offset = match_end_offset + replacement_diff_acc; - str_after_replace = {runtime_ctx.static_SB.buffer(), static_cast(length_after_replace)}; - } - - output_length = length_after_replace; - } - - if (regex_info.replace_count > 0) { - runtime_ctx.static_SB.set_pos(output_length); - regex_info.opt_replace_result.emplace(runtime_ctx.static_SB.str()); - } - - return true; -} - -std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bool no_empty, bool delim_capture, bool offset_capture) noexcept { - if (limit == 0) { - limit = kphp::regex::PREG_NOLIMIT; - } - - const auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.match_context) [[unlikely]] { - return std::nullopt; - } - - array output{}; - - matcher pcre2_matcher{regex_info, {}}; - size_t offset{}; - for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { - auto expected_opt_match_view{pcre2_matcher.next()}; - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); - return std::nullopt; - } - auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - break; - } - - pcre2_match_view match_view{*opt_match_view}; - - auto opt_entire_pattern_match{match_view.get_group(0)}; - if (!opt_entire_pattern_match.has_value()) [[unlikely]] { - return std::nullopt; - } - auto entire_pattern_match_string_view{*opt_entire_pattern_match}; - - if (const auto size{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) - offset}; !no_empty || size != 0) { - string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; - - mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - ++out_parts_count; - } - - if (delim_capture) { - for (size_t i{1}; i < match_view.size(); i++) { - auto opt_submatch{match_view.get_group(i)}; - auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; - const auto size{submatch_string_view.size()}; - if (!no_empty || size != 0) { - string val; - if (opt_submatch.has_value()) [[likely]] { - val = string{submatch_string_view.data(), static_cast(size)}; - } - - mixed output_val; - if (offset_capture) { - output_val = - array::create(std::move(val), opt_submatch - .transform([®ex_info](auto submatch_string_view) noexcept { - return static_cast(std::distance(regex_info.subject.data(), submatch_string_view.data())); - }) - .value_or(-1)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - } - } - } - - offset = std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) + entire_pattern_match_string_view.size(); - } - - const auto size{regex_info.subject.size() - offset}; - if (!no_empty || size != 0) { - string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; - - mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset)); - } else { - output_val = std::move(val); - } +} // namespace details - output.emplace_back(std::move(output_val)); - } - - return output; -} - -} // namespace +} // namespace kphp::regex Optional f$preg_match(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + kphp::regex::details::info regex_info{pattern, subject, {}}; - if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) + [[unlikely]] { return false; } - if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { + if (!correct_offset(offset, {regex_info.subject.c_str(), regex_info.subject.size()})) [[unlikely]] { return false; } - if (!compile_regex(regex_info)) [[unlikely]] { - return false; - } - if (!collect_group_names(regex_info)) [[unlikely]] { + auto opt_re{kphp::regex::details::compile_regex(regex_info)}; + if (!opt_re.has_value()) [[unlikely]] { return false; } + const kphp::pcre2::regex& re{opt_re->get()}; + auto group_names{kphp::regex::details::collect_group_names(re)}; const auto& regex_state{RegexInstanceState::get()}; - kphp::log::assertion(regex_info.regex_code != nullptr && regex_state.match_context); - - auto expected_opt_match_view{matcher{regex_info, static_cast(offset)}.next()}; + kphp::log::assertion(regex_state.match_context != nullptr); + + auto expected_opt_match_view{kphp::pcre2::matcher{re, + {regex_info.subject.c_str(), regex_info.subject.size()}, + static_cast(offset), + regex_state.match_context, + regex_state.regex_pcre2_match_data, + regex_info.match_options} + .next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { kphp::log::warning("can't match by pcre2 regex due to error: {}", expected_opt_match_view.error()); return false; } auto opt_match_view{*expected_opt_match_view}; - regex_info.match_count = opt_match_view.transform(&pcre2_match_view::size).value_or(0); - - std::optional> matches{}; if (opt_matches.has_value()) { + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + kphp::log::assertion(std::holds_alternative>(opt_matches.val())); auto& inner_ref{std::get>(opt_matches.val()).get()}; inner_ref = array{}; - matches.emplace(inner_ref); + opt_match_view.transform([is_offset_capture, is_unmatched_as_null, &inner_ref, &group_names, &re](const auto& match_view) { + inner_ref = to_mixed_array({match_view, group_names, re.capture_count(), re.name_count(), kphp::regex::details::trailing_unmatch::skip, is_offset_capture, + is_unmatched_as_null}); + return 0; + }); } - set_matches(regex_info, flags, matches, trailing_unmatch::skip); - return regex_info.match_count > 0 ? 1 : 0; + return opt_match_view.has_value() ? 1 : 0; } Optional f$preg_match_all(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { int64_t entire_match_count{}; - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + kphp::regex::details::info regex_info{pattern, subject, {}}; - if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, kphp::regex::PREG_OFFSET_CAPTURE, - kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, + kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { return false; } - if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { + if (!correct_offset(offset, {regex_info.subject.c_str(), regex_info.subject.size()})) [[unlikely]] { return false; } - if (!compile_regex(regex_info)) [[unlikely]] { - return false; - } - if (!collect_group_names(regex_info)) [[unlikely]] { + auto opt_re{kphp::regex::details::compile_regex(regex_info)}; + if (!opt_re.has_value()) [[unlikely]] { return false; } + const auto& re{*opt_re}; + auto group_names{kphp::regex::details::collect_group_names(re)}; std::optional> matches{}; if (opt_matches.has_value()) { @@ -905,29 +652,39 @@ Optional f$preg_match_all(const string& pattern, const string& subject, if (matches.has_value() && !static_cast(flags & kphp::regex::PREG_SET_ORDER)) [[likely]] { auto& inner_ref{(*matches).get()}; const array init_val{}; - for (const auto* group_name : regex_info.group_names) { - if (group_name != nullptr) { - inner_ref.set_value(string{group_name}, init_val); + for (const auto [name, index] : group_names) { + if (!name.empty()) { + inner_ref.set_value(string{name.data(), static_cast(name.size())}, init_val); } inner_ref.push_back(init_val); } } - matcher pcre2_matcher{regex_info, static_cast(offset)}; + const auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(regex_state.match_context != nullptr); + + kphp::pcre2::matcher pcre2_matcher{re, + {regex_info.subject.c_str(), regex_info.subject.size()}, + static_cast(offset), + regex_state.match_context, + regex_state.regex_pcre2_match_data, + regex_info.match_options}; + + while (true) { + auto expected_opt_match_view{pcre2_matcher.next()}; - auto expected_opt_match_view{pcre2_matcher.next()}; - while (expected_opt_match_view.has_value() && expected_opt_match_view->has_value()) { - pcre2_match_view match_view{**expected_opt_match_view}; - regex_info.match_count = match_view.size(); - set_all_matches(regex_info, flags, matches); - if (regex_info.match_count > 0) { - ++entire_match_count; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't find all matches due to match error: {}", expected_opt_match_view.error()); + return false; } - expected_opt_match_view = pcre2_matcher.next(); - } - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't find all matches due to match error: {}", expected_opt_match_view.error()); - return false; + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + kphp::pcre2::match_view match_view{*opt_match_view}; + set_all_matches(re, group_names, match_view, flags, matches); + ++entire_match_count; } return entire_match_count; @@ -951,7 +708,7 @@ Optional f$preg_replace(const string& pattern, const string& replacement // we need to replace PHP's back references with PCRE2 ones auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}}; - kphp::stl::string pcre2_replacement{}; + string pcre2_replacement{}; for (const auto& term : parser) { if (std::holds_alternative(term)) { auto c{std::get(term)}; @@ -961,23 +718,27 @@ Optional f$preg_replace(const string& pattern, const string& replacement } } else { auto backreference{std::get(term)}; - pcre2_replacement.reserve(pcre2_replacement.size() + backreference.size() + 3); + pcre2_replacement.reserve_at_least(pcre2_replacement.size() + backreference.size() + 3); pcre2_replacement.append("${"); - pcre2_replacement.append(backreference); + pcre2_replacement.append(backreference.data(), backreference.size()); pcre2_replacement.append("}"); } } - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; + kphp::regex::details::info regex_info{pattern, subject, pcre2_replacement}; - if (!compile_regex(regex_info)) [[unlikely]] { + auto opt_re{kphp::regex::details::compile_regex(regex_info)}; + if (!opt_re.has_value()) [[unlikely]] { return {}; } - if (!replace_regex(regex_info, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))) { + const auto& re{opt_re->get()}; + auto opt_replace_result{ + replace_regex(regex_info, re, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))}; + if (!opt_replace_result.has_value()) { return {}; } count = regex_info.replace_count; - return regex_info.opt_replace_result.value_or(subject); + return std::move(*opt_replace_result); } Optional f$preg_replace(const mixed& pattern, const string& replacement, const string& subject, int64_t limit, @@ -1116,17 +877,19 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed } Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit, int64_t flags) noexcept { - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + kphp::regex::details::info regex_info{pattern, subject, {}}; - if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, - kphp::regex::PREG_SPLIT_OFFSET_CAPTURE)) { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, + kphp::regex::PREG_SPLIT_OFFSET_CAPTURE)) { return false; } - if (!compile_regex(regex_info)) [[unlikely]] { - return false; + auto opt_re{kphp::regex::details::compile_regex(regex_info)}; + if (!opt_re.has_value()) [[unlikely]] { + return {}; } - auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, // - (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, // + const auto& re{opt_re->get()}; + auto opt_output{split_regex(regex_info, re, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, // + (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, // (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; if (!opt_output.has_value()) [[unlikely]] { return false; diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index 9152cc3ace..4af9383154 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include @@ -15,15 +17,211 @@ #include "runtime-light/coroutine/task.h" #include "runtime-light/coroutine/type-traits.h" #include "runtime-light/stdlib/diagnostics/logs.h" +#include "runtime-light/stdlib/string/regex-include.h" +#include "runtime-light/stdlib/string/regex-state.h" namespace kphp::regex { +inline constexpr int64_t PREG_NO_FLAGS = 0; + namespace details { -struct pcre2_error { - int32_t code{}; +enum class trailing_unmatch : uint8_t { skip, include }; + +struct info final { + const string& regex; + const string& subject; + string replacement; + + // PCRE compile options of the regex + uint32_t compile_options{}; + + int64_t match_count{}; + uint32_t match_options{PCRE2_NO_UTF_CHECK}; + + int64_t replace_count{}; + uint32_t replace_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY}; + + info() = delete; + + info(const string& regex_, const string& subject_, string replacement_) noexcept + : regex(regex_), + subject(subject_), + replacement(std::move(replacement_)) {} +}; + +class match_results_wrapper { + const pcre2::match_view& m_view; + const kphp::stl::vector& m_group_names; + uint32_t m_capture_count; + uint32_t m_name_count; + trailing_unmatch m_last_unmatched_policy; + bool m_is_offset_capture; + bool m_is_unmatched_as_null; + +public: + match_results_wrapper(const pcre2::match_view& match_view, const kphp::stl::vector& names, + uint32_t capture_count, uint32_t name_count, trailing_unmatch last_unmatched_policy, bool is_offset_capture, + bool is_unmatched_as_null) noexcept + : m_view{match_view}, + m_group_names{names}, + m_capture_count{capture_count}, + m_name_count{name_count}, + m_last_unmatched_policy{last_unmatched_policy}, + m_is_offset_capture{is_offset_capture}, + m_is_unmatched_as_null{is_unmatched_as_null} {} + + uint32_t match_count() const noexcept { + if (!m_is_unmatched_as_null && m_last_unmatched_policy == trailing_unmatch::skip) { + return m_view.size(); + } + return m_capture_count + 1; + } + + size_t max_potential_size() const noexcept { + return match_count() + m_name_count; + } + + uint32_t name_count() const noexcept { + return m_name_count; + } + + class iterator { + const match_results_wrapper& m_parent; + uint32_t m_group_idx; + bool m_yield_name{false}; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = std::pair; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type; + + iterator(const match_results_wrapper& parent, uint32_t group_idx) noexcept + : m_parent{parent}, + m_group_idx{group_idx} { + if (m_group_idx < m_parent.m_group_names.size() && !m_parent.m_group_names[m_group_idx].name.empty()) { + m_yield_name = true; + } + } + + reference operator*() const noexcept; + + iterator& operator++() noexcept { + if (m_yield_name) { + m_yield_name = false; + } else { + m_group_idx++; + + if (m_group_idx < m_parent.m_group_names.size() && !m_parent.m_group_names[m_group_idx].name.empty()) { + m_yield_name = true; + } + } + return *this; + } + + bool operator==(const iterator& other) const noexcept { + return m_group_idx == other.m_group_idx && m_yield_name == other.m_yield_name; + } + bool operator!=(const iterator& other) const noexcept { + return !(*this == other); + } + }; + + iterator begin() const noexcept { + return iterator{*this, 0}; + } + + iterator end() const noexcept { + return iterator{*this, match_count()}; + } }; +template +requires((std::is_same_v && ...) && sizeof...(Args) > 0) +bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { + const bool valid{(flags & ~(supported_flags | ...)) == kphp::regex::PREG_NO_FLAGS}; + if (!valid) [[unlikely]] { + kphp::log::warning("invalid flags: {}", flags); + } + return valid; +} + +std::optional> compile_regex(info& regex_info) noexcept; + +kphp::stl::vector collect_group_names(const pcre2::regex& re) noexcept; + +template> F> +kphp::coro::task> replace_callback(info& regex_info, const pcre2::regex& re, + const kphp::stl::vector& group_names, + F callback, uint64_t limit) noexcept { + regex_info.replace_count = 0; + + if (limit == 0) { + co_return regex_info.subject; + } + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + co_return std::nullopt; + } + + size_t last_pos{}; + string output_str{}; + + pcre2::matcher pcre2_matcher{ + re, {regex_info.subject.c_str(), regex_info.subject.size()}, {}, regex_state.match_context, regex_state.regex_pcre2_match_data, regex_info.match_options}; + while (regex_info.replace_count < limit) { + auto expected_opt_match_view{pcre2_matcher.next()}; + + if (!expected_opt_match_view.has_value()) [[unlikely]] { + log::warning("can't replace with callback by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + co_return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + auto& match_view{*opt_match_view}; + + output_str.append(std::next(regex_info.subject.c_str(), last_pos), match_view.match_start() - last_pos); + + last_pos = match_view.match_end(); + + // retrieve the named groups count + uint32_t named_groups_count{re.name_count()}; + + array matches{array_size{static_cast(match_view.size() + named_groups_count), named_groups_count == 0}}; + for (auto [key, value] : match_results_wrapper{match_view, group_names, re.capture_count(), re.name_count(), trailing_unmatch::skip, false, false}) { + matches.set_value(key, value.to_string()); + } + string replacement{}; + if constexpr (kphp::coro::is_async_function_v>) { + replacement = co_await std::invoke(callback, std::move(matches)); + } else { + replacement = std::invoke(callback, std::move(matches)); + } + + output_str.append(replacement); + + ++regex_info.replace_count; + } + + output_str.append(std::next(regex_info.subject.c_str(), last_pos), regex_info.subject.size() - last_pos); + + co_return output_str; +} + +inline bool valid_preg_replace_mixed(const mixed& param) noexcept { + if (!param.is_array() && !param.is_string()) [[unlikely]] { + kphp::log::warning("invalid parameter: expected to be string or array"); + return false; + } + return true; +} + } // namespace details inline constexpr int64_t PREG_NO_ERROR = 0; @@ -33,7 +231,6 @@ inline constexpr int64_t PREG_RECURSION_LIMIT = 3; inline constexpr int64_t PREG_BAD_UTF8_ERROR = 4; inline constexpr int64_t PREG_BAD_UTF8_OFFSET_ERROR = 5; -inline constexpr int64_t PREG_NO_FLAGS = 0; inline constexpr auto PREG_PATTERN_ORDER = static_cast(1U << 0U); inline constexpr auto PREG_SET_ORDER = static_cast(1U << 1U); inline constexpr auto PREG_OFFSET_CAPTURE = static_cast(1U << 2U); @@ -87,34 +284,93 @@ kphp::coro::task> f$preg_replace_callback(string pattern, F cal Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { static_assert(std::same_as>, string>); - // the performance of this function can be enhanced: - // 1. don't use public f$preg_match and f$preg_replace; - // 2. use match_regex and replace_regex directly; - // 3. reuse match_data from match_regex in replace_regex. - array matches{}; - { // fill matches array or early return - mixed mixed_matches{}; - const auto match_result{f$preg_match(pattern, subject, mixed_matches, flags, 0)}; - if (!match_result.has_value()) [[unlikely]] { + + int64_t count{}; + vk::final_action count_finalizer{[&count, &opt_count]() noexcept { + if (opt_count.has_value()) { + kphp::log::assertion(std::holds_alternative>(opt_count.val())); + auto& inner_ref{std::get>(opt_count.val()).get()}; + inner_ref = count; + } + }}; + + if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { + kphp::log::warning("invalid limit {} in preg_replace_callback", limit); + co_return Optional{}; + } + + kphp::regex::details::info regex_info{pattern, subject, {}}; + + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) + [[unlikely]] { + co_return Optional{}; + } + auto opt_re{kphp::regex::details::compile_regex(regex_info)}; + if (!opt_re.has_value()) [[unlikely]] { + co_return Optional{}; + } + const auto& re{opt_re->get()}; + auto group_names{kphp::regex::details::collect_group_names(re)}; + auto unsigned_limit{limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)}; + regex_info.replace_count = 0; + + if (limit == 0) { + count = regex_info.replace_count; + co_return regex_info.subject; + } + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + co_return Optional{}; + } + + size_t last_pos{}; + string output_str{}; + + kphp::pcre2::matcher pcre2_matcher{ + re, {regex_info.subject.c_str(), regex_info.subject.size()}, {}, regex_state.match_context, regex_state.regex_pcre2_match_data, regex_info.match_options}; + while (regex_info.replace_count < unsigned_limit) { + auto expected_opt_match_view{pcre2_matcher.next()}; + + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't replace with callback by pcre2 regex due to match error: {}", expected_opt_match_view.error()); co_return Optional{}; - } else if (match_result.val() == 0) { // no matches, so just return the subject - co_return std::move(subject); } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + auto& match_view{*opt_match_view}; + + output_str.append(std::next(regex_info.subject.c_str(), last_pos), match_view.match_start() - last_pos); + + last_pos = match_view.match_end(); + + // retrieve the named groups count + uint32_t named_groups_count{re.name_count()}; - matches = array{mixed_matches.as_array().size()}; - for (auto& elem : std::as_const(mixed_matches.as_array())) { - matches.set_value(elem.get_key(), std::move(elem.get_value().as_string())); + array matches{array_size{static_cast(match_view.size() + named_groups_count), named_groups_count == 0}}; + for (auto [key, value] : kphp::regex::details::match_results_wrapper{match_view, group_names, re.capture_count(), re.name_count(), + kphp::regex::details::trailing_unmatch::skip, false, false}) { + matches.set_value(key, value.to_string()); + } + string replacement{}; + if constexpr (kphp::coro::is_async_function_v>) { + replacement = co_await std::invoke(callback, std::move(matches)); + } else { + replacement = std::invoke(callback, std::move(matches)); } - } - string replacement{}; - if constexpr (kphp::coro::is_async_function_v>) { - replacement = co_await std::invoke(callback, std::move(matches)); - } else { - replacement = std::invoke(callback, std::move(matches)); + output_str.append(replacement); + + ++regex_info.replace_count; } - co_return f$preg_replace(pattern, replacement, subject, limit, opt_count); + output_str.append(std::next(regex_info.subject.c_str(), last_pos), regex_info.subject.size() - last_pos); + + count = regex_info.replace_count; + co_return output_str; } template diff --git a/runtime-light/stdlib/string/regex-include.h b/runtime-light/stdlib/string/regex-include.h index b0670a06ed..41a894c046 100644 --- a/runtime-light/stdlib/string/regex-include.h +++ b/runtime-light/stdlib/string/regex-include.h @@ -8,9 +8,3 @@ #define PCRE2_CODE_UNIT_WIDTH 8 #include "pcre2/pcre2.h" - -using regex_pcre2_general_context_t = std::unique_ptr; -using regex_pcre2_compile_context_t = std::unique_ptr; -using regex_pcre2_match_context_t = std::unique_ptr; -using regex_pcre2_match_data_t = std::unique_ptr; -using regex_pcre2_code_t = std::unique_ptr; diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index ea61a9cf9f..077fab4eaa 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -14,7 +14,7 @@ #include "runtime-common/core/allocator/script-allocator.h" #include "runtime-common/core/runtime-core.h" #include "runtime-common/core/std/containers.h" -#include "runtime-light/stdlib/string/regex-include.h" +#include "runtime-light/stdlib/string/pcre2-functions.h" struct RegexInstanceState final : private vk::not_copyable { private: @@ -26,7 +26,7 @@ struct RegexInstanceState final : private vk::not_copyable { // PCRE compile options of the regex uint32_t compile_options{}; // compiled regex - regex_pcre2_code_t regex_code; + kphp::pcre2::regex regex_code; }; kphp::stl::unordered_map regex_pcre2_code_cache; @@ -35,10 +35,10 @@ struct RegexInstanceState final : private vk::not_copyable { static constexpr size_t OVECTOR_SIZE{MAX_SUBPATTERNS_COUNT + 1}; static constexpr size_t REPLACE_BUFFER_SIZE{size_t{16U} * size_t{1024U}}; - const regex_pcre2_general_context_t regex_pcre2_general_context; - const regex_pcre2_compile_context_t compile_context; - const regex_pcre2_match_context_t match_context; - regex_pcre2_match_data_t regex_pcre2_match_data; + const kphp::pcre2::regex_general_context_t regex_pcre2_general_context; + const kphp::pcre2::regex_compile_context_t compile_context; + const kphp::pcre2::regex_match_context_t match_context; + kphp::pcre2::regex_match_data_t regex_pcre2_match_data; RegexInstanceState() noexcept; @@ -50,7 +50,7 @@ struct RegexInstanceState final : private vk::not_copyable { } std::optional> add_compiled_regex(string regex, uint32_t compile_options, - regex_pcre2_code_t regex_code) noexcept { + kphp::pcre2::regex regex_code) noexcept { return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = std::move(regex_code)}) .first->second; } diff --git a/tests/phpt/dl/002_preg_replace_callback.php b/tests/phpt/dl/002_preg_replace_callback.php index 16c97a8946..7519cefee4 100644 --- a/tests/phpt/dl/002_preg_replace_callback.php +++ b/tests/phpt/dl/002_preg_replace_callback.php @@ -1,4 +1,4 @@ -@ok callback benchmark k2_skip +@ok callback benchmark