diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 0b4420d496..ae1ef2bf0c 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -29,118 +29,9 @@ #include "runtime-light/stdlib/string/regex-include.h" #include "runtime-light/stdlib/string/regex-state.h" -namespace std { - -template<> -struct formatter { - static constexpr size_t ERROR_BUFFER_LENGTH{256}; - - template - constexpr auto parse(ParseContext& ctx) const noexcept { - return ctx.begin(); - } - - template - auto format(kphp::regex::details::pcre2_error error, FmtContext& ctx) const noexcept { - std::array buffer{}; - auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; - if (ret_code < 0) [[unlikely]] { - switch (ret_code) { - case PCRE2_ERROR_BADDATA: - return format_to(ctx.out(), "unknown error ({})", error.code); - case PCRE2_ERROR_NOMEMORY: - return format_to(ctx.out(), "[truncated] {}", buffer.data()); - default: - kphp::log::error("unsupported regex error code: {}", ret_code); - } - } - return format_to(ctx.out(), "{}", buffer.data()); - } -}; - -} // namespace std - namespace { -enum class trailing_unmatch : uint8_t { skip, include }; - using backref = std::string_view; -using regex_pcre2_group_names_t = kphp::stl::vector; - -struct RegexInfo final { - const string& regex; - std::string_view subject; - std::string_view replacement; - - // PCRE compile options of the regex - uint32_t compile_options{}; - // number of groups including entire match - uint32_t capture_count{}; - // compiled regex - pcre2_code_8* regex_code{nullptr}; - - // vector of group names - regex_pcre2_group_names_t group_names; - - int64_t match_count{}; - uint32_t match_options{PCRE2_NO_UTF_CHECK}; - - int64_t replace_count{}; - uint32_t replace_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY}; - // contains a string after replacements if replace_count > 0, nullopt otherwise - std::optional opt_replace_result; - - RegexInfo() = delete; - - RegexInfo(const string& regex_, std::string_view subject_, std::string_view replacement_) noexcept - : regex(regex_), - subject(subject_), - replacement(replacement_) {} -}; - -class pcre2_match_view { -public: - pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, size_t num_groups) noexcept - : m_subject_data{subject}, - m_ovector_ptr{ovector}, - m_num_groups{num_groups} {} - - int32_t size() const noexcept { - return m_num_groups; - } - - std::optional get_group(size_t i) const noexcept { - if (i >= m_num_groups) { - return std::nullopt; - } - - kphp::log::assertion(m_ovector_ptr); - // ovector is an array of offset pairs - PCRE2_SIZE start{m_ovector_ptr[2 * i]}; - PCRE2_SIZE end{m_ovector_ptr[(2 * i) + 1]}; - - if (start == PCRE2_UNSET) { - return std::nullopt; - } - - return m_subject_data.substr(start, end - start); - } - -private: - std::string_view m_subject_data; - const PCRE2_SIZE* m_ovector_ptr; - size_t m_num_groups; -}; - -template -requires((std::is_same_v && ...) && sizeof...(Args) > 0) -bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { - const bool valid{(flags & ~(supported_flags | ...)) == kphp::regex::PREG_NO_FLAGS}; - if (!valid) [[unlikely]] { - kphp::log::warning("invalid flags: {}", flags); - } - return valid; -} bool correct_offset(int64_t& offset, std::string_view subject) noexcept { if (offset < 0) [[unlikely]] { @@ -296,7 +187,244 @@ class preg_replacement_parser { } }; -bool compile_regex(RegexInfo& regex_info) noexcept { +// returns the ending offset of the entire match +// *** importrant *** +// in case of a pattern order all_matches must already contain all groups as empty arrays before the first call to set_all_matches +PCRE2_SIZE set_all_matches(const kphp::regex::details::Info& regex_info, int64_t flags, std::optional> opt_all_matches) noexcept { + const auto is_pattern_order{!static_cast(flags & kphp::regex::PREG_SET_ORDER)}; + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + + if (regex_info.match_count <= 0) [[unlikely]] { + return PCRE2_UNSET; + } + + const auto& regex_state{RegexInstanceState::get()}; + + // get the ouput vector from the match data + const auto* ovector{pcre2_get_ovector_pointer_8(regex_state.regex_pcre2_match_data.get())}; + kphp::regex::details::match_view match_view{regex_info.subject, ovector, static_cast(regex_info.match_count)}; + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { + return PCRE2_UNSET; + } + auto entire_pattern_match_string_view{*opt_entire_pattern_match}; + const auto match_start_offset{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data())}; + const auto match_end_offset{match_start_offset + entire_pattern_match_string_view.size()}; + + // early return in case we don't actually need to set matches + if (!opt_all_matches.has_value()) { + return match_end_offset; + } + + auto last_unmatched_policy{is_pattern_order ? kphp::regex::details::trailing_unmatch::include : kphp::regex::details::trailing_unmatch::skip}; + auto opt_dumped_matches{kphp::regex::details::dump_matches(regex_info, match_view, last_unmatched_policy, is_offset_capture, is_unmatched_as_null)}; + if (!opt_dumped_matches.has_value()) [[unlikely]] { + return PCRE2_UNSET; + } + mixed matches{std::move(*opt_dumped_matches)}; + + mixed& all_matches{(*opt_all_matches).get()}; + if (is_pattern_order) [[likely]] { + for (const auto& it : std::as_const(matches)) { + all_matches[it.get_key()].push_back(it.get_value()); + } + } else { + all_matches.push_back(matches); + } + + return match_end_offset; +} + +std::optional> split_regex(kphp::regex::details::Info& regex_info, int64_t limit, bool no_empty, bool delim_capture, + bool offset_capture) noexcept { + if (limit == 0) { + limit = kphp::regex::PREG_NOLIMIT; + } + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + return std::nullopt; + } + + array output{}; + + kphp::regex::details::matcher pcre2_matcher{regex_info, {}}; + size_t offset{}; + for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { + auto expected_opt_match_view{pcre2_matcher.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + kphp::regex::details::match_view match_view{*opt_match_view}; + + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { + return std::nullopt; + } + auto entire_pattern_match_string_view{*opt_entire_pattern_match}; + + if (const auto size{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) - offset}; !no_empty || size != 0) { + string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + ++out_parts_count; + } + + if (delim_capture) { + for (size_t i{1}; i < match_view.size(); i++) { + auto opt_submatch{match_view.get_group(i)}; + auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; + const auto size{submatch_string_view.size()}; + if (!no_empty || size != 0) { + string val; + if (opt_submatch.has_value()) [[likely]] { + val = string{submatch_string_view.data(), static_cast(size)}; + } + + mixed output_val; + if (offset_capture) { + output_val = + array::create(std::move(val), opt_submatch + .transform([®ex_info](auto submatch_string_view) noexcept { + return static_cast(std::distance(regex_info.subject.data(), submatch_string_view.data())); + }) + .value_or(-1)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + } + } + } + + offset = std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) + entire_pattern_match_string_view.size(); + } + + const auto size{regex_info.subject.size() - offset}; + if (!no_empty || size != 0) { + string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + } + + return output; +} + +} // namespace + +namespace kphp::regex { + +namespace details { + +std::optional match_view::get_group(size_t i) const noexcept { + if (i >= m_num_groups) { + return std::nullopt; + } + + kphp::log::assertion(m_ovector_ptr); + // ovector is an array of offset pairs + PCRE2_SIZE start{m_ovector_ptr[2 * i]}; + PCRE2_SIZE end{m_ovector_ptr[(2 * i) + 1]}; + + if (start == PCRE2_UNSET) { + return std::nullopt; + } + + return m_subject_data.substr(start, end - start); +} + +matcher::matcher(const Info& info, size_t match_from) noexcept + : m_regex_info{info}, + m_match_options{info.match_options}, + m_current_offset{match_from} { + kphp::log::assertion(info.regex_code != nullptr); + + const auto& regex_state{RegexInstanceState::get()}; + m_match_data = regex_state.regex_pcre2_match_data.get(); + kphp::log::assertion(m_match_data); +} + +std::expected, details::pcre2_error> matcher::next() noexcept { + const auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); + + const auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; + + while (true) { + // Try to find match + int32_t ret_code{pcre2_match_8(m_regex_info.regex_code, reinterpret_cast(m_regex_info.subject.data()), m_regex_info.subject.size(), + m_current_offset, m_match_options, m_match_data, regex_state.match_context.get())}; + // From https://www.pcre.org/current/doc/html/pcre2_match.html + // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set + // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. + if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { + return std::unexpected{details::pcre2_error{.code = ret_code}}; + } + size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; + + if (match_count == 0) { + // If match is not found + if (m_match_options == m_regex_info.match_options || m_current_offset == m_regex_info.subject.size()) { + // Here we are sure that there are no more matches here + return std::nullopt; + } + // Here we know that we were looking for a non-empty and anchored match, + // and we're going to try searching from the next character with the default options. + ++m_current_offset; + m_current_offset = + static_cast(m_regex_info.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, m_regex_info.subject) : m_current_offset; + m_match_options = m_regex_info.match_options; + continue; + } + + // Match found + PCRE2_SIZE match_start{ovector[0]}; + PCRE2_SIZE match_end{ovector[1]}; + + m_current_offset = match_end; + if (match_end == match_start) { + // If an empty match is found, try searching for a non-empty attached match next time. + m_match_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } else { + // Else use default options + m_match_options = m_regex_info.match_options; + } + return details::match_view{m_regex_info.subject, ovector, match_count}; + } +} + +std::pair reserve_buffer(std::string_view subject) noexcept { + auto& runtime_ctx{RuntimeContext::get()}; + + const PCRE2_SIZE buffer_length{std::max( + {static_cast(subject.size()), static_cast(RegexInstanceState::REPLACE_BUFFER_SIZE), runtime_ctx.static_SB.size()})}; + runtime_ctx.static_SB.clean().reserve(buffer_length); + return {runtime_ctx.static_SB, buffer_length}; +} + +bool compile_regex(Info& regex_info) noexcept { const vk::final_action finalizer{[®ex_info]() noexcept { if (regex_info.regex_code != nullptr) [[likely]] { pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_CAPTURECOUNT, std::addressof(regex_info.capture_count)); @@ -462,7 +590,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { return true; } -bool collect_group_names(RegexInfo& regex_info) noexcept { +bool collect_group_names(Info& regex_info) noexcept { if (regex_info.regex_code == nullptr) [[unlikely]] { return false; } @@ -492,101 +620,13 @@ bool collect_group_names(RegexInfo& regex_info) noexcept { return true; } -class matcher { -public: - matcher(const RegexInfo& info, size_t match_from) noexcept - : m_regex_info{info}, - m_match_options{info.match_options}, - m_current_offset{match_from} { - kphp::log::assertion(info.regex_code != nullptr); - - const auto& regex_state{RegexInstanceState::get()}; - m_match_data = regex_state.regex_pcre2_match_data.get(); - kphp::log::assertion(m_match_data); - } - - std::expected, kphp::regex::details::pcre2_error> next() noexcept { - const auto& regex_state{RegexInstanceState::get()}; - kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); - - const auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; - - while (true) { - // Try to find match - int32_t ret_code{pcre2_match_8(m_regex_info.regex_code, reinterpret_cast(m_regex_info.subject.data()), m_regex_info.subject.size(), - m_current_offset, m_match_options, m_match_data, regex_state.match_context.get())}; - // From https://www.pcre.org/current/doc/html/pcre2_match.html - // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set - // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. - if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - return std::unexpected{kphp::regex::details::pcre2_error{.code = ret_code}}; - } - size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; - - if (match_count == 0) { - // If match is not found - if (m_match_options == m_regex_info.match_options || m_current_offset == m_regex_info.subject.size()) { - // Here we are sure that there are no more matches here - return std::nullopt; - } - // Here we know that we were looking for a non-empty and anchored match, - // and we're going to try searching from the next character with the default options. - ++m_current_offset; - m_current_offset = - static_cast(m_regex_info.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, m_regex_info.subject) : m_current_offset; - m_match_options = m_regex_info.match_options; - continue; - } - - // Match found - PCRE2_SIZE match_start{ovector[0]}; - PCRE2_SIZE match_end{ovector[1]}; - - m_current_offset = match_end; - if (match_end == match_start) { - // If an empty match is found, try searching for a non-empty attached match next time. - m_match_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; - } else { - // Else use default options - m_match_options = m_regex_info.match_options; - } - return pcre2_match_view{m_regex_info.subject, ovector, match_count}; - } - } - -private: - const RegexInfo& m_regex_info; - uint64_t m_match_options{}; - PCRE2_SIZE m_current_offset{}; - pcre2_match_data_8* m_match_data{nullptr}; -}; - -// returns the ending offset of the entire match -PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional> opt_matches, - trailing_unmatch last_unmatched_policy) noexcept { - if (regex_info.regex_code == nullptr || regex_info.match_count <= 0) [[unlikely]] { - return PCRE2_UNSET; - } - - const auto& regex_state{RegexInstanceState::get()}; - - // get the ouput vector from the match data - const auto* ovector{pcre2_get_ovector_pointer_8(regex_state.regex_pcre2_match_data.get())}; - const auto end_offset{ovector[1]}; - // early return in case we don't need to actually set matches - if (!opt_matches.has_value()) { - return end_offset; +std::optional> dump_matches(const Info& regex_info, const details::match_view& match, details::trailing_unmatch last_unmatched_policy, + bool is_offset_capture, bool is_unmatched_as_null) noexcept { + if (regex_info.regex_code == nullptr) [[unlikely]] { + return std::nullopt; } - const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; - const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; - // calculate last matched group - int64_t last_matched_group{-1}; - for (auto i{0}; i < regex_info.match_count; ++i) { - if (ovector[static_cast(2 * i)] != PCRE2_UNSET) { - last_matched_group = i; - } - } + int64_t last_matched_group{match.size() - 1}; // retrieve the named groups count uint32_t named_groups_count{}; pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_NAMECOUNT, std::addressof(named_groups_count)); @@ -595,24 +635,27 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional array output{array_size{static_cast(regex_info.group_names.size() + named_groups_count), named_groups_count == 0}}; for (auto i{0}; i < regex_info.group_names.size(); ++i) { // skip unmatched groups at the end unless unmatched_as_null is set - if (last_unmatched_policy == trailing_unmatch::skip && i > last_matched_group && !is_unmatched_as_null) [[unlikely]] { + if (last_unmatched_policy == details::trailing_unmatch::skip && i > last_matched_group && !is_unmatched_as_null) [[unlikely]] { break; } - const auto match_start_offset{ovector[static_cast(2 * i)]}; - const auto match_end_offset{ovector[static_cast(2 * i + 1)]}; + auto opt_submatch{match.get_group(i)}; - mixed match_val; // NULL value - if (match_start_offset != PCRE2_UNSET) { // handle matched group - const auto match_size{match_end_offset - match_start_offset}; - match_val = string{std::next(regex_info.subject.data(), match_start_offset), static_cast(match_size)}; - } else if (!is_unmatched_as_null) { // handle unmatched group - match_val = string{}; + auto match_val{is_unmatched_as_null ? mixed{} : string{}}; + if (opt_submatch.has_value()) { // handle matched group + auto submatch_string_view{*opt_submatch}; + const auto match_size{submatch_string_view.size()}; + match_val = string{submatch_string_view.data(), static_cast(match_size)}; } mixed output_val; if (is_offset_capture) { - output_val = array::create(std::move(match_val), static_cast(match_start_offset)); + output_val = + array::create(std::move(match_val), opt_submatch + .transform([®ex_info](auto submatch_string_view) noexcept { + return static_cast(std::distance(regex_info.subject.data(), submatch_string_view.data())); + }) + .value_or(-1)); } else { output_val = std::move(match_val); } @@ -623,40 +666,10 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional output.emplace_back(output_val); } - (*opt_matches).get() = std::move(output); - return end_offset; -} - -// returns the ending offset of the entire match -// *** importrant *** -// in case of a pattern order all_matches must already contain all groups as empty arrays before the first call to set_all_matches -PCRE2_SIZE set_all_matches(const RegexInfo& regex_info, int64_t flags, std::optional> opt_all_matches) noexcept { - const auto pattern_order{!static_cast(flags & kphp::regex::PREG_SET_ORDER)}; - - // early return in case we don't actually need to set matches - if (!opt_all_matches.has_value()) { - return set_matches(regex_info, flags, std::nullopt, pattern_order ? trailing_unmatch::include : trailing_unmatch::skip); - } - - mixed matches; - PCRE2_SIZE offset{set_matches(regex_info, flags, matches, pattern_order ? trailing_unmatch::include : trailing_unmatch::skip)}; - if (offset == PCRE2_UNSET) [[unlikely]] { - return offset; - } - - mixed& all_matches{(*opt_all_matches).get()}; - if (pattern_order) [[likely]] { - for (const auto& it : std::as_const(matches)) { - all_matches[it.get_key()].push_back(it.get_value()); - } - } else { - all_matches.push_back(matches); - } - - return offset; + return output; } -bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { +bool replace_regex(Info& regex_info, uint64_t limit, size_t substitute_offset) noexcept { regex_info.replace_count = 0; if (regex_info.regex_code == nullptr) [[unlikely]] { return false; @@ -675,22 +688,22 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { // replace all occurences if (limit == std::numeric_limits::max()) [[likely]] { - regex_info.replace_count = pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), 0, - regex_info.replace_options | PCRE2_SUBSTITUTE_GLOBAL, nullptr, regex_state.match_context.get(), - reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), - reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); + regex_info.replace_count = + pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), substitute_offset, + regex_info.replace_options | PCRE2_SUBSTITUTE_GLOBAL, nullptr, regex_state.match_context.get(), + reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), + reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = static_cast(regex_info.replace_count)}); return false; } } else { // replace only 'limit' times - size_t substitute_offset{}; int64_t replacement_diff_acc{}; PCRE2_SIZE length_after_replace{buffer_length}; string str_after_replace{regex_info.subject.data(), static_cast(regex_info.subject.size())}; - matcher pcre2_matcher{regex_info, {}}; + kphp::regex::details::matcher pcre2_matcher{regex_info, {}}; for (; regex_info.replace_count < limit; ++regex_info.replace_count) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { @@ -717,7 +730,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = replace_one_ret_code}); + kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code); return false; } @@ -737,159 +750,73 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { return true; } -std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bool no_empty, bool delim_capture, bool offset_capture) noexcept { - if (limit == 0) { - limit = kphp::regex::PREG_NOLIMIT; - } +} // namespace details - const auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.match_context) [[unlikely]] { - return std::nullopt; - } - - array output{}; - - matcher pcre2_matcher{regex_info, {}}; - size_t offset{}; - for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { - auto expected_opt_match_view{pcre2_matcher.next()}; - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); - return std::nullopt; - } - auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - break; - } - - pcre2_match_view match_view{*opt_match_view}; - - auto opt_entire_pattern_match{match_view.get_group(0)}; - if (!opt_entire_pattern_match.has_value()) [[unlikely]] { - return std::nullopt; - } - auto entire_pattern_match_string_view{*opt_entire_pattern_match}; - - if (const auto size{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) - offset}; !no_empty || size != 0) { - string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; - - mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - ++out_parts_count; - } - - if (delim_capture) { - for (size_t i{1}; i < match_view.size(); i++) { - auto opt_submatch{match_view.get_group(i)}; - auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; - const auto size{submatch_string_view.size()}; - if (!no_empty || size != 0) { - string val; - if (opt_submatch.has_value()) [[likely]] { - val = string{submatch_string_view.data(), static_cast(size)}; - } - - mixed output_val; - if (offset_capture) { - output_val = - array::create(std::move(val), opt_submatch - .transform([®ex_info](auto submatch_string_view) noexcept { - return static_cast(std::distance(regex_info.subject.data(), submatch_string_view.data())); - }) - .value_or(-1)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - } - } - } - - offset = std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) + entire_pattern_match_string_view.size(); - } - - const auto size{regex_info.subject.size() - offset}; - if (!no_empty || size != 0) { - string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; - - mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - } - - return output; -} - -} // namespace +} // namespace kphp::regex Optional f$preg_match(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + kphp::regex::details::Info regex_info{pattern, {subject.c_str(), subject.size()}, {}}; - if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) + [[unlikely]] { return false; } if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { return false; } - if (!compile_regex(regex_info)) [[unlikely]] { + if (!kphp::regex::details::compile_regex(regex_info)) [[unlikely]] { return false; } - if (!collect_group_names(regex_info)) [[unlikely]] { + if (!kphp::regex::details::collect_group_names(regex_info)) [[unlikely]] { return false; } const auto& regex_state{RegexInstanceState::get()}; kphp::log::assertion(regex_info.regex_code != nullptr && regex_state.match_context); - auto expected_opt_match_view{matcher{regex_info, static_cast(offset)}.next()}; + auto expected_opt_match_view{kphp::regex::details::matcher{regex_info, static_cast(offset)}.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { kphp::log::warning("can't match by pcre2 regex due to error: {}", expected_opt_match_view.error()); return false; } auto opt_match_view{*expected_opt_match_view}; - regex_info.match_count = opt_match_view.transform(&pcre2_match_view::size).value_or(0); - - std::optional> matches{}; if (opt_matches.has_value()) { + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + kphp::log::assertion(std::holds_alternative>(opt_matches.val())); auto& inner_ref{std::get>(opt_matches.val()).get()}; inner_ref = array{}; - matches.emplace(inner_ref); + opt_match_view.transform([is_offset_capture, is_unmatched_as_null, &inner_ref, ®ex_info](const auto& match_view) { + auto opt_dumped_matches{ + kphp::regex::details::dump_matches(regex_info, match_view, kphp::regex::details::trailing_unmatch::skip, is_offset_capture, is_unmatched_as_null)}; + if (opt_dumped_matches.has_value()) [[likely]] { + inner_ref = std::move(*opt_dumped_matches); + } + return 0; + }); } - set_matches(regex_info, flags, matches, trailing_unmatch::skip); - return regex_info.match_count > 0 ? 1 : 0; + return opt_match_view.has_value() ? 1 : 0; } Optional f$preg_match_all(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { int64_t entire_match_count{}; - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + kphp::regex::details::Info regex_info{pattern, {subject.c_str(), subject.size()}, {}}; - if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, kphp::regex::PREG_OFFSET_CAPTURE, - kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, + kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { return false; } if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { return false; } - if (!compile_regex(regex_info)) [[unlikely]] { + if (!kphp::regex::details::compile_regex(regex_info)) [[unlikely]] { return false; } - if (!collect_group_names(regex_info)) [[unlikely]] { + if (!kphp::regex::details::collect_group_names(regex_info)) [[unlikely]] { return false; } @@ -913,11 +840,11 @@ Optional f$preg_match_all(const string& pattern, const string& subject, } } - matcher pcre2_matcher{regex_info, static_cast(offset)}; + kphp::regex::details::matcher pcre2_matcher{regex_info, static_cast(offset)}; auto expected_opt_match_view{pcre2_matcher.next()}; while (expected_opt_match_view.has_value() && expected_opt_match_view->has_value()) { - pcre2_match_view match_view{**expected_opt_match_view}; + kphp::regex::details::match_view match_view{**expected_opt_match_view}; regex_info.match_count = match_view.size(); set_all_matches(regex_info, flags, matches); if (regex_info.match_count > 0) { @@ -968,12 +895,13 @@ Optional f$preg_replace(const string& pattern, const string& replacement } } - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; + kphp::regex::details::Info regex_info{pattern, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; - if (!compile_regex(regex_info)) [[unlikely]] { + if (!kphp::regex::details::compile_regex(regex_info)) [[unlikely]] { return {}; } - if (!replace_regex(regex_info, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))) { + if (!kphp::regex::details::replace_regex(regex_info, + limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))) { return {}; } count = regex_info.replace_count; @@ -991,7 +919,7 @@ Optional f$preg_replace(const mixed& pattern, const string& replacement, } }}; - if (!regex_impl_::valid_preg_replace_mixed(pattern)) [[unlikely]] { + if (!kphp::regex::details::valid_preg_replace_mixed(pattern)) [[unlikely]] { return {}; } @@ -1027,7 +955,7 @@ Optional f$preg_replace(const mixed& pattern, const mixed& replacement, } }}; - if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(replacement)) [[unlikely]] { + if (!kphp::regex::details::valid_preg_replace_mixed(pattern) || !kphp::regex::details::valid_preg_replace_mixed(replacement)) [[unlikely]] { return {}; } @@ -1075,8 +1003,8 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed } }}; - if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(replacement) || !regex_impl_::valid_preg_replace_mixed(subject)) - [[unlikely]] { + if (!kphp::regex::details::valid_preg_replace_mixed(pattern) || !kphp::regex::details::valid_preg_replace_mixed(replacement) || + !kphp::regex::details::valid_preg_replace_mixed(subject)) [[unlikely]] { return {}; } @@ -1102,13 +1030,13 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed } Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit, int64_t flags) noexcept { - RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + kphp::regex::details::Info regex_info{pattern, {subject.c_str(), subject.size()}, {}}; - if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, - kphp::regex::PREG_SPLIT_OFFSET_CAPTURE)) { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, + kphp::regex::PREG_SPLIT_OFFSET_CAPTURE)) { return false; } - if (!compile_regex(regex_info)) [[unlikely]] { + if (!kphp::regex::details::compile_regex(regex_info)) [[unlikely]] { return false; } auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, // diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index a2a52d3479..94e200ff7d 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -15,16 +16,123 @@ #include "runtime-light/coroutine/task.h" #include "runtime-light/coroutine/type-traits.h" #include "runtime-light/stdlib/diagnostics/logs.h" +#include "runtime-light/stdlib/string/regex-include.h" +#include "runtime-light/stdlib/string/regex-state.h" -namespace kphp::regex { +namespace kphp::regex::details { -namespace details { +enum class trailing_unmatch : uint8_t { skip, include }; + +using pcre2_group_names_t = kphp::stl::vector; + +template +using dumped_match_t = std::conditional_t, std::conditional_t>; +template +using dumped_matches_t = array>; struct pcre2_error { int32_t code{}; }; -} // namespace details +class match_view { +public: + match_view(std::string_view subject, const PCRE2_SIZE* ovector, size_t num_groups) noexcept + : m_subject_data{subject}, + m_ovector_ptr{ovector}, + m_num_groups{num_groups} {} + + int32_t size() const noexcept { + return m_num_groups; + } + + std::optional get_group(size_t i) const noexcept; + +private: + std::string_view m_subject_data; + const PCRE2_SIZE* m_ovector_ptr; + size_t m_num_groups; +}; + +struct Info final { + const string& regex; + std::string_view subject; + std::string_view replacement; + + // PCRE compile options of the regex + uint32_t compile_options{}; + // number of groups including entire match + uint32_t capture_count{}; + // compiled regex + pcre2_code_8* regex_code{nullptr}; + + // vector of group names + details::pcre2_group_names_t group_names; + + int64_t match_count{}; + uint32_t match_options{PCRE2_NO_UTF_CHECK}; + + int64_t replace_count{}; + uint32_t replace_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY}; + // contains a string after replacements if replace_count > 0, nullopt otherwise + std::optional opt_replace_result; + + Info() = delete; + + Info(const string& regex_, std::string_view subject_, std::string_view replacement_) noexcept + : regex(regex_), + subject(subject_), + replacement(replacement_) {} +}; + +class matcher { +public: + matcher(const Info& info, size_t match_from) noexcept; + + std::expected, details::pcre2_error> next() noexcept; + +private: + const Info& m_regex_info; + uint64_t m_match_options{}; + PCRE2_SIZE m_current_offset{}; + pcre2_match_data_8* m_match_data{nullptr}; +}; + +std::pair reserve_buffer(std::string_view subject) noexcept; + +} // namespace kphp::regex::details + +namespace std { + +template<> +struct formatter { + static constexpr size_t ERROR_BUFFER_LENGTH{256}; + + template + constexpr auto parse(ParseContext& ctx) const noexcept { + return ctx.begin(); + } + + template + auto format(kphp::regex::details::pcre2_error error, FmtContext& ctx) const noexcept { + std::array buffer{}; + auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; + if (ret_code < 0) [[unlikely]] { + switch (ret_code) { + case PCRE2_ERROR_BADDATA: + return format_to(ctx.out(), "unknown error ({})", error.code); + case PCRE2_ERROR_NOMEMORY: + return format_to(ctx.out(), "[truncated] {}", buffer.data()); + default: + kphp::log::error("unsupported regex error code: {}", ret_code); + } + } + return format_to(ctx.out(), "{}", buffer.data()); + } +}; + +} // namespace std + +namespace kphp::regex { inline constexpr int64_t PREG_NO_ERROR = 0; inline constexpr int64_t PREG_INTERNAL_ERROR = 1; @@ -44,9 +152,101 @@ inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); inline constexpr int64_t PREG_NOLIMIT = -1; -} // namespace kphp::regex +namespace details { + +template +requires((std::is_same_v && ...) && sizeof...(Args) > 0) +bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { + const bool valid{(flags & ~(supported_flags | ...)) == kphp::regex::PREG_NO_FLAGS}; + if (!valid) [[unlikely]] { + kphp::log::warning("invalid flags: {}", flags); + } + return valid; +} + +bool compile_regex(Info& regex_info) noexcept; + +bool collect_group_names(Info& regex_info) noexcept; + +std::optional> dump_matches(const Info& regex_info, const details::match_view& match, details::trailing_unmatch last_unmatched_policy, + bool is_offset_capture, bool is_unmatched_as_null) noexcept; -namespace regex_impl_ { +bool replace_regex(Info& regex_info, uint64_t limit, size_t substitute_offset = 0) noexcept; + +template> F> +kphp::coro::task replace_callback(Info& regex_info, F callback, uint64_t limit) noexcept { + regex_info.replace_count = 0; + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + co_return false; + } + + auto [sb, buffer_length]{details::reserve_buffer(regex_info.subject)}; + + size_t substitute_offset{}; + int64_t replacement_diff_acc{}; + PCRE2_SIZE length_after_replace{buffer_length}; + string subject{regex_info.subject.data(), static_cast(regex_info.subject.size())}; + + matcher pcre2_matcher{regex_info, {}}; + for (; limit == std::numeric_limits::max() || regex_info.replace_count < limit; ++regex_info.replace_count) { + auto expected_opt_match_view{pcre2_matcher.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + log::warning("can't replace with callback by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + co_return false; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + auto& match_view{*opt_match_view}; + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { + co_return false; + } + auto entire_pattern_match_string_view{*opt_entire_pattern_match}; + const auto match_start_offset{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data())}; + const auto match_end_offset{match_start_offset + entire_pattern_match_string_view.size()}; + regex_info.match_count = match_view.size(); + + auto opt_dumped_matches{dump_matches(regex_info, match_view, details::trailing_unmatch::skip, false, false)}; + if (!opt_dumped_matches.has_value()) [[unlikely]] { + co_return false; + } + + const auto& dumped_matches{*opt_dumped_matches}; + auto matches{array{dumped_matches.size()}}; + for (const auto& elem : dumped_matches) { + matches.set_value(elem.get_key(), elem.get_value().to_string()); + } + string replacement{}; + if constexpr (kphp::coro::is_async_function_v>) { + replacement = co_await std::invoke(callback, std::move(matches)); + } else { + replacement = std::invoke(callback, std::move(matches)); + } + + Info info{regex_info.regex, {subject.c_str(), subject.size()}, {replacement.c_str(), replacement.size()}}; + if (!replace_regex(info, 1, substitute_offset)) [[unlikely]] { + co_return false; + } + auto str_after_replace{info.opt_replace_result.value_or(subject)}; + length_after_replace = str_after_replace.size(); + + replacement_diff_acc += static_cast(str_after_replace.size()) - static_cast(subject.size()); + log::debug("match_end={}, replacement_diff_acc={}", match_end_offset, replacement_diff_acc); + substitute_offset = match_end_offset + replacement_diff_acc; + subject = std::move(str_after_replace); + } + + if (regex_info.replace_count > 0) { + sb.set_pos(length_after_replace); + regex_info.opt_replace_result.emplace(sb.str()); + } + + co_return true; +} inline bool valid_preg_replace_mixed(const mixed& param) noexcept { if (!param.is_array() && !param.is_string()) [[unlikely]] { @@ -56,7 +256,9 @@ inline bool valid_preg_replace_mixed(const mixed& param) noexcept { return true; } -} // namespace regex_impl_ +} // namespace details + +} // namespace kphp::regex using regexp = string; @@ -99,41 +301,47 @@ kphp::coro::task> f$preg_replace_callback(string pattern, F cal Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { static_assert(std::same_as>, string>); - // the performance of this function can be enhanced: - // 1. don't use public f$preg_match and f$preg_replace; - // 2. use match_regex and replace_regex directly; - // 3. reuse match_data from match_regex in replace_regex. - array matches{}; - { // fill matches array or early return - mixed mixed_matches{}; - const auto match_result{f$preg_match(pattern, subject, mixed_matches, flags, 0)}; - if (!match_result.has_value()) [[unlikely]] { - co_return Optional{}; - } else if (match_result.val() == 0) { // no matches, so just return the subject - co_return std::move(subject); - } - matches = array{mixed_matches.as_array().size()}; - for (auto& elem : std::as_const(mixed_matches.as_array())) { - matches.set_value(elem.get_key(), std::move(elem.get_value().as_string())); + int64_t count{}; + vk::final_action count_finalizer{[&count, &opt_count]() noexcept { + if (opt_count.has_value()) { + kphp::log::assertion(std::holds_alternative>(opt_count.val())); + auto& inner_ref{std::get>(opt_count.val()).get()}; + inner_ref = count; } - } + }}; - string replacement{}; - if constexpr (kphp::coro::is_async_function_v>) { - replacement = co_await std::invoke(callback, std::move(matches)); - } else { - replacement = std::invoke(callback, std::move(matches)); + if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { + kphp::log::warning("invalid limit {} in preg_replace_callback", limit); + co_return Optional{}; } - co_return f$preg_replace(pattern, replacement, subject, limit, opt_count); + kphp::regex::details::Info regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) + [[unlikely]] { + co_return Optional{}; + } + if (!kphp::regex::details::compile_regex(regex_info)) [[unlikely]] { + co_return Optional{}; + } + if (!kphp::regex::details::collect_group_names(regex_info)) [[unlikely]] { + co_return Optional{}; + } + if (!co_await kphp::regex::details::replace_callback( + regex_info, std::move(callback), limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))) + [[unlikely]] { + co_return Optional{}; + } + count = regex_info.replace_count; + co_return regex_info.opt_replace_result.value_or(subject); } template kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { - if (!regex_impl_::valid_preg_replace_mixed(pattern)) [[unlikely]] { + if (!kphp::regex::details::valid_preg_replace_mixed(pattern)) [[unlikely]] { co_return Optional{}; } @@ -172,7 +380,7 @@ template kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { - if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(subject)) [[unlikely]] { + if (!kphp::regex::details::valid_preg_replace_mixed(pattern) || !kphp::regex::details::valid_preg_replace_mixed(subject)) [[unlikely]] { co_return mixed{}; } diff --git a/tests/phpt/dl/002_preg_replace_callback.php b/tests/phpt/dl/002_preg_replace_callback.php index 16c97a8946..7519cefee4 100644 --- a/tests/phpt/dl/002_preg_replace_callback.php +++ b/tests/phpt/dl/002_preg_replace_callback.php @@ -1,4 +1,4 @@ -@ok callback benchmark k2_skip +@ok callback benchmark