|
| 1 | +// Compiler for PHP (aka KPHP) |
| 2 | +// Copyright (c) 2025 LLC «V Kontakte» |
| 3 | +// Distributed under the GPL v3 License, see LICENSE.notice.txt |
| 4 | + |
| 5 | +#pragma once |
| 6 | + |
#include <array>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <format>
#include <iterator>
#include <memory>
#include <optional>
#include <span>
#include <string_view>
#include <utility>

#include "runtime-light/stdlib/diagnostics/logs.h"
// correctly include PCRE2 lib
#include "runtime-light/stdlib/string/regex-include.h"
| 19 | + |
| 20 | +namespace kphp::pcre2 { |
| 21 | + |
| 22 | +namespace details { |
| 23 | + |
// Layout of one entry of the PCRE2 ovector: entries are pairs of offsets stored
// back to back, so entry `i` begins at index SIZE * i.
namespace offset_pair {

// position of the first offset inside a pair
inline constexpr size_t START = 0;
// position of the second offset inside a pair
inline constexpr size_t END = START + 1;
// number of offsets that make up one pair
inline constexpr size_t SIZE = END + 1;

} // namespace offset_pair
| 31 | + |
// Moves `offset` forward past UTF-8 continuation bytes of `subject`.
// A continuation byte has the bit pattern 10xxxxxx: masking with 0xc0 keeps the two most
// significant bits, which then have to be equal to 0x80 (0b10000000).
// Returns the first position at or after `offset` that does not hold a continuation byte;
// `offset` itself is returned when it is already at or beyond the end of `subject`.
inline int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept {
  static constexpr unsigned char TOP_TWO_BITS_MASK{0xc0};
  static constexpr unsigned char CONTINUATION_MARKER{0x80};

  const auto is_continuation_byte{[](char byte) noexcept { return (static_cast<unsigned char>(byte) & TOP_TWO_BITS_MASK) == CONTINUATION_MARKER; }};

  for (; offset < subject.size(); ++offset) {
    if (!is_continuation_byte(subject[offset])) {
      break;
    }
  }
  return offset;
}
| 41 | + |
| 42 | +} // namespace details |
| 43 | + |
| 44 | +using general_context = std::unique_ptr<pcre2_general_context_8, decltype(std::addressof(pcre2_general_context_free_8))>; |
| 45 | +using compile_context = std::unique_ptr<pcre2_compile_context_8, decltype(std::addressof(pcre2_compile_context_free_8))>; |
| 46 | +using match_context = std::unique_ptr<pcre2_match_context_8, decltype(std::addressof(pcre2_match_context_free_8))>; |
| 47 | +using match_data = std::unique_ptr<pcre2_match_data_8, decltype(std::addressof(pcre2_match_data_free_8))>; |
| 48 | +using code = std::unique_ptr<pcre2_code_8, decltype(std::addressof(pcre2_code_free_8))>; |
| 49 | + |
// Failure reported by a PCRE2 call: carries the numeric code returned by the library.
struct error {
  int32_t code = 0;
};
| 53 | + |
| 54 | +struct compile_error : kphp::pcre2::error { |
| 55 | + size_t offset{}; |
| 56 | +}; |
| 57 | + |
// A named capture group: its name (a non-owning view) and its group number.
struct group_name {
  std::string_view name{};
  size_t index = 0;
};
| 62 | + |
// Compiled PCRE2 pattern (8-bit code units). Owns the underlying pcre2_code and offers
// read-only queries about it. Instances are created with regex::compile().
class regex {
  kphp::pcre2::code m_code;

  // Iterator over the entries of the PCRE2 name table (PCRE2_INFO_NAMETABLE).
  // Every entry occupies `entry_size` code units: the group number is stored in the first
  // two bytes (most significant byte first), the zero-terminated group name follows.
  class group_name_iterator {
    const PCRE2_UCHAR8* m_ptr{nullptr};
    // deliberately not const: an iterator has to be assignable to be usable with
    // the standard iterator concepts and algorithms
    size_t m_entry_size{};

  public:
    using iterator_category = std::forward_iterator_tag;
    using value_type = kphp::pcre2::group_name;
    using difference_type = std::ptrdiff_t;
    using pointer = kphp::pcre2::group_name*;
    using reference = kphp::pcre2::group_name;

    group_name_iterator() noexcept = default;

    // `current_entry` is allowed to be nullptr: that is how the begin/end iterators of an
    // empty range are represented. Such an iterator must never be dereferenced.
    group_name_iterator(const PCRE2_UCHAR8* current_entry, size_t entry_size) noexcept
      : m_ptr{current_entry},
        m_entry_size{entry_size} {}

    // Decodes the entry the iterator points at. The returned name is a view into the
    // name table, it is not copied.
    kphp::pcre2::group_name operator*() const noexcept {
      static constexpr size_t UPPER = 0;
      static constexpr size_t LOWER = 1;

      kphp::log::assertion(m_ptr != nullptr);

      const auto index{static_cast<size_t>((m_ptr[UPPER] << 8) | m_ptr[LOWER])};
      // the name starts right after the two bytes holding the group number
      const auto* name_ptr{reinterpret_cast<const char*>(std::next(m_ptr, 2 * sizeof(PCRE2_UCHAR8)))};
      return {.name = std::string_view{name_ptr}, .index = index};
    }

    // Steps to the next table entry.
    group_name_iterator& operator++() noexcept {
      std::advance(m_ptr, m_entry_size);
      return *this;
    }

    group_name_iterator operator++(int) noexcept { // NOLINT
      group_name_iterator tmp{*this};
      ++*this;
      return tmp;
    }

    // Iterators are equal when they point at the same table entry.
    bool operator==(const group_name_iterator& other) const noexcept {
      return m_ptr == other.m_ptr;
    }
  };

public:
  friend class match_view;
  friend class matcher;

  // Compiles `pattern` with pcre2_compile_8 using the given compile context and options.
  // Returns the compiled regex, or a compile_error holding the error code and the offset
  // reported by PCRE2 when compilation fails.
  static std::expected<regex, kphp::pcre2::compile_error> compile(std::string_view pattern, kphp::pcre2::compile_context& ctx, uint32_t options = 0) noexcept {
    int32_t errorcode{};
    PCRE2_SIZE erroroffset{};

    kphp::pcre2::code re{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR>(pattern.data()), pattern.length(), options, std::addressof(errorcode),
                                         std::addressof(erroroffset), ctx.get()),
                         pcre2_code_free_8};

    if (!re) {
      return std::unexpected{kphp::pcre2::compile_error{{.code = errorcode}, erroroffset}};
    }
    return kphp::pcre2::regex{std::move(re)};
  }

  // Range over the named groups of the pattern, usable in a range-based for loop.
  struct group_name_range {
    group_name_iterator b;
    group_name_iterator e;

    group_name_iterator begin() const noexcept {
      return b;
    }
    group_name_iterator end() const noexcept {
      return e;
    }

    bool empty() const noexcept {
      return b == e;
    }
  };

  // Returns the named groups of the pattern. The range is empty (both iterators are
  // default-constructed) when the pattern has no named groups.
  group_name_range group_names() const noexcept {
    const uint32_t count{name_count()};
    if (count == 0) {
      return {.b = group_name_iterator{}, .e = group_name_iterator{}};
    }

    uint32_t entry_size{};
    PCRE2_SPTR8 table{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMEENTRYSIZE, std::addressof(entry_size)) == 0 &&
                         pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMETABLE, std::addressof(table)) == 0);

    return {.b = group_name_iterator{table, entry_size}, .e = group_name_iterator{std::next(table, static_cast<size_t>(count) * entry_size), entry_size}};
  }

  // Value of PCRE2_INFO_CAPTURECOUNT for the pattern.
  uint32_t capture_count() const noexcept {
    uint32_t count{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_CAPTURECOUNT, std::addressof(count)) == 0);
    return count;
  }

  // Value of PCRE2_INFO_NAMECOUNT for the pattern.
  uint32_t name_count() const noexcept {
    uint32_t count{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMECOUNT, std::addressof(count)) == 0);
    return count;
  }

  // Tells whether the pattern works in UTF mode.
  // PCRE2_INFO_ALLOPTIONS is queried instead of PCRE2_INFO_ARGOPTIONS so that UTF mode
  // enabled from inside the pattern (e.g. a leading "(*UTF)") is detected as well, not
  // only PCRE2_UTF passed to compile().
  bool is_utf() const noexcept {
    uint32_t effective_options{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_ALLOPTIONS, std::addressof(effective_options)) == 0);
    return (effective_options & PCRE2_UTF) != 0;
  }

private:
  explicit regex(kphp::pcre2::code&& code) noexcept
    : m_code{std::move(code)} {}
};
| 182 | + |
// Non-owning view of a single match: it keeps references to the regex and to the match data
// plus a view of the subject, and reads group offsets from the match data's ovector.
class match_view {
  const kphp::pcre2::regex& m_re;
  std::string_view m_subject;
  kphp::pcre2::match_data& m_match_data;
  uint32_t m_match_options{};
  size_t m_num_groups{};

public:
  match_view(const regex& re, std::string_view subject, kphp::pcre2::match_data& match_data, uint32_t match_options, size_t num_groups) noexcept
    : m_re{re},
      m_subject{subject},
      m_match_data{match_data},
      m_match_options{match_options},
      m_num_groups{num_groups} {}

  // Number of groups this view exposes (the `num_groups` value given to the constructor).
  int32_t size() const noexcept {
    return m_num_groups;
  }

  struct offset_range {
    size_t start{};
    size_t end{};
  };

  // Text of group `i`, or std::nullopt when the group is out of range or unset.
  std::optional<std::string_view> get_group(size_t i) const noexcept {
    const auto content{get_group_content(i)};
    if (!content.has_value()) {
      return std::nullopt;
    }
    return content->text;
  }

  struct group_content {
    std::string_view text;
    size_t offset{};
  };

  // Text of group `i` together with its start offset in the subject,
  // or std::nullopt when the group is out of range or unset.
  std::optional<group_content> get_group_content(size_t i) const noexcept {
    const auto range{get_range(i)};
    if (!range.has_value()) {
      return std::nullopt;
    }
    const auto [start, end]{*range};
    return group_content{.text = m_subject.substr(start, end - start), .offset = start};
  }

  // Start offset of the whole match (first ovector pair).
  size_t match_start() const noexcept {
    const PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())};
    return ovector[kphp::pcre2::details::offset_pair::START];
  }
  // End offset of the whole match (first ovector pair).
  size_t match_end() const noexcept {
    const PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())};
    return ovector[kphp::pcre2::details::offset_pair::END];
  }

  // Runs pcre2_substitute_8 for this match, writing the expanded `replacement` into `buffer`.
  // The existing match data is reused (PCRE2_SUBSTITUTE_MATCHED) and only the replacement
  // is produced (PCRE2_SUBSTITUTE_REPLACEMENT_ONLY).
  // Returns the output length on success. On failure returns the length value PCRE2 left in
  // the output-length argument paired with the error; with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
  // that value is expected to be the required buffer size when `buffer` was too small
  // (see the PCRE2 documentation).
  std::expected<size_t, std::pair<size_t, kphp::pcre2::error>> substitute(std::string_view replacement, std::span<char> buffer,
                                                                          kphp::pcre2::match_context& ctx) const noexcept {
    kphp::log::assertion(buffer.data() != nullptr);

    const uint32_t options{m_match_options | PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY | PCRE2_SUBSTITUTE_MATCHED |
                           PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_REPLACEMENT_ONLY};

    const auto* subject_ptr{reinterpret_cast<PCRE2_SPTR8>(m_subject.data())};
    const auto* replacement_ptr{reinterpret_cast<PCRE2_SPTR8>(replacement.data())};
    auto* output_ptr{reinterpret_cast<PCRE2_UCHAR8*>(buffer.data())};

    // in: capacity of the buffer, out: length reported by PCRE2
    size_t output_length{buffer.size()};
    const auto status{pcre2_substitute_8(m_re.m_code.get(), subject_ptr, m_subject.length(), 0, options, m_match_data.get(), ctx.get(), replacement_ptr,
                                         replacement.length(), output_ptr, std::addressof(output_length))};

    if (status >= 0) {
      return output_length;
    }
    return std::unexpected<std::pair<size_t, kphp::pcre2::error>>{{output_length, {.code = status}}};
  }

private:
  // Offsets of group `i` taken from the ovector, or std::nullopt when `i` is not below the
  // number of groups or the group's start offset is PCRE2_UNSET.
  std::optional<offset_range> get_range(size_t i) const noexcept {
    if (i >= m_num_groups) {
      return std::nullopt;
    }

    // ovector is an array of offset pairs, move to the pair that belongs to group `i`
    const PCRE2_SIZE* group_pair{pcre2_get_ovector_pointer_8(m_match_data.get()) + (kphp::pcre2::details::offset_pair::SIZE * i)};
    const PCRE2_SIZE group_start{group_pair[kphp::pcre2::details::offset_pair::START]};
    const PCRE2_SIZE group_end{group_pair[kphp::pcre2::details::offset_pair::END]};

    if (group_start == PCRE2_UNSET) {
      return std::nullopt;
    }
    return offset_range{.start = group_start, .end = group_end};
  }
};
| 269 | + |
// Finds successive matches of a regex in a subject. Each call to next() continues from the
// offset where the previous match ended and writes its result into the shared match data.
class matcher {
  const kphp::pcre2::regex& m_re;
  std::string_view m_subject;
  kphp::pcre2::match_context& m_ctx;
  PCRE2_SIZE m_current_offset{};
  kphp::pcre2::match_data& m_match_data;
  uint32_t m_user_options{};
  uint32_t m_match_options{};
  bool m_is_utf{false};

public:
  // `match_from` is the offset of the first match attempt; `options` are added to every
  // pcre2_match_8 call.
  matcher(const kphp::pcre2::regex& re, std::string_view subject, size_t match_from, kphp::pcre2::match_context& ctx, kphp::pcre2::match_data& data,
          uint32_t options = 0) noexcept
    : m_re{re},
      m_subject{subject},
      m_ctx{ctx},
      m_current_offset{match_from},
      m_match_data{data},
      m_user_options{options},
      m_is_utf{re.is_utf()} {}

  // Returns a view of the next match, std::nullopt when there are no more matches, or the
  // PCRE2 error when matching fails for any reason other than "no match".
  std::expected<std::optional<kphp::pcre2::match_view>, kphp::pcre2::error> next() noexcept {
    // options used to look for a non-empty match at the position of an empty one
    static constexpr uint32_t RETRY_NON_EMPTY_OPTIONS{PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED};

    const auto* subject_ptr{reinterpret_cast<PCRE2_SPTR8>(m_subject.data())};
    const auto subject_length{m_subject.length()};

    while (m_current_offset <= subject_length) {
      const uint32_t attempt_options{m_user_options | m_match_options};
      const auto status{
          pcre2_match_8(m_re.m_code.get(), subject_ptr, subject_length, m_current_offset, attempt_options, m_match_data.get(), m_ctx.get())};

      // From https://www.pcre.org/current/doc/html/pcre2_match.html
      // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set
      // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors.
      if (status >= 0) {
        // zero means the ovector was too small, so every pair it has is reported
        const size_t groups_count{status == 0 ? static_cast<size_t>(pcre2_get_ovector_count_8(m_match_data.get())) : static_cast<size_t>(status)};

        const PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data.get())};
        const size_t match_begin{ovector[kphp::pcre2::details::offset_pair::START]};
        const size_t match_finish{ovector[kphp::pcre2::details::offset_pair::END]};

        // After an empty match the next attempt first looks for a non-empty match at the same position
        const bool empty_match{match_begin == match_finish};
        m_match_options = empty_match ? RETRY_NON_EMPTY_OPTIONS : uint32_t{0};
        m_current_offset = match_finish;

        return kphp::pcre2::match_view{m_re, m_subject, m_match_data, attempt_options, groups_count};
      }

      if (status != PCRE2_ERROR_NOMATCH) [[unlikely]] {
        return std::unexpected{error{.code = status}};
      }

      if (m_match_options == 0) {
        // an ordinary attempt found nothing: the subject is exhausted
        return std::nullopt;
      }

      // The anchored non-empty attempt failed: advance 1 unit (a whole character in UTF mode) and try again
      m_match_options = 0;
      ++m_current_offset;
      if (m_is_utf) {
        m_current_offset = kphp::pcre2::details::skip_utf8_subsequent_bytes(m_current_offset, m_subject);
      }
    }

    return std::nullopt;
  }
};
| 344 | + |
| 345 | +} // namespace kphp::pcre2 |
| 346 | + |
// std::format support for kphp::pcre2::error: prints the message that
// pcre2_get_error_message_8 provides for the stored error code.
template<>
struct std::formatter<kphp::pcre2::error> {
  // No format specification is interpreted, the parse context is returned untouched.
  template<typename ParseContext>
  constexpr auto parse(ParseContext& ctx) const noexcept {
    return ctx.begin();
  }

  template<typename FmtContext>
  auto format(kphp::pcre2::error error, FmtContext& ctx) const noexcept {
    static constexpr size_t ERROR_BUFFER_LENGTH{256};

    std::array<char, ERROR_BUFFER_LENGTH> message; // NOLINT
    const auto status{pcre2_get_error_message_8(error.code, reinterpret_cast<PCRE2_UCHAR8*>(message.data()), message.size())};

    if (status >= 0) [[likely]] {
      return format_to(ctx.out(), "{}", message.data());
    }
    if (status == PCRE2_ERROR_BADDATA) {
      // PCRE2 has no message for this code
      return format_to(ctx.out(), "unknown error ({})", error.code);
    }
    if (status == PCRE2_ERROR_NOMEMORY) {
      // the message did not fit into the buffer
      return format_to(ctx.out(), "[truncated] {}", message.data());
    }

    kphp::log::error("unsupported regex error code: {}", status);
    return format_to(ctx.out(), "{}", message.data());
  }
};
0 commit comments