Skip to content

Commit 9a800aa

Browse files
authored
[k2] refactor pcre2 functions (#1501)
1 parent bcdece1 commit 9a800aa

File tree

9 files changed

+1039
-650
lines changed

9 files changed

+1039
-650
lines changed

runtime-light/coroutine/await-set.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class await_set {
4848
}
4949

5050
auto next() noexcept {
51-
return detail::await_set::await_set_awaitable<return_type>{*m_await_broker.get()};
51+
return detail::await_set::await_set_awaitable<return_type>{*m_await_broker};
5252
}
5353

5454
bool empty() const noexcept {
Lines changed: 372 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,372 @@
1+
// Compiler for PHP (aka KPHP)
2+
// Copyright (c) 2025 LLC «V Kontakte»
3+
// Distributed under the GPL v3 License, see LICENSE.notice.txt
4+
5+
#pragma once
6+
7+
#include <array>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <format>
#include <iterator>
#include <memory>
#include <optional>
#include <span>
#include <string_view>
#include <utility>

#include "runtime-light/stdlib/diagnostics/logs.h"
// correctly include PCRE2 lib
#include "runtime-light/stdlib/string/regex-include.h"
19+
20+
namespace kphp::pcre2 {
21+
22+
namespace details {
23+
24+
// Indices used to address one entry of the PCRE2 ovector, which is laid out as
// consecutive (start, end) offset pairs.
namespace offset_pair {

// position of the start offset inside a pair
inline constexpr size_t START = 0;
// position of the end offset inside a pair
inline constexpr size_t END = 1;
// number of ovector elements occupied by one pair
inline constexpr size_t SIZE = 2;

} // namespace offset_pair
31+
32+
// Moves `offset` forward past any UTF-8 continuation bytes found at that position in `subject`.
// Returns the first offset that is either outside of `subject` or does not point at a continuation byte;
// an offset that is already out of range is returned unchanged.
inline int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept {
  // every non-leading byte of a multibyte UTF-8 rune has the form 0b10xxxxxx,
  // so it is enough to inspect the two most significant bits
  const auto is_continuation_byte{[](unsigned char byte) noexcept { return (byte >> 6) == 0b10; }};

  for (; offset < subject.size() && is_continuation_byte(static_cast<unsigned char>(subject[offset])); ++offset) {
  }
  return offset;
}
41+
42+
} // namespace details
43+
44+
using general_context = std::unique_ptr<pcre2_general_context_8, decltype(std::addressof(pcre2_general_context_free_8))>;
45+
using compile_context = std::unique_ptr<pcre2_compile_context_8, decltype(std::addressof(pcre2_compile_context_free_8))>;
46+
using match_context = std::unique_ptr<pcre2_match_context_8, decltype(std::addressof(pcre2_match_context_free_8))>;
47+
using match_data = std::unique_ptr<pcre2_match_data_8, decltype(std::addressof(pcre2_match_data_free_8))>;
48+
using code = std::unique_ptr<pcre2_code_8, decltype(std::addressof(pcre2_code_free_8))>;
49+
50+
// Carries the raw numeric code returned by a PCRE2 call.
struct error {
  int32_t code = 0;
};
53+
54+
// Compilation failure: the PCRE2 error code (inherited) together with the
// error offset reported by pcre2_compile_8.
struct compile_error : kphp::pcre2::error {
  size_t offset = 0;
};
57+
58+
// A named capture group: its name and the number of the group it refers to.
// `name` is a non-owning view.
struct group_name {
  std::string_view name;
  size_t index = 0;
};
62+
63+
// Owning wrapper around a compiled PCRE2 pattern.
// Instances are created with regex::compile(); the remaining members expose pattern metadata
// obtained through pcre2_pattern_info_8 (capture count, named groups, UTF flag).
class regex {
  kphp::pcre2::code m_code;

  // Iterator over the entries of the pattern's name table (see group_names()).
  // As decoded by operator*, an entry starts with the 16-bit group number (most significant byte first)
  // followed by the zero-terminated group name; consecutive entries are m_entry_size code units apart.
  class group_name_iterator {
    const PCRE2_UCHAR8* m_ptr{nullptr};
    // deliberately non-const: a const member would make the iterator non-assignable
    size_t m_entry_size{};

  public:
    using iterator_category = std::forward_iterator_tag;
    using value_type = kphp::pcre2::group_name;
    using difference_type = std::ptrdiff_t;
    using pointer = kphp::pcre2::group_name*;
    using reference = kphp::pcre2::group_name;

    // A default-constructed iterator is equal to the iterators of an empty range and must not be dereferenced.
    group_name_iterator() noexcept = default;

    // `current_entry` may be null: group_names() uses null iterators to represent the empty range
    // of a pattern without named groups. Such iterators may only be compared, never dereferenced.
    group_name_iterator(const PCRE2_UCHAR8* current_entry, size_t entry_size) noexcept
      : m_ptr{current_entry},
        m_entry_size{entry_size} {}

    // Decodes the entry the iterator currently points at.
    // The returned name is a view into the name table, it is not copied.
    kphp::pcre2::group_name operator*() const noexcept {
      static constexpr size_t UPPER = 0;
      static constexpr size_t LOWER = 1;

      // dereferencing is the only operation that actually needs a valid entry
      kphp::log::assertion(m_ptr != nullptr);

      const auto index{static_cast<size_t>((m_ptr[UPPER] << 8) | m_ptr[LOWER])};
      // the name starts right after the two code units holding the group number
      const auto* name_ptr{reinterpret_cast<const char*>(std::next(m_ptr, 2 * sizeof(PCRE2_UCHAR8)))};
      return {.name = std::string_view{name_ptr}, .index = index};
    }

    group_name_iterator& operator++() noexcept {
      std::advance(m_ptr, m_entry_size);
      return *this;
    }

    group_name_iterator operator++(int) noexcept { // NOLINT
      group_name_iterator tmp{*this};
      ++*this;
      return tmp;
    }

    bool operator==(const group_name_iterator& other) const noexcept {
      return m_ptr == other.m_ptr;
    }
  };

public:
  friend class match_view;
  friend class matcher;

  // Compiles `pattern` with the given PCRE2 compile `options` using the compile context `ctx`.
  // Returns the compiled regex, or a compile_error holding the error code and
  // the error offset reported by pcre2_compile_8.
  static std::expected<regex, kphp::pcre2::compile_error> compile(std::string_view pattern, kphp::pcre2::compile_context& ctx, uint32_t options = 0) noexcept {
    int32_t errorcode{};
    PCRE2_SIZE erroroffset{};

    kphp::pcre2::code re{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR>(pattern.data()), pattern.length(), options, std::addressof(errorcode),
                                         std::addressof(erroroffset), ctx.get()),
                         pcre2_code_free_8};

    if (!re) {
      return std::unexpected{kphp::pcre2::compile_error{{.code = errorcode}, erroroffset}};
    }
    return kphp::pcre2::regex{std::move(re)};
  }

  // Range over the named groups of the pattern, usable in a range-based for loop.
  struct group_name_range {
    group_name_iterator b;
    group_name_iterator e;

    group_name_iterator begin() const noexcept {
      return b;
    }
    group_name_iterator end() const noexcept {
      return e;
    }

    bool empty() const noexcept {
      return b == e;
    }
  };

  // Returns the range of named groups of the pattern; the range is empty when the pattern has none.
  group_name_range group_names() const noexcept {
    const uint32_t count{name_count()};
    if (count == 0) {
      // no table to walk: two equal (null) iterators form an empty range
      return {.b = group_name_iterator{nullptr, 0}, .e = group_name_iterator{nullptr, 0}};
    }

    uint32_t entry_size{};
    PCRE2_SPTR8 table{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMEENTRYSIZE, std::addressof(entry_size)) == 0 &&
                         pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMETABLE, std::addressof(table)) == 0);

    return {.b = group_name_iterator{table, entry_size}, .e = group_name_iterator{std::next(table, static_cast<size_t>(count) * entry_size), entry_size}};
  }

  // Value of PCRE2_INFO_CAPTURECOUNT for the pattern.
  uint32_t capture_count() const noexcept {
    uint32_t count{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_CAPTURECOUNT, std::addressof(count)) == 0);
    return count;
  }

  // Value of PCRE2_INFO_NAMECOUNT for the pattern, i.e. the number of entries in the name table.
  uint32_t name_count() const noexcept {
    uint32_t count{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_NAMECOUNT, std::addressof(count)) == 0);
    return count;
  }

  // True when PCRE2_UTF is among the options reported by PCRE2_INFO_ARGOPTIONS.
  bool is_utf() const noexcept {
    uint32_t compile_options{};
    kphp::log::assertion(pcre2_pattern_info_8(m_code.get(), PCRE2_INFO_ARGOPTIONS, std::addressof(compile_options)) == 0);
    return (compile_options & PCRE2_UTF) != 0;
  }

private:
  explicit regex(kphp::pcre2::code&& code) noexcept
    : m_code{std::move(code)} {}
};
182+
183+
// Lightweight view of one match produced by matcher::next().
// It stores references to the regex and to the match data and reads offsets from the match data
// on every call, so it presumably must not be used after either of them is destroyed or
// after the match data is overwritten by another match attempt.
class match_view {
  const kphp::pcre2::regex& m_re;
  std::string_view m_subject;
  kphp::pcre2::match_data& m_match_data;
  uint32_t m_match_options{};
  size_t m_num_groups{};

public:
  match_view(const regex& re, std::string_view subject, kphp::pcre2::match_data& match_data, uint32_t match_options, size_t num_groups) noexcept
    : m_re{re},
      m_subject{subject},
      m_match_data{match_data},
      m_match_options{match_options},
      m_num_groups{num_groups} {}

  // Number of groups available in this match (the value passed as `num_groups`).
  int32_t size() const noexcept {
    return static_cast<int32_t>(m_num_groups);
  }

  // Offsets of a group inside the subject: [start, end).
  struct offset_range {
    size_t start{};
    size_t end{};
  };

  // Text of group `i`, or std::nullopt when the index is out of range or the group is unset.
  std::optional<std::string_view> get_group(size_t i) const noexcept {
    const auto group_range{get_range(i)};
    if (!group_range.has_value()) {
      return std::nullopt;
    }
    const size_t length{group_range->end - group_range->start};
    return m_subject.substr(group_range->start, length);
  }

  // Text of a group together with the offset in the subject where it starts.
  struct group_content {
    std::string_view text;
    size_t offset{};
  };

  // Same as get_group(), but additionally reports where the group starts in the subject.
  std::optional<group_content> get_group_content(size_t i) const noexcept {
    const auto group_range{get_range(i)};
    if (!group_range.has_value()) {
      return std::nullopt;
    }
    const size_t length{group_range->end - group_range->start};
    return group_content{.text = m_subject.substr(group_range->start, length), .offset = group_range->start};
  }

  // Start offset of the whole match (first pair of the ovector).
  size_t match_start() const noexcept {
    const auto* offsets{pcre2_get_ovector_pointer_8(m_match_data.get())};
    return offsets[kphp::pcre2::details::offset_pair::START];
  }
  // End offset of the whole match (first pair of the ovector).
  size_t match_end() const noexcept {
    const auto* offsets{pcre2_get_ovector_pointer_8(m_match_data.get())};
    return offsets[kphp::pcre2::details::offset_pair::END];
  }

  // Expands `replacement` for this match into `buffer` by calling pcre2_substitute_8 with the stored match data.
  // On success returns the length reported by PCRE2. On failure returns the length reported by PCRE2
  // through its in/out length argument together with the error code.
  // See the pcre2_substitute documentation for the exact meaning of the PCRE2_SUBSTITUTE_* options used here.
  std::expected<size_t, std::pair<size_t, kphp::pcre2::error>> substitute(std::string_view replacement, std::span<char> buffer,
                                                                          kphp::pcre2::match_context& ctx) const noexcept {
    kphp::log::assertion(buffer.data() != nullptr);

    const uint32_t options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY | PCRE2_SUBSTITUTE_MATCHED | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH |
                           PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | m_match_options};

    const auto* subject_ptr{reinterpret_cast<PCRE2_SPTR8>(m_subject.data())};
    const auto* replacement_ptr{reinterpret_cast<PCRE2_SPTR8>(replacement.data())};
    auto* output_ptr{reinterpret_cast<PCRE2_UCHAR8*>(buffer.data())};

    // in: capacity of the output buffer, out: length written back by PCRE2
    auto output_length{buffer.size()};
    const auto status{pcre2_substitute_8(m_re.m_code.get(), subject_ptr, m_subject.length(), 0, options, m_match_data.get(), ctx.get(), replacement_ptr,
                                         replacement.length(), output_ptr, std::addressof(output_length))};

    if (status >= 0) {
      return output_length;
    }
    return std::unexpected<std::pair<size_t, kphp::pcre2::error>>{{output_length, {.code = status}}};
  }

private:
  // Offsets of group `i`, or std::nullopt when `i` is not below the group count or the group is unset.
  std::optional<offset_range> get_range(size_t i) const noexcept {
    if (i >= m_num_groups) {
      return std::nullopt;
    }

    // ovector is an array of offset pairs, so group `i` lives at index SIZE * i
    const auto* offsets{pcre2_get_ovector_pointer_8(m_match_data.get())};
    const size_t pair_base{kphp::pcre2::details::offset_pair::SIZE * i};
    const PCRE2_SIZE group_start{offsets[pair_base + kphp::pcre2::details::offset_pair::START]};
    const PCRE2_SIZE group_end{offsets[pair_base + kphp::pcre2::details::offset_pair::END]};

    if (group_start == PCRE2_UNSET) {
      return std::nullopt;
    }
    return offset_range{.start = group_start, .end = group_end};
  }
};
269+
270+
// Iterates over successive matches of a regex in a subject string.
// Each call to next() runs pcre2_match_8 from the current offset and stores the result in the
// match data passed to the constructor; the returned match_view refers to that same match data.
class matcher {
  const kphp::pcre2::regex& m_re;
  std::string_view m_subject;
  kphp::pcre2::match_context& m_ctx;
  PCRE2_SIZE m_current_offset{};
  kphp::pcre2::match_data& m_match_data;
  uint32_t m_user_options{};
  // extra options for the next attempt; non-zero only right after an empty match
  uint32_t m_match_options{};
  bool m_is_utf{false};

public:
  // `match_from` is the subject offset of the first attempt, `options` are added to every pcre2_match_8 call.
  matcher(const kphp::pcre2::regex& re, std::string_view subject, size_t match_from, kphp::pcre2::match_context& ctx, kphp::pcre2::match_data& data,
          uint32_t options = 0) noexcept
    : m_re{re},
      m_subject{subject},
      m_ctx{ctx},
      m_current_offset{match_from},
      m_match_data{data},
      m_user_options{options},
      m_is_utf{re.is_utf()} {}

  // Finds the next match.
  // Returns a match_view on success, std::nullopt when there are no more matches,
  // or an error holding the negative code returned by pcre2_match_8 (other than PCRE2_ERROR_NOMATCH).
  std::expected<std::optional<kphp::pcre2::match_view>, kphp::pcre2::error> next() noexcept {
    while (m_current_offset <= m_subject.length()) {
      const uint32_t attempt_options{m_user_options | m_match_options};

      const auto rc{pcre2_match_8(m_re.m_code.get(), reinterpret_cast<PCRE2_SPTR8>(m_subject.data()), m_subject.length(), m_current_offset, attempt_options,
                                  m_match_data.get(), m_ctx.get())};

      if (rc == PCRE2_ERROR_NOMATCH) {
        if (m_match_options == 0) {
          // an ordinary attempt found nothing: there are no more matches
          return std::nullopt;
        }
        // The anchored non-empty retry after an empty match failed:
        // step one unit forward (a whole rune in UTF mode) and search normally again
        m_match_options = 0;
        ++m_current_offset;
        if (m_is_utf) {
          m_current_offset = kphp::pcre2::details::skip_utf8_subsequent_bytes(m_current_offset, m_subject);
        }
        continue;
      }

      // From https://www.pcre.org/current/doc/html/pcre2_match.html
      // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set
      // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors.
      if (rc < 0) [[unlikely]] {
        return std::unexpected{error{.code = rc}};
      }

      // zero means the ovector was too small, so only as many pairs as it can hold are available
      const size_t groups_count{rc == 0 ? static_cast<size_t>(pcre2_get_ovector_count_8(m_match_data.get())) : static_cast<size_t>(rc)};

      const PCRE2_SIZE* offsets{pcre2_get_ovector_pointer_8(m_match_data.get())};
      const size_t match_begin{offsets[kphp::pcre2::details::offset_pair::START]};
      const size_t match_finish{offsets[kphp::pcre2::details::offset_pair::END]};

      // After an empty match, first try to find a non-empty match at the same position
      const bool is_empty_match{match_begin == match_finish};
      m_match_options = is_empty_match ? (PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED) : uint32_t{0};
      m_current_offset = match_finish;

      return kphp::pcre2::match_view{m_re, m_subject, m_match_data, attempt_options, groups_count};
    }

    return std::nullopt;
  }
};
344+
345+
} // namespace kphp::pcre2
346+
347+
// Formats a kphp::pcre2::error as the textual message returned by pcre2_get_error_message_8.
template<>
struct std::formatter<kphp::pcre2::error> {
  // No format specification is parsed: the iterator is returned untouched.
  template<typename ParseContext>
  constexpr auto parse(ParseContext& ctx) const noexcept {
    return ctx.begin();
  }

  // Writes the message for `error.code` to the output of `ctx`:
  //  - the PCRE2 message when it was obtained successfully;
  //  - "[truncated] <message>" when PCRE2 reported PCRE2_ERROR_NOMEMORY for the local buffer;
  //  - "unknown error (<code>)" when PCRE2 reported PCRE2_ERROR_BADDATA.
  template<typename FmtContext>
  auto format(kphp::pcre2::error error, FmtContext& ctx) const noexcept {
    static constexpr size_t ERROR_BUFFER_LENGTH{256};

    std::array<char, ERROR_BUFFER_LENGTH> buffer; // NOLINT
    const auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast<PCRE2_UCHAR8*>(buffer.data()), buffer.size())};
    if (ret_code < 0) [[unlikely]] {
      // std::format_to is qualified explicitly so that the call does not depend on ADL
      // through the type of the output iterator
      switch (ret_code) {
      case PCRE2_ERROR_BADDATA:
        return std::format_to(ctx.out(), "unknown error ({})", error.code);
      case PCRE2_ERROR_NOMEMORY:
        return std::format_to(ctx.out(), "[truncated] {}", buffer.data());
      default:
        // NOTE(review): kphp::log::error is presumably fatal; otherwise control would reach
        // the final return with an unfilled buffer — confirm
        kphp::log::error("unsupported regex error code: {}", ret_code);
      }
    }
    return std::format_to(ctx.out(), "{}", buffer.data());
  }
};

0 commit comments

Comments
 (0)