Skip to content

Commit 80d1a30

Browse files
committed
Regex Engine: Subexpression call: build multiple copies if needed
Do not recurse into subexpressions at the same level or below
1 parent 399d1a6 commit 80d1a30

File tree

12 files changed

+33
-4
lines changed

12 files changed

+33
-4
lines changed

src/lexer.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,7 @@ std::optional<Regexp> NLexer::_regexp() {
886886
advance(-1);
887887
// reset capture indices
888888
nested_index = 0;
889+
inside_index = 0;
889890
while (branch_reset_indices.size())
890891
branch_reset_indices.pop();
891892

@@ -1228,6 +1229,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
12281229
RegexpType::SubExprCall, (char)backrefnum,
12291230
regexp_debug_info(this, "\\g", 2)};
12301231
reg.subexprcall = backrefnum;
1232+
reg.inside_subexpr = inside_index;
12311233
return reg;
12321234
}
12331235
lexer_error(*this, Errors::InvalidRegexpSyntax, error_token(),
@@ -1463,6 +1465,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
14631465
// parenthesised expression
14641466
if (c == '(') {
14651467
nested_index++;
1468+
inside_index++;
14661469
std::optional<int> reset_branch{};
14671470
int branch = 0;
14681471
if (!branch_reset_indices.empty()) {
@@ -1492,6 +1495,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
14921495
}
14931496
}
14941497
advance(1); // consume ')'
1498+
inside_index--;
14951499
if (seen_newline) {
14961500
const Token &mtoken = error_token();
14971501
lexer_error(*this, Errors::InvalidRegexpSyntax, mtoken,
@@ -1511,6 +1515,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
15111515
// what _this_ index is
15121516
branch_reset_indices.push(nested_index);
15131517
auto reg = regexp();
1518+
inside_index--;
15141519
if (reset_branch.has_value()) {
15151520
nested_index = *reset_branch;
15161521
branch_reset_indices.push(branch);
@@ -1540,6 +1545,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
15401545
nested_index = *reset_branch;
15411546
branch_reset_indices.push(branch);
15421547
}
1548+
inside_index--;
15431549
return {};
15441550
}
15451551
if (c == '<' || c == '=') {
@@ -1550,6 +1556,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
15501556
nested_index = *reset_branch;
15511557
branch_reset_indices.push(branch);
15521558
}
1559+
inside_index--;
15531560
return {};
15541561
}
15551562
if (c == '>') {
@@ -1560,6 +1567,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
15601567
nested_index = *reset_branch;
15611568
branch_reset_indices.push(branch);
15621569
}
1570+
inside_index--;
15631571
return {};
15641572
}
15651573
if (c == ':') {
@@ -1572,6 +1580,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
15721580
if (!reg.has_value())
15731581
return reg;
15741582
auto &rv = reg.value();
1583+
inside_index--;
15751584
c = *source_p;
15761585
if (c != ')') {
15771586
const Token &mtoken = error_token();
@@ -1589,6 +1598,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
15891598
}
15901599
auto my_index = nested_index;
15911600
auto reg = regexp();
1601+
inside_index--;
15921602
if (reset_branch.has_value()) {
15931603
nested_index = *reset_branch;
15941604
branch_reset_indices.push(branch);
@@ -2068,6 +2078,7 @@ Regexp::compile(std::multimap<const Regexp *, NFANode<std::string> *> &cache,
20682078
NFANode<std::string> *tl = new NFANode<std::string>{"R<>" + mangle()};
20692079
parent->epsilon_transition_to(tl);
20702080
tl->subexpr_call = subexprcall;
2081+
tl->inside_subexpr = inside_subexpr;
20712082
tl->named_rule = namef;
20722083
result = tl;
20732084
result->debug_info = debug_info;

src/lexer.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ class NLexer {
209209
int offset;
210210
char buffer[1024000];
211211
int nested_index = 0;
212+
int inside_index = 0;
212213
/// If there's anything on this, reset index to it
213214
/// when matching alternatives
214215
std::stack<int> branch_reset_indices{};

src/nfa.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ template <typename StateInfoT> class DFANode {
7070
std::vector<RegexpAssertion> assertions = {};
7171
std::set<int> subexpr_idxs = {};
7272
std::set<int> subexpr_end_idxs = {};
73+
int inside_subexpr = -1;
7374
std::optional<int> backreference{};
7475
int subexpr_call = -1;
7576
bool subexpr_recurses = false;
@@ -128,6 +129,7 @@ template <typename StateInfoT> class NFANode {
128129
subexpr_end = false, reference_node = false;
129130
int max_opt_steps = 50;
130131
int opt_step = max_opt_steps;
132+
int inside_subexpr = -1;
131133

132134
std::optional<std::string> inline_code =
133135
{}; // code that would be executed should this node match
@@ -150,7 +152,6 @@ template <typename StateInfoT> class NFANode {
150152
int subexpr_idx = -1;
151153
int subexpr_end_idx = -1;
152154
int subexpr_call = -1;
153-
bool subexpr_recurses = false;
154155

155156
NFANode(StateInfoT s) : state_info(s) {}
156157
NFANode() {}

src/parser.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ inline static void parser_error_impl(char const *fmt, va_list arg) {
4141
std::vprintf(fmt, arg);
4242
}
4343

44+
static int sexpr_being_built = 0;
45+
4446
char *parser_errors[(int)ParserErrors::LAST - 10] = {
4547
[(int)ParserErrors::InvalidToken - 11] = "Invalid token",
4648
[(int)ParserErrors::FeatureUnsupported - 11] = "Unsupported feature",
@@ -1298,6 +1300,7 @@ template <typename T> DFANode<std::set<NFANode<T> *>> *NFANode<T>::to_dfa() {
12981300
}
12991301
dfanode->subexpr_recurses =
13001302
dfanode->subexpr_call <= dfanode->subexpr_end_idxs.size();
1303+
dfanode->inside_subexpr = s->inside_subexpr;
13011304
dfanode->subexpr_call = s->subexpr_call;
13021305
}
13031306
if (s->backreference.has_value()) {
@@ -1539,6 +1542,8 @@ void DFANLVMCodeGenerator<T>::generate(
15391542
for (auto subexpr_idx : node->subexpr_idxs) {
15401543
if (!subexprFunc.count(subexpr_idx))
15411544
continue;
1545+
int sbb = sexpr_being_built;
1546+
sexpr_being_built = subexpr_idx;
15421547
decltype(visited) _visited;
15431548
typename std::remove_reference<decltype(blk)>::type _blocks;
15441549
auto scope = subexprFunc[subexpr_idx];
@@ -1585,6 +1590,7 @@ void DFANLVMCodeGenerator<T>::generate(
15851590
dbuilder.CreateCondBr(matched, mroot, builder.module.BBfinalise);
15861591

15871592
builder.module.exit_main();
1593+
sexpr_being_built = sbb;
15881594
}
15891595
}
15901596
builder.issubexp = wasub;
@@ -1913,7 +1919,7 @@ void DFANLVMCodeGenerator<T>::generate(
19131919
}
19141920
// if there is a subexpr call, create it now
19151921
if (node->subexpr_call > -1 &&
1916-
(node->subexpr_recurses || node->subexpr_call > subexprFunc.size())) {
1922+
(node->subexpr_recurses || sexpr_being_built < node->subexpr_call)) {
19171923
llvm::Function *fn;
19181924
auto val = builder.module.current_main()->arg_begin();
19191925
if (subexprFunc.count(node->subexpr_call))

src/regexp.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class Regexp {
5252
bool plus = false, star = false, lazy = false, store = false;
5353
int index = 0; // applies for nested and backref (escape)
5454
int subexprcall = -1; // applies for SubExprCall
55+
int inside_subexpr = -1; // applies for SubExprCall
5556

5657
std::optional<RepeatQuantifier> repeat;
5758

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
testhelloooohellotest

tests/list-tests

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99
0009-pos
1010
0010-pl
1111
0011-subexpr
12+
0012-subexpr-expr

tests/outputs/0008-literal-match.stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
res at 0x7ffe8df3e678, s at 0x7ff892c2e010
1+
res at 0x7ffc26eda248, s at 0x7fc1109a1010
22
processing - 'HELP😒
33
'
44
match {'HELP' - (null) - 4 literal} is a stopword

tests/outputs/0011-subexpr.stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
res at 0x7ffdeebc6688, s at 0x7fcaae9ea010
1+
res at 0x7ffd5f1a3468, s at 0x7ff8bcfe3010
22
processing - 'testtest'
33
match {'testtest' - (null) - 8 expr} is not a stopword
44
no match {'' - (null) - 0 expr} is not a stopword

tests/outputs/0012-subexpr-expr.stderr

Whitespace-only changes.

0 commit comments

Comments
 (0)