Skip to content

Commit e8a5cd7

Browse files
authored
Add folding and simplification for OP_ECLASS (#586)
Fixes #537
1 parent 6ef4fee commit e8a5cd7

23 files changed

+2313
-966
lines changed

.github/workflows/dev.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ jobs:
6868

6969
- name: Test (main test script)
7070
run: |
71-
ulimit -S -s 32768 # Raise stack limit; ASAN with -O0 is very stack-hungry
71+
ulimit -S -s 49152 # Raise stack limit; ASAN with -O0 is very stack-hungry
7272
./RunTest
7373
7474
- name: Test (JIT test program)

HACKING

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,43 @@ When XCL_NOT is set, the bit map, if present, contains bits for characters that
633633
are allowed (exactly as for OP_NCLASS), but the list of items that follow it
634634
specifies characters and properties that are not allowed.
635635

636+
The meaning of the bitmap indicated by XCL_MAP is that, if one is present, then
637+
it fully describes which code points < 256 match the class (without needing to
638+
invert the check according to XCL_NOT); the other items in the OP_XCLASS need
639+
not be consulted. However, if a bitmap is not present, then code points < 256
640+
may still match, so the other items in the OP_XCLASS must be consulted.
641+
642+
For classes containing logical expressions, such as "[\p{Greek} && \p{Lu}]" for
642+
"uppercase Greek letters", OP_ECLASS is used. The expression is encoded as a
644+
stack-based series of operands and operators, in Reverse Polish Notation. Like
645+
an OP_XCLASS, the OP_ECLASS is first followed by a LINK_SIZE value containing
646+
the total length of the opcode and its data. That is followed by a code unit
647+
containing flags: currently just ECL_MAP indicating that a bit map is present.
648+
There follows the bit map, if ECL_MAP is set. Finally, there is a sequence of
649+
items that are either an operand or operator. Each item starts with a single
650+
code unit containing its type:
651+
652+
ECL_AND AND; no additional data
653+
ECL_OR OR; no additional data
654+
ECL_XOR XOR; no additional data
655+
ECL_NOT NOT; no additional data
656+
ECL_XCLASS The additional data which follows ECL_XCLASS is the same as for
657+
an OP_XCLASS, except that this data is preceded by ECL_XCLASS
658+
rather than OP_XCLASS.
659+
Because the OP_ECLASS has its own bitmap (if required), an
660+
ECL_XCLASS should not contain a bitmap.
661+
662+
Additionally, there are two intermediate values used during compilation, but
663+
these are folded away during generation of the opcode, and so never appear
664+
inside an OP_ECLASS at match time. They are:
665+
666+
ECL_ANY match all characters; no additional data
667+
ECL_NONE match no characters; no additional data
668+
669+
The meaning of the bitmap indicated by ECL_MAP is different to that of XCL_MAP
670+
for OP_XCLASS, in one way. The ECL_MAP bitmap is present whenever any code
671+
points < 256 match the class.
672+
636673

637674
Back references
638675
---------------

doc/pcre2test.1

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,7 @@ it is preferred to use \eN{U+hh...} when describing characters. When testing
524524
the 8-bit library not in UTF-8 mode, \ex{hh} generates one byte for values
525525
that could fit on it, and causes an error for greater values.
526526
.P
527-
When testing te 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh}
527+
When testing the 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh}
528528
values are accepted. This makes it possible to construct invalid UTF-16
529529
sequences for testing purposes.
530530
.P

src/pcre2_auto_possess.c

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -480,13 +480,13 @@ switch(c)
480480

481481
case OP_NCLASS:
482482
case OP_CLASS:
483+
#ifdef SUPPORT_WIDE_CHARS
483484
case OP_XCLASS:
484485
case OP_ECLASS:
485-
/* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537
486-
Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */
487486
if (c == OP_XCLASS || c == OP_ECLASS)
488487
end = code + GET(code, 0) - 1;
489488
else
489+
#endif
490490
end = code + 32 / sizeof(PCRE2_UCHAR);
491491
class_end = end;
492492

@@ -1118,17 +1118,15 @@ for(;;)
11181118
list_ptr[2] + LINK_SIZE, (const uint8_t*)cb->start_code, utf))
11191119
return FALSE;
11201120
break;
1121-
#endif
11221121

1123-
/* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537
1124-
Enclose in "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */
11251122
case OP_ECLASS:
11261123
if (PRIV(eclass)(chr,
11271124
(list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE,
11281125
(list_ptr == list ? code : base_end) - list_ptr[3],
11291126
(const uint8_t*)cb->start_code, utf))
11301127
return FALSE;
11311128
break;
1129+
#endif /* SUPPORT_WIDE_CHARS */
11321130

11331131
default:
11341132
return FALSE;
@@ -1236,13 +1234,17 @@ for (;;)
12361234
}
12371235
c = *code;
12381236
}
1239-
else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS || c == OP_ECLASS)
1237+
else if (c == OP_CLASS || c == OP_NCLASS
1238+
#ifdef SUPPORT_WIDE_CHARS
1239+
|| c == OP_XCLASS || c == OP_ECLASS
1240+
#endif
1241+
)
12401242
{
1241-
/* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537
1242-
Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */
1243+
#ifdef SUPPORT_WIDE_CHARS
12431244
if (c == OP_XCLASS || c == OP_ECLASS)
12441245
repeat_opcode = code + GET(code, 1);
12451246
else
1247+
#endif
12461248
repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
12471249

12481250
c = *repeat_opcode;
@@ -1315,12 +1317,12 @@ for (;;)
13151317
code += GET(code, 1 + 2*LINK_SIZE);
13161318
break;
13171319

1318-
/* TODO: [EC] https://github.com/PCRE2Project/pcre2/issues/537
1319-
Add back the "ifdef SUPPORT_WIDE_CHARS" once we stop emitting ECLASS for this case. */
1320-
case OP_ECLASS:
1320+
#ifdef SUPPORT_WIDE_CHARS
13211321
case OP_XCLASS:
1322+
case OP_ECLASS:
13221323
code += GET(code, 1);
13231324
break;
1325+
#endif
13241326

13251327
case OP_MARK:
13261328
case OP_COMMIT_ARG:

0 commit comments

Comments
 (0)