Skip to content

Commit 32ca115

Browse files
committed
Improve regex preprocessing: replace end-of-string '\Z' by '$'.
1 parent e9f24f4 commit 32ca115

File tree

2 files changed

+35
-14
lines changed

2 files changed

+35
-14
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/SREModuleBuiltins.java

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ protected List<? extends NodeFactory<? extends PythonBuiltinNode>> getNodeFactor
7070
@Builtin(name = "tregex_preprocess", fixedNumOfArguments = 1)
7171
@GenerateNodeFactory
7272
abstract static class TregexPreprocessNode extends PythonUnaryBuiltinNode {
73-
@CompilationFinal private Pattern commentPattern;
73+
@CompilationFinal private Pattern namedCaptGroupPattern;
7474

7575
@Specialization
7676
Object run(PString str) {
@@ -79,30 +79,51 @@ Object run(PString str) {
7979

8080
@Specialization
8181
Object run(String str) {
82-
if (commentPattern == null) {
82+
if (namedCaptGroupPattern == null) {
8383
CompilerDirectives.transferToInterpreterAndInvalidate();
84-
commentPattern = Pattern.compile("(#[^\\]]*\n)");
84+
namedCaptGroupPattern = Pattern.compile("\\?P\\<(?<GRPNAME>\\w*)\\>");
8585
}
8686
return replaceAll(str);
8787
}
8888

89-
@TruffleBoundary
89+
/**
90+
* Replaces named capturing groups {@code ?P<name>} by {@code ?<name>}, removes comments and
91+
* whitespace if they are not in a character class, and replaces end-of-string {@code \Z}
92+
* by {@code $}.
93+
*/
94+
@TruffleBoundary(transferToInterpreterOnException = false, allowInlining = true)
9095
private String replaceAll(String r) {
91-
Matcher matcher = commentPattern.matcher(r);
92-
String res = matcher.replaceAll("");
93-
StringBuilder sb = new StringBuilder();
96+
Matcher matcher0 = namedCaptGroupPattern.matcher(r);
97+
StringBuffer sb = new StringBuffer();
98+
while (matcher0.find()) {
99+
matcher0.appendReplacement(sb, "?<" + matcher0.group("GRPNAME") + ">");
100+
}
101+
matcher0.appendTail(sb);
102+
94103
int charclassNestingLevel = 0;
95-
for (int i = 0; i < res.length(); i++) {
96-
char c = res.charAt(i);
97-
if (c == '[') {
104+
boolean inComment = false;
105+
for (int i = 0; i < sb.length();) {
106+
char c = sb.charAt(i);
107+
if (c == '[' && !inComment) {
98108
charclassNestingLevel++;
99-
} else if (c == ']') {
109+
} else if (c == ']' && !inComment) {
100110
charclassNestingLevel--;
111+
} else if (c == '#' && charclassNestingLevel == 0) {
112+
inComment = true;
113+
} else if (c == '\n' && inComment) {
114+
inComment = false;
101115
}
102-
if (!Character.isWhitespace(c) || charclassNestingLevel != 0) {
103-
sb.append(res.charAt(i));
116+
if (inComment || (Character.isWhitespace(c) && charclassNestingLevel == 0)) {
117+
sb.deleteCharAt(i);
118+
} else {
119+
i++;
104120
}
105121
}
122+
123+
for (int idx = sb.indexOf("\\Z"); idx != -1; idx = sb.indexOf("\\Z", idx + 2)) {
124+
sb.replace(idx, idx + 2, "$");
125+
}
126+
106127
return sb.toString();
107128
}
108129

graalpython/lib-graalpython/_sre.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def __init__(self, pattern, flags, code, groups=0, groupindex=None, indexgroup=N
115115
jsflags.append(jsflag)
116116
self.jsflags = "".join(jsflags)
117117

118-
def _decode_string(self, string, flags):
118+
def _decode_string(self, string, flags=0):
119119
if isinstance(string, str):
120120
pattern = string
121121
elif isinstance(string, bytes):

0 commit comments

Comments
 (0)