Skip to content

Commit e9f24f4

Browse files
committed
Improve regex preprocessing: strip whitespace outside character classes.
1 parent 59b01c5 commit e9f24f4

File tree

2 files changed

+21
-9
lines changed

2 files changed

+21
-9
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/SREModuleBuiltins.java

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
*/
3939
package com.oracle.graal.python.builtins.modules;
4040

41-
import java.util.ArrayList;
4241
import java.util.List;
4342
import java.util.regex.Matcher;
4443
import java.util.regex.Pattern;
@@ -62,7 +61,7 @@
6261
public class SREModuleBuiltins extends PythonBuiltins {
6362
@Override
6463
protected List<? extends NodeFactory<? extends PythonBuiltinNode>> getNodeFactories() {
65-
return new ArrayList<>();
64+
return SREModuleBuiltinsFactory.getFactories();
6665
}
6766

6867
/**
@@ -71,7 +70,7 @@ protected List<? extends NodeFactory<? extends PythonBuiltinNode>> getNodeFactor
7170
@Builtin(name = "tregex_preprocess", fixedNumOfArguments = 1)
7271
@GenerateNodeFactory
7372
abstract static class TregexPreprocessNode extends PythonUnaryBuiltinNode {
74-
@CompilationFinal private Pattern pattern;
73+
@CompilationFinal private Pattern commentPattern;
7574

7675
@Specialization
7776
Object run(PString str) {
@@ -80,18 +79,31 @@ Object run(PString str) {
8079

8180
@Specialization
8281
Object run(String str) {
83-
str.replaceAll("[^\\[]?#[^\\]]*\n", "");
84-
if (pattern == null) {
82+
if (commentPattern == null) {
8583
CompilerDirectives.transferToInterpreterAndInvalidate();
86-
pattern = Pattern.compile("(?<CMT>#[^\\]]*\n)");
84+
commentPattern = Pattern.compile("(#[^\\]]*\n)");
8785
}
8886
return replaceAll(str);
8987
}
9088

9189
@TruffleBoundary
9290
private String replaceAll(String r) {
93-
Matcher matcher = pattern.matcher(r);
94-
return matcher.replaceAll("");
91+
Matcher matcher = commentPattern.matcher(r);
92+
String res = matcher.replaceAll("");
93+
StringBuilder sb = new StringBuilder();
94+
int charclassNestingLevel = 0;
95+
for (int i = 0; i < res.length(); i++) {
96+
char c = res.charAt(i);
97+
if (c == '[') {
98+
charclassNestingLevel++;
99+
} else if (c == ']') {
100+
charclassNestingLevel--;
101+
}
102+
if (!Character.isWhitespace(c) || charclassNestingLevel != 0) {
103+
sb.append(res.charAt(i));
104+
}
105+
}
106+
return sb.toString();
95107
}
96108

97109
@Fallback

graalpython/lib-graalpython/_sre.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _decode_string(self, string, flags):
128128
# TODO: that's not nearly complete but should be sufficient for now
129129
from sre_compile import SRE_FLAG_VERBOSE
130130
if flags & SRE_FLAG_VERBOSE:
131-
pattern = _sre.tregex_preprocess(pattern)
131+
pattern = tregex_preprocess(pattern)
132132
return pattern
133133

134134

0 commit comments

Comments
 (0)