Skip to content

Commit acaaac0

Browse files
committed
[GR-10788] Support for Python regex syntax.
PullRequest: graalpython/191
2 parents 374a9eb + f73dba5 commit acaaac0

File tree

9 files changed

+315
-299
lines changed

9 files changed

+315
-299
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ language runtime. The main focus is on user-observable behavior of the engine.
88
* Support `help` in the builtin Python shell
99
* Add `readline` to enable history and autocompletion in the Python shell
1010
* Improve display of foreign array-like objects
11+
* Improve support for string and bytes regular expressions using our TRegex engine
1112

1213
## Version 1.0.0 RC8
1314

graalpython/com.oracle.graal.python.test/src/tests/test_re.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def __getitem__(self, index):
8585

8686
class B(bytes):
8787
def __getitem__(self, index):
88-
return B(super().__getitem__(index))
88+
return super().__getitem__(index)
8989

9090

9191
class ReTests(unittest.TestCase):
@@ -159,8 +159,13 @@ def test_basic_re_sub(self):
159159
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
160160
(chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7) + chr(8)))
161161

162-
for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
163-
self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
162+
# The following behavior is correct w.r.t. Python 3.7. However, currently
163+
# the gate uses CPython 3.4.1 to validate the test suite,
164+
# which does not pass this test case, so we have to skip.
165+
# for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
166+
# with self.assertRaises(re.error):
167+
# self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
168+
164169

165170
self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
166171

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/SREModuleBuiltins.java

Lines changed: 57 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,20 @@
4242

4343
import static com.oracle.graal.python.runtime.exception.PythonErrorType.RuntimeError;
4444
import static com.oracle.graal.python.runtime.exception.PythonErrorType.TypeError;
45+
import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError;
4546

4647
import java.io.UnsupportedEncodingException;
4748
import java.util.List;
48-
import java.util.regex.Matcher;
4949
import java.util.regex.Pattern;
5050

5151
import com.oracle.graal.python.builtins.Builtin;
5252
import com.oracle.graal.python.builtins.CoreFunctions;
5353
import com.oracle.graal.python.builtins.PythonBuiltins;
54+
import com.oracle.graal.python.builtins.objects.bytes.BytesNodes;
5455
import com.oracle.graal.python.builtins.objects.bytes.BytesUtils;
5556
import com.oracle.graal.python.builtins.objects.bytes.PIBytesLike;
5657
import com.oracle.graal.python.builtins.objects.common.SequenceStorageNodes;
58+
import com.oracle.graal.python.builtins.objects.memoryview.PMemoryView;
5759
import com.oracle.graal.python.builtins.objects.str.PString;
5860
import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode;
5961
import com.oracle.graal.python.nodes.function.PythonBuiltinNode;
@@ -78,6 +80,7 @@
7880
import com.oracle.truffle.api.interop.UnsupportedTypeException;
7981
import com.oracle.truffle.api.nodes.Node;
8082
import com.oracle.truffle.api.profiles.BranchProfile;
83+
import com.oracle.truffle.regex.RegexSyntaxException;
8184

8285
@CoreFunctions(defineModule = "_sre")
8386
public class SREModuleBuiltins extends PythonBuiltins {
@@ -86,106 +89,6 @@ protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFa
8689
return SREModuleBuiltinsFactory.getFactories();
8790
}
8891

89-
@Builtin(name = "tregex_preprocess_for_verbose", fixedNumOfPositionalArgs = 1)
90-
@GenerateNodeFactory
91-
abstract static class TRegexPreprocessVerboseNode extends PythonUnaryBuiltinNode {
92-
93-
@Specialization
94-
Object run(PString str) {
95-
return run(str.getValue());
96-
}
97-
98-
@Specialization
99-
Object run(String str) {
100-
return replaceAll(str);
101-
}
102-
103-
/**
104-
* Removes comments and whitespace characters unless they are inside a character class.
105-
*/
106-
@TruffleBoundary(transferToInterpreterOnException = false, allowInlining = true)
107-
private static String replaceAll(String r) {
108-
StringBuffer sb = new StringBuffer(r);
109-
int charclassNestingLevel = 0;
110-
boolean inComment = false;
111-
for (int i = 0; i < sb.length();) {
112-
char c = sb.charAt(i);
113-
if (c == '[' && !inComment) {
114-
charclassNestingLevel++;
115-
} else if (c == ']' && !inComment) {
116-
charclassNestingLevel--;
117-
} else if (c == '#' && charclassNestingLevel == 0) {
118-
inComment = true;
119-
} else if (c == '\n' && inComment) {
120-
inComment = false;
121-
}
122-
if (inComment || (Character.isWhitespace(c) && charclassNestingLevel == 0)) {
123-
sb.deleteCharAt(i);
124-
} else {
125-
i++;
126-
}
127-
}
128-
129-
for (int idx = sb.indexOf("\\Z"); idx != -1; idx = sb.indexOf("\\Z", idx + 2)) {
130-
sb.replace(idx, idx + 2, "$");
131-
}
132-
133-
return sb.toString();
134-
}
135-
136-
@Fallback
137-
Object run(Object o) {
138-
throw raise(PythonErrorType.TypeError, "expected string, not %p", o);
139-
}
140-
141-
}
142-
143-
@Builtin(name = "tregex_preprocess_default", fixedNumOfPositionalArgs = 1)
144-
@GenerateNodeFactory
145-
abstract static class TRegexPreprocessDefaultNode extends PythonUnaryBuiltinNode {
146-
@CompilationFinal private Pattern namedCaptGroupPattern;
147-
148-
@Specialization
149-
Object run(PString str) {
150-
return run(str.getValue());
151-
}
152-
153-
@Specialization
154-
Object run(String str) {
155-
if (namedCaptGroupPattern == null) {
156-
CompilerDirectives.transferToInterpreterAndInvalidate();
157-
namedCaptGroupPattern = Pattern.compile("\\?P\\<(?<GRPNAME>\\w*)\\>");
158-
}
159-
return replaceAll(str);
160-
}
161-
162-
/**
163-
* replaces named capturing groups {@code ?P<name>} by {@code ?<name>} and replaces
164-
* end-of-string {@code \Z} by {@code $}.
165-
*/
166-
@TruffleBoundary(transferToInterpreterOnException = false, allowInlining = true)
167-
private String replaceAll(String r) {
168-
Matcher matcher0 = namedCaptGroupPattern.matcher(r);
169-
StringBuffer sb = new StringBuffer();
170-
while (matcher0.find()) {
171-
matcher0.appendReplacement(sb, "?<" + matcher0.group("GRPNAME") + ">");
172-
}
173-
matcher0.appendTail(sb);
174-
175-
for (int idx = sb.indexOf("\\Z"); idx != -1; idx = sb.indexOf("\\Z", idx + 2)) {
176-
sb.replace(idx, idx + 2, "$");
177-
}
178-
179-
return sb.toString();
180-
}
181-
182-
@Fallback
183-
Object run(Object o) {
184-
throw raise(PythonErrorType.TypeError, "expected string, not %p", o);
185-
}
186-
187-
}
188-
18992
/**
19093
* Replaces any <i>quoted</i> escape sequence like {@code "\\n"} (two characters; backslash +
19194
* 'n') by its single character like {@code "\n"} (one character; newline).
@@ -195,6 +98,7 @@ Object run(Object o) {
19598
abstract static class ProcessEscapeSequences extends PythonUnaryBuiltinNode {
19699

197100
@Child private SequenceStorageNodes.ToByteArrayNode toByteArrayNode;
101+
@Child private BytesNodes.ToBytesNode toBytesNode;
198102

199103
@CompilationFinal private Pattern namedCaptGroupPattern;
200104

@@ -222,6 +126,15 @@ Object run(PIBytesLike str) {
222126
return str;
223127
}
224128

129+
@Specialization
130+
Object run(PMemoryView memoryView) {
131+
byte[] bytes = doBytes(getToBytesNode().execute(memoryView));
132+
if (bytes != null) {
133+
return factory().createByteArray(bytes);
134+
}
135+
return memoryView;
136+
}
137+
225138
@TruffleBoundary(transferToInterpreterOnException = false, allowInlining = true)
226139
private byte[] doBytes(byte[] str) {
227140
try {
@@ -255,47 +168,72 @@ private SequenceStorageNodes.ToByteArrayNode getToByteArrayNode() {
255168
return toByteArrayNode;
256169
}
257170

171+
private BytesNodes.ToBytesNode getToBytesNode() {
172+
if (toBytesNode == null) {
173+
CompilerDirectives.transferToInterpreterAndInvalidate();
174+
toBytesNode = insert(BytesNodes.ToBytesNode.create());
175+
}
176+
return toBytesNode;
177+
}
258178
}
259179

260-
@Builtin(name = "tregex_call_safe", fixedNumOfPositionalArgs = 3)
180+
@Builtin(name = "tregex_call_compile", fixedNumOfPositionalArgs = 3)
261181
@TypeSystemReference(PythonArithmeticTypes.class)
262182
@GenerateNodeFactory
263-
abstract static class TRegexCallSafe extends PythonBuiltinNode {
183+
abstract static class TRegexCallCompile extends PythonBuiltinNode {
264184

265-
private Object doIt(TruffleObject callable, String arg1, Object arg2,
266-
BranchProfile runtimeError,
267-
BranchProfile typeError, Node invokeNode) {
185+
@Specialization(guards = "isForeignObject(callable)")
186+
Object call(TruffleObject callable, Object arg1, Object arg2,
187+
@Cached("create()") BranchProfile syntaxError,
188+
@Cached("create()") BranchProfile typeError,
189+
@Cached("createExecute()") Node invokeNode) {
268190
try {
269191
return ForeignAccess.sendExecute(invokeNode, callable, new Object[]{arg1, arg2});
270192
} catch (ArityException | UnsupportedTypeException | UnsupportedMessageException e) {
271193
typeError.enter();
272194
throw raise(TypeError, "%s", e);
273-
} catch (RuntimeException e) {
274-
runtimeError.enter();
275-
throw raise(RuntimeError, "%s", e);
195+
} catch (RegexSyntaxException e) {
196+
syntaxError.enter();
197+
if (e.getPosition() == -1) {
198+
throw raise(ValueError, "%s", e.getReason());
199+
} else {
200+
throw raise(ValueError, "%s at position %d", e.getReason(), e.getPosition());
201+
}
276202
}
277203
}
278204

279-
@Specialization(guards = "isForeignObject(callable)")
280-
Object call(TruffleObject callable, String arg1, String arg2,
281-
@Cached("create()") BranchProfile runtimeError,
282-
@Cached("create()") BranchProfile typeError,
283-
@Cached("createExecute()") Node invokeNode) {
284-
return doIt(callable, arg1, arg2, runtimeError, typeError, invokeNode);
205+
@SuppressWarnings("unused")
206+
@Fallback
207+
Object call(Object callable, Object arg1, Object arg2) {
208+
throw raise(RuntimeError, "invalid arguments passed to tregex_call_compile");
209+
}
210+
211+
protected static Node createExecute() {
212+
return Message.EXECUTE.createNode();
285213
}
214+
}
215+
216+
@Builtin(name = "tregex_call_exec", fixedNumOfPositionalArgs = 3)
217+
@TypeSystemReference(PythonArithmeticTypes.class)
218+
@GenerateNodeFactory
219+
abstract static class TRegexCallExec extends PythonBuiltinNode {
286220

287221
@Specialization(guards = "isForeignObject(callable)")
288-
Object call(TruffleObject callable, String arg1, int arg2,
289-
@Cached("create()") BranchProfile runtimeError,
222+
Object call(TruffleObject callable, Object arg1, Number arg2,
290223
@Cached("create()") BranchProfile typeError,
291224
@Cached("createExecute()") Node invokeNode) {
292-
return doIt(callable, arg1, arg2, runtimeError, typeError, invokeNode);
225+
try {
226+
return ForeignAccess.sendExecute(invokeNode, callable, new Object[]{arg1, arg2});
227+
} catch (ArityException | UnsupportedTypeException | UnsupportedMessageException e) {
228+
typeError.enter();
229+
throw raise(TypeError, "%s", e);
230+
}
293231
}
294232

295233
@SuppressWarnings("unused")
296234
@Fallback
297235
Object call(Object callable, Object arg1, Object arg2) {
298-
throw raise(RuntimeError);
236+
throw raise(RuntimeError, "invalid arguments passed to tregex_call_exec");
299237
}
300238

301239
protected static Node createExecute() {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,9 @@ public static StringBuilder decodeEscapes(ParserErrorCallback errors, String str
199199
}
200200
throw errors.raise(ValueError, "invalid \\x escape at position %d", i);
201201
default:
202-
if (regexMode) {
202+
if (regexMode && (chr == '\\' || chr == 'g' || (chr >= '0' && chr <= '9'))) {
203+
// only allow backslashes, named group references and numbered group
204+
// references in regex mode
203205
charList.append('\\');
204206
charList.append(chr);
205207
} else {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/builtins/ListNodes.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ public SequenceStorage execute(Object iterator) {
152152
int value = next.executeInt(iterator);
153153
if (i >= elements.length) {
154154
elements = Arrays.copyOf(elements, elements.length * 2);
155+
array = elements;
155156
}
156157
elements[i++] = value;
157158
} catch (PException e) {
@@ -170,6 +171,7 @@ public SequenceStorage execute(Object iterator) {
170171
long value = next.executeLong(iterator);
171172
if (i >= elements.length) {
172173
elements = Arrays.copyOf(elements, elements.length * 2);
174+
array = elements;
173175
}
174176
elements[i++] = value;
175177
} catch (PException e) {
@@ -188,6 +190,7 @@ public SequenceStorage execute(Object iterator) {
188190
double value = next.executeDouble(iterator);
189191
if (i >= elements.length) {
190192
elements = Arrays.copyOf(elements, elements.length * 2);
193+
array = elements;
191194
}
192195
elements[i++] = value;
193196
} catch (PException e) {
@@ -206,6 +209,7 @@ public SequenceStorage execute(Object iterator) {
206209
PList value = PList.expect(next.execute(iterator));
207210
if (i >= elements.length) {
208211
elements = Arrays.copyOf(elements, elements.length * 2);
212+
array = elements;
209213
}
210214
elements[i++] = value;
211215
} catch (PException e) {
@@ -224,6 +228,7 @@ public SequenceStorage execute(Object iterator) {
224228
PTuple value = PTuple.expect(next.execute(iterator));
225229
if (i >= elements.length) {
226230
elements = Arrays.copyOf(elements, elements.length * 2);
231+
array = elements;
227232
}
228233
elements[i++] = value;
229234
} catch (PException e) {

0 commit comments

Comments
 (0)