@@ -70,7 +70,7 @@ protected List<? extends NodeFactory<? extends PythonBuiltinNode>> getNodeFactor
70
70
@ Builtin (name = "tregex_preprocess" , fixedNumOfArguments = 1 )
71
71
@ GenerateNodeFactory
72
72
abstract static class TregexPreprocessNode extends PythonUnaryBuiltinNode {
73
- @ CompilationFinal private Pattern commentPattern ;
73
+ @ CompilationFinal private Pattern namedCaptGroupPattern ;
74
74
75
75
@ Specialization
76
76
Object run (PString str ) {
@@ -79,30 +79,51 @@ Object run(PString str) {
79
79
80
80
// Specialization for plain Java strings: compiles the named-group pattern lazily on the
// first call, then hands the input to replaceAll, which performs the actual rewriting.
@ Specialization
81
81
Object run (String str ) {
82
- if (commentPattern == null ) {
82
+ if (namedCaptGroupPattern == null ) {
83
83
// The field is @CompilationFinal, so compiled code is invalidated before it is written.
CompilerDirectives .transferToInterpreterAndInvalidate ();
84
- commentPattern = Pattern .compile ("(#[^ \\ ]]* \n ) " );
84
// The group name GRPNAME captures the identifier of a "?P<name>" group opener; replaceAll
// reads it back when it rewrites the opener.
+ namedCaptGroupPattern = Pattern .compile ("\\ ?P \\ <(?<GRPNAME> \\ w*) \\ > " );
85
85
}
86
86
return replaceAll (str );
87
87
}
88
88
89
- @ TruffleBoundary
89
+ /**
90
+ * replaces named capturing groups {@code ?P<name>} by {@code ?<name>}, removes comments and
91
+ * whitespaces if they are not in a character class, and replaces end-of-string {@code \Z}
92
+ * by {@code $}.
93
+ */
94
+ @ TruffleBoundary (transferToInterpreterOnException = false , allowInlining = true )
90
95
private String replaceAll (String r ) {
91
- Matcher matcher = commentPattern .matcher (r );
92
- String res = matcher .replaceAll ("" );
93
- StringBuilder sb = new StringBuilder ();
96
+ Matcher matcher0 = namedCaptGroupPattern .matcher (r );
97
+ StringBuffer sb = new StringBuffer ();
98
+ while (matcher0 .find ()) {
99
+ matcher0 .appendReplacement (sb , "?<" + matcher0 .group ("GRPNAME" ) + ">" );
100
+ }
101
+ matcher0 .appendTail (sb );
102
+
94
103
int charclassNestingLevel = 0 ;
95
- for (int i = 0 ; i < res .length (); i ++) {
96
- char c = res .charAt (i );
97
- if (c == '[' ) {
104
+ boolean inComment = false ;
105
+ for (int i = 0 ; i < sb .length ();) {
106
+ char c = sb .charAt (i );
107
+ if (c == '[' && !inComment ) {
98
108
charclassNestingLevel ++;
99
- } else if (c == ']' ) {
109
+ } else if (c == ']' && ! inComment ) {
100
110
charclassNestingLevel --;
111
+ } else if (c == '#' && charclassNestingLevel == 0 ) {
112
+ inComment = true ;
113
+ } else if (c == '\n' && inComment ) {
114
+ inComment = false ;
101
115
}
102
- if (!Character .isWhitespace (c ) || charclassNestingLevel != 0 ) {
103
- sb .append (res .charAt (i ));
116
+ if (inComment || (Character .isWhitespace (c ) && charclassNestingLevel == 0 )) {
117
+ sb .deleteCharAt (i );
118
+ } else {
119
+ i ++;
104
120
}
105
121
}
122
+
123
+ for (int idx = sb .indexOf ("\\ Z" ); idx != -1 ; idx = sb .indexOf ("\\ Z" , idx + 2 )) {
124
+ sb .replace (idx , idx + 2 , "$" );
125
+ }
126
+
106
127
return sb .toString ();
107
128
}
108
129
0 commit comments