Skip to content

Commit 5f88356

Browse files
committed
Implement backslashreplace error handler
1 parent a8e9257 commit 5f88356

File tree

2 files changed

+101
-20
lines changed

2 files changed

+101
-20
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 90 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
import com.oracle.graal.python.builtins.CoreFunctions;
6060
import com.oracle.graal.python.builtins.PythonBuiltins;
6161
import com.oracle.graal.python.builtins.objects.PNone;
62+
import com.oracle.graal.python.builtins.objects.bytes.BytesUtils;
6263
import com.oracle.graal.python.builtins.objects.bytes.PBytes;
6364
import com.oracle.graal.python.builtins.objects.bytes.PBytesLike;
6465
import com.oracle.graal.python.builtins.objects.common.HashingStorage;
@@ -95,6 +96,16 @@
9596

9697
@CoreFunctions(defineModule = "_codecs")
9798
public class CodecsModuleBuiltins extends PythonBuiltins {
99+
100+
public static final String STRICT = "strict";
101+
public static final String IGNORE = "ignore";
102+
public static final String REPLACE = "replace";
103+
public static final String BACKSLASHREPLACE = "backslashreplace";
104+
public static final String NAMEREPLACE = "namereplace";
105+
public static final String XMLCHARREFREPLACE = "xmlcharrefreplace";
106+
public static final String SURROGATEESCAPE = "surrogateescape";
107+
public static final String SURROGATEPASS = "surrogatepass";
108+
98109
@Override
99110
protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
100111
return CodecsModuleBuiltinsFactory.getFactories();
@@ -134,9 +145,14 @@ void handle(TruffleEncoder encoder, String errorAction, Object inputObject,
134145
@Cached PRaiseNode raiseNode) {
135146
// Ignore and replace are handled by Java Charset
136147
switch (actionProfile.profile(errorAction)) {
137-
case "strict":
148+
case STRICT:
149+
break;
150+
case BACKSLASHREPLACE:
151+
if (backslashreplace(encoder)) {
152+
return;
153+
}
138154
break;
139-
case "surrogatepass":
155+
case SURROGATEPASS:
140156
if (surrogatepass(encoder)) {
141157
return;
142158
}
@@ -147,6 +163,30 @@ void handle(TruffleEncoder encoder, String errorAction, Object inputObject,
147163
throw raiseEncodingErrorNode.execute(encoder, inputObject);
148164
}
149165

166+
@TruffleBoundary
167+
private static boolean backslashreplace(TruffleEncoder encoder) {
168+
String p = new String(encoder.getInputChars(encoder.getErrorLenght()));
169+
StringBuilder sb = new StringBuilder();
170+
byte[] buf = new byte[10];
171+
for (int i = 0; i < p.length();) {
172+
int ch = p.codePointAt(i);
173+
int len;
174+
if (ch < 0x100) {
175+
BytesUtils.byteEscape(ch, 0, buf);
176+
len = 4;
177+
} else {
178+
len = BytesUtils.unicodeNonAsciiEscape(ch, 0, buf);
179+
}
180+
for (int j = 0; j < len; j++) {
181+
sb.append((char) buf[j]);
182+
}
183+
i += Character.charCount(ch);
184+
}
185+
encoder.replace(sb.toString(), p.length());
186+
return true;
187+
}
188+
189+
@TruffleBoundary
150190
private static boolean surrogatepass(TruffleEncoder encoder) {
151191
// UTF-8 only for now. The name should be normalized already
152192
if (encoder.getEncodingName().equals("utf_8")) {
@@ -209,9 +249,14 @@ void doStrict(TruffleDecoder decoder, String errorAction, Object inputObject,
209249
@Cached PRaiseNode raiseNode) {
210250
// Ignore and replace are handled by Java Charset
211251
switch (actionProfile.profile(errorAction)) {
212-
case "strict":
252+
case STRICT:
213253
break;
214-
case "surrogatepass":
254+
case BACKSLASHREPLACE:
255+
if (backslashreplace(decoder)) {
256+
return;
257+
}
258+
break;
259+
case SURROGATEPASS:
215260
if (surrogatepass(decoder)) {
216261
return;
217262
}
@@ -222,6 +267,24 @@ void doStrict(TruffleDecoder decoder, String errorAction, Object inputObject,
222267
throw raiseDecodingErrorNode.execute(decoder, inputObject);
223268
}
224269

270+
@TruffleBoundary
271+
private static boolean backslashreplace(TruffleDecoder decoder) {
272+
byte[] p = decoder.getInputBytes(decoder.getErrorLenght());
273+
char[] replacement = new char[p.length * 4];
274+
int outp = 0;
275+
byte[] buf = new byte[4];
276+
for (int i = 0; i < p.length; i++) {
277+
BytesUtils.byteEscape(p[i], 0, buf);
278+
replacement[outp++] = (char) buf[0];
279+
replacement[outp++] = (char) buf[1];
280+
replacement[outp++] = (char) buf[2];
281+
replacement[outp++] = (char) buf[3];
282+
}
283+
decoder.replace(replacement, p.length);
284+
return true;
285+
}
286+
287+
@TruffleBoundary
225288
private static boolean surrogatepass(TruffleDecoder decoder) {
226289
// UTF-8 only for now. The name should be normalized already
227290
if (decoder.getEncodingName().equals("utf_8")) {
@@ -250,16 +313,18 @@ protected static CodingErrorAction convertCodingErrorAction(String errors) {
250313
CodingErrorAction errorAction;
251314
switch (errors) {
252315
// TODO: see [GR-10256] to implement the correct handling mechanics
253-
case "ignore":
316+
case IGNORE:
254317
errorAction = CodingErrorAction.IGNORE;
255318
break;
256-
case "replace":
257-
case "surrogateescape":
258-
case "namereplace":
259-
case "backslashreplace":
260-
case "xmlcharrefreplace":
319+
case REPLACE:
320+
case SURROGATEESCAPE:
321+
case NAMEREPLACE:
322+
case XMLCHARREFREPLACE:
261323
errorAction = CodingErrorAction.REPLACE;
262324
break;
325+
case STRICT:
326+
case BACKSLASHREPLACE:
327+
case SURROGATEPASS:
263328
default:
264329
// Everything else will be handled by our Handle nodes
265330
errorAction = CodingErrorAction.REPORT;
@@ -280,7 +345,7 @@ public abstract static class CodecsEncodeNode extends EncodeBaseNode {
280345
Object encode(Object str, @SuppressWarnings("unused") PNone encoding, @SuppressWarnings("unused") PNone errors,
281346
@Shared("castStr") @Cached CastToJavaStringNode castStr) {
282347
String profiledStr = cast(castStr, str);
283-
PBytes bytes = encodeString(str, profiledStr, "utf-8", "strict");
348+
PBytes bytes = encodeString(str, profiledStr, "utf-8", STRICT);
284349
return factory().createTuple(new Object[]{bytes, getLength(bytes)});
285350
}
286351

@@ -290,7 +355,7 @@ Object encode(Object str, Object encoding, @SuppressWarnings("unused") PNone err
290355
@Shared("castEncoding") @Cached CastToJavaStringNode castEncoding) {
291356
String profiledStr = cast(castStr, str);
292357
String profiledEncoding = cast(castEncoding, encoding);
293-
PBytes bytes = encodeString(str, profiledStr, profiledEncoding, "strict");
358+
PBytes bytes = encodeString(str, profiledStr, profiledEncoding, STRICT);
294359
return factory().createTuple(new Object[]{bytes, getLength(bytes)});
295360
}
296361

@@ -371,12 +436,12 @@ abstract static class CodecsDecodeNode extends EncodeBaseNode {
371436

372437
@Specialization
373438
Object decode(VirtualFrame frame, PBytesLike bytes, @SuppressWarnings("unused") PNone encoding, @SuppressWarnings("unused") PNone errors, Object finalData) {
374-
return decodeBytes(bytes, "utf-8", "strict", castToBoolean(frame, finalData));
439+
return decodeBytes(bytes, "utf-8", STRICT, castToBoolean(frame, finalData));
375440
}
376441

377442
@Specialization(guards = {"isString(encoding)"})
378443
Object decode(VirtualFrame frame, PBytesLike bytes, Object encoding, @SuppressWarnings("unused") PNone errors, Object finalData) {
379-
return decodeBytes(bytes, castToString(encoding), "strict", castToBoolean(frame, finalData));
444+
return decodeBytes(bytes, castToString(encoding), STRICT, castToBoolean(frame, finalData));
380445
}
381446

382447
@Specialization(guards = {"isString(errors)"})
@@ -498,7 +563,7 @@ private static int codePointAt(String chars, int pos) {
498563
static class TruffleEncoder {
499564
private final String encodingName;
500565
private final CharsetEncoder encoder;
501-
private final CharBuffer inputBuffer;
566+
private CharBuffer inputBuffer;
502567
private ByteBuffer outputBuffer;
503568
private CoderResult coderResult;
504569

@@ -589,6 +654,16 @@ public void replace(byte[] replacement, int skipInput) {
589654
inputBuffer.position(inputBuffer.position() + skipInput);
590655
}
591656

657+
@TruffleBoundary
658+
public void replace(String replacement, int skipInput) {
659+
inputBuffer.position(inputBuffer.position() + skipInput);
660+
CharBuffer newBuffer = CharBuffer.allocate(inputBuffer.remaining() + replacement.length());
661+
newBuffer.put(replacement);
662+
newBuffer.put(inputBuffer);
663+
newBuffer.flip();
664+
inputBuffer = newBuffer;
665+
}
666+
592667
public String getEncodingName() {
593668
return encodingName;
594669
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,18 +297,24 @@ public static int unicodeEscape(int codePoint, int startIndex, byte[] buffer) {
297297
buffer[i++] = 'r';
298298
} else {
299299
/* Map non-printable US ASCII and 8-bit characters to '\xHH' */
300-
buffer[i++] = '\\';
301-
buffer[i++] = 'x';
302-
buffer[i++] = hexdigits[(codePoint >> 4) & 0x000F];
303-
buffer[i++] = hexdigits[codePoint & 0x000F];
300+
byteEscape(codePoint, i, buffer);
301+
i += 4;
304302
}
305303
} else {
306304
i = unicodeNonAsciiEscape(codePoint, i, buffer);
307305
}
308306
return i;
309307
}
310308

311-
private static int unicodeNonAsciiEscape(int codePoint, int startIndex, byte[] buffer) {
309+
public static void byteEscape(int codePoint, int startIndex, byte[] buffer) {
310+
int i = startIndex;
311+
buffer[i++] = '\\';
312+
buffer[i++] = 'x';
313+
buffer[i++] = hexdigits[(codePoint >> 4) & 0x000F];
314+
buffer[i] = hexdigits[codePoint & 0x000F];
315+
}
316+
317+
public static int unicodeNonAsciiEscape(int codePoint, int startIndex, byte[] buffer) {
312318
int i = startIndex;
313319
if (codePoint < 0x100) {
314320
buffer[i++] = (byte) codePoint;

0 commit comments

Comments
 (0)