Skip to content

Commit a2241c1

Browse files
committed
[GR-68826] [GR-37218] Various improvements on TruffleString usages.
PullRequest: graalpython/3975
2 parents 89bade9 + 668cf3d commit a2241c1

18 files changed

+378
-266
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonLanguage.java

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -50,17 +50,16 @@
5050
import java.util.concurrent.Semaphore;
5151
import java.util.logging.Level;
5252

53-
import com.oracle.truffle.api.exception.AbstractTruffleException;
5453
import org.graalvm.home.Version;
5554
import org.graalvm.nativeimage.ImageInfo;
5655
import org.graalvm.options.OptionDescriptors;
5756
import org.graalvm.options.OptionKey;
5857
import org.graalvm.options.OptionValues;
5958
import org.graalvm.polyglot.SandboxPolicy;
6059

60+
import com.oracle.graal.python.annotations.PythonOS;
6161
import com.oracle.graal.python.builtins.Python3Core;
6262
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
63-
import com.oracle.graal.python.annotations.PythonOS;
6463
import com.oracle.graal.python.builtins.modules.MarshalModuleBuiltins;
6564
import com.oracle.graal.python.builtins.modules.SignalModuleBuiltins;
6665
import com.oracle.graal.python.builtins.objects.PNone;
@@ -127,6 +126,7 @@
127126
import com.oracle.truffle.api.debug.DebuggerTags;
128127
import com.oracle.truffle.api.dsl.Bind;
129128
import com.oracle.truffle.api.dsl.Idempotent;
129+
import com.oracle.truffle.api.exception.AbstractTruffleException;
130130
import com.oracle.truffle.api.frame.VirtualFrame;
131131
import com.oracle.truffle.api.instrumentation.AllocationReporter;
132132
import com.oracle.truffle.api.instrumentation.ProvidedTags;
@@ -1018,6 +1018,22 @@ public CallTarget cacheCode(TruffleString filename, Supplier<CallTarget> createC
10181018
}
10191019
}
10201020

1021+
private static final Source LINEBREAK_REGEX_SOURCE = Source.newBuilder("regex", "/\r\n|[\n\u000B\u000C\r\u0085\u2028\u2029]/", "re_linebreak") //
1022+
.option("regex.Flavor", "Python") //
1023+
.option("regex.Encoding", "UTF-32") //
1024+
.mimeType("application/tregex") //
1025+
.internal(true) //
1026+
.build();
1027+
@CompilationFinal private Object cachedTRegexLineBreakRegex;
1028+
1029+
public Object getCachedTRegexLineBreakRegex(PythonContext context) {
1030+
if (cachedTRegexLineBreakRegex == null) {
1031+
CompilerDirectives.transferToInterpreterAndInvalidate();
1032+
cachedTRegexLineBreakRegex = context.getEnv().parseInternal(LINEBREAK_REGEX_SOURCE).call();
1033+
}
1034+
return cachedTRegexLineBreakRegex;
1035+
}
1036+
10211037
@Override
10221038
protected boolean isThreadAccessAllowed(Thread thread, boolean singleThreaded) {
10231039
if (singleThreaded) {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java

Lines changed: 3 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,7 @@
150150
import com.oracle.truffle.api.nodes.Node;
151151
import com.oracle.truffle.api.source.Source;
152152
import com.oracle.truffle.api.strings.InternalByteArray;
153+
import com.oracle.truffle.api.strings.TranscodingErrorHandler;
153154
import com.oracle.truffle.api.strings.TruffleString;
154155
import com.oracle.truffle.api.strings.TruffleString.Encoding;
155156

@@ -1156,30 +1157,14 @@ private Object readObject(int type, AddRefAndReturn addRef) throws NumberFormatE
11561157
}
11571158

11581159
private void writeString(TruffleString v) {
1159-
/*
1160-
* Ugly workaround for GR-39571 - TruffleString UTF-8 doesn't support surrogate
1161-
* passthrough. If the string contains surrogates, we mark it and emit it as UTF-32.
1162-
*/
1163-
Encoding encoding;
1164-
if (v.isValidUncached(TS_ENCODING)) {
1165-
encoding = Encoding.UTF_8;
1166-
} else {
1167-
encoding = Encoding.UTF_32LE;
1168-
writeInt(-1);
1169-
}
1170-
InternalByteArray ba = v.switchEncodingUncached(encoding).getInternalByteArrayUncached(encoding);
1160+
InternalByteArray ba = v.switchEncodingUncached(Encoding.UTF_8, TranscodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8).getInternalByteArrayUncached(Encoding.UTF_8);
11711161
writeSize(ba.getLength());
11721162
writeBytes(ba.getArray(), ba.getOffset(), ba.getLength());
11731163
}
11741164

11751165
private TruffleString readString() {
1176-
Encoding encoding = Encoding.UTF_8;
11771166
int sz = readInt();
1178-
if (sz < 0) {
1179-
encoding = Encoding.UTF_32LE;
1180-
sz = readSize();
1181-
}
1182-
return TruffleString.fromByteArrayUncached(readNBytes(sz), 0, sz, encoding, true).switchEncodingUncached(TS_ENCODING);
1167+
return TruffleString.fromByteArrayUncached(readNBytes(sz), 0, sz, Encoding.UTF_8, true).switchEncodingUncached(TS_ENCODING, TranscodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8);
11831168
}
11841169

11851170
private void writeShortString(String v) throws IOException {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextArrayBuiltins.java

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ static int getbuffer(PArray array, Object pyBufferPtr, int flags,
109109
@Cached CStructAccess.WritePointerNode writePointerNode,
110110
@Cached CStructAccess.WriteLongNode writeLongNode,
111111
@Cached CStructAccess.WriteIntNode writeIntNode,
112-
@Cached CStructAccess.WriteByteNode writeByteNode,
112+
@Cached CStructAccess.WriteTruffleStringNode writeTruffleStringNode,
113113
@Cached CStructAccess.AllocateNode allocateNode) {
114114
Object bufPtr = ensureNativeStorageNode.execute(inliningTarget, array).getPtr();
115115
Object nativeNull = PythonContext.get(inliningTarget).getNativeNull();
@@ -139,10 +139,8 @@ static int getbuffer(PArray array, Object pyBufferPtr, int flags,
139139
TruffleString.Encoding formatEncoding = TruffleString.Encoding.US_ASCII;
140140
format = switchEncodingNode.execute(format, formatEncoding);
141141
int formatLen = format.byteLength(formatEncoding);
142-
byte[] bytes = new byte[formatLen + 1];
143-
copyToByteArrayNode.execute(format, 0, bytes, 0, formatLen, formatEncoding);
144-
formatPtr = allocateNode.alloc(bytes.length);
145-
writeByteNode.writeByteArray(formatPtr, bytes);
142+
formatPtr = allocateNode.alloc(formatLen + 1);
143+
writeTruffleStringNode.write(formatPtr, format, formatEncoding);
146144
}
147145
writePointerNode.write(pyBufferPtr, CFields.Py_buffer__format, formatPtr);
148146
writePointerNode.write(pyBufferPtr, CFields.Py_buffer__internal, nativeNull);

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextSlotBuiltins.java

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,6 @@
135135
import com.oracle.truffle.api.dsl.Specialization;
136136
import com.oracle.truffle.api.nodes.Node;
137137
import com.oracle.truffle.api.profiles.InlinedConditionProfile;
138-
import com.oracle.truffle.api.strings.InternalByteArray;
139138
import com.oracle.truffle.api.strings.TruffleString;
140139

141140
public final class PythonCextSlotBuiltins {
@@ -676,9 +675,8 @@ static Object get(PString object,
676675
@Bind Node inliningTarget,
677676
@Cached TruffleString.GetCodeRangeNode getCodeRangeNode,
678677
@Cached TruffleString.SwitchEncodingNode switchEncodingNode,
679-
@Cached TruffleString.GetInternalByteArrayNode getInternalByteArrayNode,
680678
@Cached CStructAccess.AllocateNode allocateNode,
681-
@Cached CStructAccess.WriteByteNode writeByteNode,
679+
@Cached CStructAccess.WriteTruffleStringNode writeTruffleStringNode,
682680
@Cached HiddenAttr.WriteNode writeAttribute) {
683681
if (object.isNativeCharSequence()) {
684682
// in this case, we can just return the pointer
@@ -704,14 +702,13 @@ static Object get(PString object,
704702
encoding = TruffleString.Encoding.UTF_32;
705703
}
706704
string = switchEncodingNode.execute(string, encoding);
707-
InternalByteArray byteArray = getInternalByteArrayNode.execute(string, encoding);
708-
int byteLength = byteArray.getLength() + /* null terminator */ charSize;
705+
int byteLength = string.byteLength(encoding) + /* null terminator */ charSize;
709706
Object ptr = allocateNode.alloc(byteLength);
710-
writeByteNode.writeByteArray(ptr, byteArray.getArray(), byteArray.getLength(), byteArray.getOffset(), 0);
707+
writeTruffleStringNode.write(ptr, string, encoding);
711708
/*
712709
* Set native char sequence, so we can just return the pointer the next time.
713710
*/
714-
NativeCharSequence nativeSequence = new NativeCharSequence(ptr, byteArray.getLength() / charSize, charSize, isAscii);
711+
NativeCharSequence nativeSequence = new NativeCharSequence(ptr, string.byteLength(encoding) / charSize, charSize, isAscii);
715712
object.setNativeCharSequence(nativeSequence);
716713
/*
717714
* Create a native sequence storage to manage the lifetime of the native memory.

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java

Lines changed: 29 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,8 @@
5555
import static com.oracle.graal.python.builtins.modules.CodecsModuleBuiltins.T_UTF_32_LE;
5656
import static com.oracle.graal.python.builtins.modules.cext.PythonCextBuiltins.CApiCallPath.Direct;
5757
import static com.oracle.graal.python.builtins.modules.cext.PythonCextBuiltins.CApiCallPath.Ignored;
58+
import static com.oracle.graal.python.builtins.modules.ctypes.CtypesNodes.WCHAR_T_ENCODING;
59+
import static com.oracle.graal.python.builtins.modules.ctypes.CtypesNodes.WCHAR_T_SIZE;
5860
import static com.oracle.graal.python.builtins.objects.cext.capi.transitions.ArgDescriptor.CONST_WCHAR_PTR;
5961
import static com.oracle.graal.python.builtins.objects.cext.capi.transitions.ArgDescriptor.ConstCharPtr;
6062
import static com.oracle.graal.python.builtins.objects.cext.capi.transitions.ArgDescriptor.ConstCharPtrAsTruffleString;
@@ -75,7 +77,6 @@
7577
import static com.oracle.graal.python.nodes.ErrorMessages.SEPARATOR_EXPECTED_STR_INSTANCE_P_FOUND;
7678
import static com.oracle.graal.python.nodes.SpecialMethodNames.T___GETITEM__;
7779
import static com.oracle.graal.python.nodes.StringLiterals.T_EMPTY_STRING;
78-
import static com.oracle.graal.python.nodes.StringLiterals.T_REPLACE;
7980
import static com.oracle.graal.python.nodes.StringLiterals.T_SPACE;
8081
import static com.oracle.graal.python.nodes.StringLiterals.T_STRICT;
8182
import static com.oracle.graal.python.nodes.StringLiterals.T_UTF8;
@@ -87,9 +88,6 @@
8788
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_32LE;
8889
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_8;
8990

90-
import java.nio.charset.Charset;
91-
import java.nio.charset.StandardCharsets;
92-
9391
import com.oracle.graal.python.PythonLanguage;
9492
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
9593
import com.oracle.graal.python.builtins.modules.BuiltinFunctions.ChrNode;
@@ -109,7 +107,6 @@
109107
import com.oracle.graal.python.builtins.modules.cext.PythonCextBuiltins.CApiUnaryBuiltinNode;
110108
import com.oracle.graal.python.builtins.modules.codecs.ErrorHandlers;
111109
import com.oracle.graal.python.builtins.objects.PNone;
112-
import com.oracle.graal.python.builtins.objects.buffer.PythonBufferAccessLibrary;
113110
import com.oracle.graal.python.builtins.objects.bytes.PBytes;
114111
import com.oracle.graal.python.builtins.objects.cext.PythonAbstractNativeObject;
115112
import com.oracle.graal.python.builtins.objects.cext.capi.CApiContext;
@@ -1072,9 +1069,10 @@ abstract static class PyUnicode_EncodeFSDefault extends CApiUnaryBuiltinNode {
10721069
static PBytes fromObject(Object s,
10731070
@Bind Node inliningTarget,
10741071
@Cached CastToTruffleStringNode castStr,
1075-
@Cached EncodeNativeStringNode encode) {
1076-
byte[] array = encode.execute(StandardCharsets.UTF_8, castStr.execute(inliningTarget, s), T_REPLACE);
1077-
return PFactory.createBytes(PythonLanguage.get(inliningTarget), array);
1072+
@Cached TruffleString.SwitchEncodingNode switchEncodingNode,
1073+
@Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode) {
1074+
TruffleString utf8Str = switchEncodingNode.execute(castStr.execute(inliningTarget, s), TruffleString.Encoding.UTF_8);
1075+
return PFactory.createBytes(PythonLanguage.get(inliningTarget), copyToByteArrayNode.execute(utf8Str, TruffleString.Encoding.UTF_8));
10781076
}
10791077
}
10801078

@@ -1103,22 +1101,24 @@ Object doInt(Object arr, long size,
11031101
}
11041102

11051103
abstract static class NativeEncoderNode extends CApiBinaryBuiltinNode {
1106-
private final Charset charset;
1104+
private final TruffleString.Encoding encoding;
11071105

1108-
protected NativeEncoderNode(Charset charset) {
1109-
this.charset = charset;
1106+
protected NativeEncoderNode(TruffleString.Encoding encoding) {
1107+
this.encoding = encoding;
11101108
}
11111109

11121110
@Specialization(guards = "isNoValue(errors)")
11131111
Object doUnicode(Object s, @SuppressWarnings("unused") PNone errors,
1114-
@Shared("encodeNode") @Cached EncodeNativeStringNode encodeNativeStringNode) {
1115-
return doUnicode(s, T_STRICT, encodeNativeStringNode);
1112+
@Shared("encodeNode") @Cached EncodeNativeStringNode encodeNativeStringNode,
1113+
@Shared("copyNode") @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode) {
1114+
return doUnicode(s, T_STRICT, encodeNativeStringNode, copyToByteArrayNode);
11161115
}
11171116

11181117
@Specialization
11191118
Object doUnicode(Object s, TruffleString errors,
1120-
@Shared("encodeNode") @Cached EncodeNativeStringNode encodeNativeStringNode) {
1121-
return PFactory.createBytes(PythonLanguage.get(this), encodeNativeStringNode.execute(charset, s, errors));
1119+
@Shared("encodeNode") @Cached EncodeNativeStringNode encodeNativeStringNode,
1120+
@Shared("copyNode") @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode) {
1121+
return PFactory.createBytes(PythonLanguage.get(this), copyToByteArrayNode.execute(encodeNativeStringNode.execute(encoding, s, errors), encoding));
11221122
}
11231123

11241124
@Fallback
@@ -1131,22 +1131,22 @@ static Object doUnicode(@SuppressWarnings("unused") Object s, @SuppressWarnings(
11311131
@CApiBuiltin(ret = PyObjectTransfer, args = {PyObject, ConstCharPtrAsTruffleString}, call = Direct)
11321132
abstract static class _PyUnicode_AsLatin1String extends NativeEncoderNode {
11331133
protected _PyUnicode_AsLatin1String() {
1134-
super(StandardCharsets.ISO_8859_1);
1134+
super(TruffleString.Encoding.ISO_8859_1);
11351135
}
11361136
}
11371137

11381138
@CApiBuiltin(ret = PyObjectTransfer, args = {PyObject, ConstCharPtrAsTruffleString}, call = Direct)
11391139
abstract static class _PyUnicode_AsASCIIString extends NativeEncoderNode {
11401140
protected _PyUnicode_AsASCIIString() {
1141-
super(StandardCharsets.US_ASCII);
1141+
super(TruffleString.Encoding.US_ASCII);
11421142
}
11431143
}
11441144

11451145
@CApiBuiltin(ret = PyObjectTransfer, args = {PyObject, ConstCharPtrAsTruffleString}, call = Direct)
11461146
abstract static class _PyUnicode_AsUTF8String extends NativeEncoderNode {
11471147

11481148
protected _PyUnicode_AsUTF8String() {
1149-
super(StandardCharsets.UTF_8);
1149+
super(TruffleString.Encoding.UTF_8);
11501150
}
11511151

11521152
@NeverDefault
@@ -1190,15 +1190,14 @@ abstract static class GraalPyPrivate_Unicode_FillUtf8 extends CApiUnaryBuiltinNo
11901190
@Specialization
11911191
static Object doNative(PythonAbstractNativeObject s,
11921192
@Cached CStructAccess.WriteLongNode writeLongNode,
1193-
@Cached _PyUnicode_AsUTF8String asUTF8String,
1194-
@CachedLibrary(limit = "1") PythonBufferAccessLibrary bufferLib,
1193+
@Cached EncodeNativeStringNode encodeNativeStringNode,
11951194
@Cached CStructAccess.WritePointerNode writePointerNode,
11961195
@Cached CStructAccess.AllocateNode allocateNode,
1197-
@Cached CStructAccess.WriteByteNode writeByteNode) {
1198-
PBytes bytes = (PBytes) asUTF8String.execute(s, T_STRICT);
1199-
int len = bufferLib.getBufferLength(bytes);
1196+
@Cached CStructAccess.WriteTruffleStringNode writeTruffleStringNode) {
1197+
TruffleString utf8Str = encodeNativeStringNode.execute(UTF_8, s, T_STRICT);
1198+
int len = utf8Str.byteLength(UTF_8);
12001199
Object mem = allocateNode.alloc(len + 1, true);
1201-
writeByteNode.writeByteArray(mem, bufferLib.getInternalOrCopiedByteArray(bytes), len, 0, 0);
1200+
writeTruffleStringNode.write(mem, utf8Str, UTF_8);
12021201
writePointerNode.writeToObj(s, CFields.PyCompactUnicodeObject__utf8, mem);
12031202
writeLongNode.writeToObject(s, CFields.PyCompactUnicodeObject__utf8_length, len);
12041203
return 0;
@@ -1259,15 +1258,13 @@ abstract static class GraalPyPrivate_Unicode_FillUnicode extends CApiUnaryBuilti
12591258
static Object doNative(PythonAbstractNativeObject s,
12601259
@Bind Node inliningTarget,
12611260
@Cached CastToTruffleStringNode cast,
1262-
@Cached UnicodeAsWideCharNode asWideCharNode,
1263-
@CachedLibrary(limit = "1") PythonBufferAccessLibrary bufferLib,
1261+
@Cached TruffleString.SwitchEncodingNode switchEncodingNode,
12641262
@Cached CStructAccess.AllocateNode allocateNode,
1265-
@Cached CStructAccess.WriteByteNode writeByteNode) {
1266-
int wcharSize = CStructs.wchar_t.size();
1267-
PBytes bytes = asWideCharNode.executeNativeOrder(inliningTarget, cast.castKnownString(inliningTarget, s), wcharSize);
1268-
int len = bufferLib.getBufferLength(bytes);
1269-
Object mem = allocateNode.alloc(len + wcharSize, true);
1270-
writeByteNode.writeByteArray(mem, bufferLib.getInternalOrCopiedByteArray(bytes), len, 0, 0);
1263+
@Cached CStructAccess.WriteTruffleStringNode writeTruffleStringNode) {
1264+
TruffleString str = switchEncodingNode.execute(cast.castKnownString(inliningTarget, s), WCHAR_T_ENCODING);
1265+
int len = str.byteLength(WCHAR_T_ENCODING);
1266+
Object mem = allocateNode.alloc(len + WCHAR_T_SIZE, true);
1267+
writeTruffleStringNode.write(mem, str, WCHAR_T_ENCODING);
12711268
return 0;
12721269
}
12731270
}

0 commit comments

Comments
 (0)