Skip to content

Commit 9fc01af

Browse files
committed
[GR-26315] Correctly support PyUnicode_New.
PullRequest: graalpython/1295
2 parents 29d4032 + 3609014 commit 9fc01af

File tree

6 files changed

+106
-10
lines changed

6 files changed

+106
-10
lines changed

graalpython/com.oracle.graal.python.cext/src/unicodeobject.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ PyTypeObject PyUnicode_Type = PY_TRUFFLE_TYPE("str", &PyType_Type, Py_TPFLAGS_DE
4545
/* The empty Unicode object is shared to improve performance. */
4646
static PyObject *unicode_empty = NULL;
4747

48+
#define MAX_UNICODE 0x10ffff
49+
4850
#define _Py_RETURN_UNICODE_EMPTY() \
4951
do { \
5052
_Py_INCREF_UNICODE_EMPTY(); \
@@ -489,8 +491,30 @@ PyObject* PyUnicode_Join(PyObject *separator, PyObject *seq) {
489491
return UPCALL_CEXT_O(_jls_PyUnicode_Join, native_to_java(separator), native_to_java(seq));
490492
}
491493

494+
typedef PyObject* (*unicode_new_fun_t)(void* data, int elementSize, int is_ascii);
495+
UPCALL_TYPED_ID(PyUnicode_New, unicode_new_fun_t);
492496
PyObject* PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) {
493-
return to_sulong(polyglot_from_string("", "ascii"));
497+
enum PyUnicode_Kind kind;
498+
int is_ascii = 0;
499+
if (maxchar < 128) {
500+
kind = PyUnicode_1BYTE_KIND;
501+
is_ascii = 1;
502+
} else if (maxchar < 256) {
503+
kind = PyUnicode_1BYTE_KIND;
504+
} else if (maxchar < 65536) {
505+
kind = PyUnicode_2BYTE_KIND;
506+
} else {
507+
if (maxchar > MAX_UNICODE) {
508+
PyErr_SetString(PyExc_SystemError,
509+
"invalid maximum character passed to PyUnicode_New");
510+
return NULL;
511+
}
512+
kind = PyUnicode_4BYTE_KIND;
513+
}
514+
515+
size_t n = size * kind;
516+
int8_t* ptr = (int8_t*) malloc(n);
517+
return _jls_PyUnicode_New(polyglot_from_i8_array((int8_t*)ptr, n), kind, is_ascii);
494518
}
495519

496520
UPCALL_ID(PyUnicode_Compare);

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/PythonCextBuiltins.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@
187187
import com.oracle.graal.python.builtins.objects.object.PythonObject;
188188
import com.oracle.graal.python.builtins.objects.object.PythonObjectLibrary;
189189
import com.oracle.graal.python.builtins.objects.set.PBaseSet;
190+
import com.oracle.graal.python.builtins.objects.str.NativeCharSequence;
190191
import com.oracle.graal.python.builtins.objects.str.PString;
191192
import com.oracle.graal.python.builtins.objects.traceback.GetTracebackNode;
192193
import com.oracle.graal.python.builtins.objects.traceback.LazyTraceback;
@@ -714,6 +715,17 @@ Object run(PBaseException exception, Object object,
714715
}
715716
}
716717

718+
// directly called without landing function
719+
@Builtin(name = "PyUnicode_New", minNumOfPositionalArgs = 3)
720+
@GenerateNodeFactory
721+
abstract static class PyUnicodeNewNode extends PythonBuiltinNode {
722+
@Specialization
723+
Object doGeneric(Object ptr, int elementSize, int isAscii,
724+
@Cached CExtNodes.ToNewRefNode toNewRefNode) {
725+
return toNewRefNode.execute(factory().createString(new NativeCharSequence(ptr, elementSize, isAscii != 0)));
726+
}
727+
}
728+
717729
@Builtin(name = "PyUnicode_FromString", minNumOfPositionalArgs = 1)
718730
@GenerateNodeFactory
719731
abstract static class PyUnicodeFromStringNode extends PythonBuiltinNode {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/CExtNodes.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1451,7 +1451,7 @@ public abstract static class FromCharPointerNode extends Node {
14511451
@Specialization
14521452
PString execute(Object charPtr,
14531453
@Cached PythonObjectFactory factory) {
1454-
return factory.createString(new NativeCharSequence(charPtr));
1454+
return factory.createString(new NativeCharSequence(charPtr, 1, false));
14551455
}
14561456
}
14571457

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/PyUnicodeWrappers.java

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,15 @@
5353
import java.nio.charset.CharsetEncoder;
5454
import java.nio.charset.StandardCharsets;
5555

56+
import com.oracle.graal.python.builtins.objects.cext.CExtNodes.SizeofWCharNode;
5657
import com.oracle.graal.python.builtins.objects.cext.DynamicObjectNativeWrapper.PAsPointerNode;
5758
import com.oracle.graal.python.builtins.objects.cext.DynamicObjectNativeWrapper.ToPyObjectNode;
5859
import com.oracle.graal.python.builtins.objects.cext.UnicodeObjectNodes.UnicodeAsWideCharNode;
60+
import com.oracle.graal.python.builtins.objects.str.NativeCharSequence;
5961
import com.oracle.graal.python.builtins.objects.str.PString;
6062
import com.oracle.graal.python.builtins.objects.str.StringNodes.StringLenNode;
63+
import com.oracle.graal.python.builtins.objects.str.StringNodes.StringMaterializeNode;
64+
import com.oracle.truffle.api.CompilerDirectives;
6165
import com.oracle.truffle.api.CompilerDirectives.CompilationFinal;
6266
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
6367
import com.oracle.truffle.api.dsl.Cached;
@@ -67,6 +71,7 @@
6771
import com.oracle.truffle.api.library.CachedLibrary;
6872
import com.oracle.truffle.api.library.ExportLibrary;
6973
import com.oracle.truffle.api.library.ExportMessage;
74+
import com.oracle.truffle.api.profiles.ConditionProfile;
7075
import com.oracle.truffle.llvm.spi.NativeTypeLibrary;
7176

7277
public abstract class PyUnicodeWrappers {
@@ -163,6 +168,12 @@ Object readMember(String member,
163168
if (isMemberReadable(member)) {
164169
int elementSize = (int) sizeofWcharNode.execute();
165170
PString s = getPString(lib);
171+
CharSequence content = s.getCharSequence();
172+
173+
if (content instanceof NativeCharSequence) {
174+
// in this case, we can just return the pointer
175+
return ((NativeCharSequence) content).getPtr();
176+
}
166177
return new PySequenceArrayWrapper(asWideCharNode.execute(s, elementSize, stringLenNode.execute(s)), elementSize);
167178
}
168179
throw UnknownIdentifierException.create(member);
@@ -209,25 +220,46 @@ boolean isMemberReadable(String member) {
209220
@ExportMessage
210221
Object readMember(String member,
211222
@CachedLibrary("this") PythonNativeWrapperLibrary lib,
223+
@Cached ConditionProfile storageProfile,
224+
@Cached StringMaterializeNode materializeNode,
212225
@Cached CExtNodes.SizeofWCharNode sizeofWcharNode) throws UnknownIdentifierException {
213226
// padding(24), ready(1), ascii(1), compact(1), kind(3), interned(2)
214227
int value = 0b000000000000000000000000_1_0_0_000_00;
215-
if (onlyAscii(getPString(lib).getValue())) {
228+
PString delegate = getPString(lib);
229+
if (onlyAscii(delegate, storageProfile, materializeNode)) {
216230
value |= 0b1_0_000_00;
217231
}
218-
value |= ((int) sizeofWcharNode.execute() << 2) & 0b11100;
232+
value |= (getKind(delegate, storageProfile, sizeofWcharNode) << 2) & 0b11100;
219233
if (isMemberReadable(member)) {
220234
// it's a bit field; so we need to return the whole 32-bit word
221235
return value;
222236
}
223237
throw UnknownIdentifierException.create(member);
224238
}
225239

226-
private boolean onlyAscii(String value) {
240+
private boolean onlyAscii(PString value, ConditionProfile storageProfile, StringMaterializeNode stringMaterializeNode) {
241+
CharSequence storage = value.getCharSequence();
242+
243+
// important: avoid materialization of native sequences
244+
if (storageProfile.profile(storage instanceof NativeCharSequence)) {
245+
return ((NativeCharSequence) storage).isAsciiOnly();
246+
}
247+
227248
if (asciiEncoder == null) {
249+
CompilerDirectives.transferToInterpreterAndInvalidate();
228250
asciiEncoder = newAsciiEncoder();
229251
}
230-
return doCheck(value, asciiEncoder);
252+
return doCheck(stringMaterializeNode.execute(value), asciiEncoder);
253+
}
254+
255+
private static int getKind(PString value, ConditionProfile storageProfile, SizeofWCharNode sizeofWcharNode) {
256+
CharSequence storage = value.getCharSequence();
257+
258+
// important: avoid materialization of native sequences
259+
if (storageProfile.profile(storage instanceof NativeCharSequence)) {
260+
return ((NativeCharSequence) storage).getElementSize();
261+
}
262+
return (int) sizeofWcharNode.execute();
231263
}
232264

233265
@TruffleBoundary

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/UnicodeObjectNodes.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -49,6 +49,7 @@
4949
import com.oracle.graal.python.builtins.objects.cext.UnicodeObjectNodesFactory.UnicodeAsWideCharNodeGen.LittleEndianNodeGen;
5050
import com.oracle.graal.python.builtins.objects.cext.UnicodeObjectNodesFactory.UnicodeAsWideCharNodeGen.NativeOrderNodeGen;
5151
import com.oracle.graal.python.builtins.objects.str.PString;
52+
import com.oracle.graal.python.builtins.objects.str.StringNodes.StringMaterializeNode;
5253
import com.oracle.graal.python.runtime.object.PythonObjectFactory;
5354
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
5455
import com.oracle.truffle.api.dsl.Cached;
@@ -129,8 +130,9 @@ public static UnicodeAsWideCharNode getUncachedBigEndian() {
129130

130131
@Specialization
131132
PBytes doUnicode(PString s, long elementSize, long elements,
133+
@Cached StringMaterializeNode materializeNode,
132134
@Shared("factory") @Cached PythonObjectFactory factory) {
133-
return doUnicode(s.getValue(), elementSize, elements, factory);
135+
return doUnicode(materializeNode.execute(s), elementSize, elements, factory);
134136
}
135137

136138
@Specialization

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeCharSequence.java

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,29 @@
4848

4949
public final class NativeCharSequence implements PCharSequence {
5050

51+
/**
52+
* Pointer to the native buffer (most likely a {@code char*} containing ASCII characters but could
53+
* also be an arbitrary {@code void*} for {@code Py_UCS1}, {@code Py_UCS2}, or {@code Py_UCS4}
54+
* characters)
55+
*/
5156
private final Object ptr;
57+
58+
/**
59+
* The size of a single character in bytes (valid values are {@code 1, 2, 4}).
60+
*/
61+
private final int elementSize;
62+
63+
/**
64+
* Specifies if the native buffer contains only ASCII characters.
65+
*/
66+
private final boolean asciiOnly;
67+
5268
private String materialized;
5369

54-
public NativeCharSequence(Object ptr) {
70+
public NativeCharSequence(Object ptr, int elementSize, boolean asciiOnly) {
5571
this.ptr = ptr;
72+
this.elementSize = elementSize;
73+
this.asciiOnly = asciiOnly;
5674
}
5775

5876
@Override
@@ -90,10 +108,18 @@ public String materialize(PCallCapiFunction node) {
90108
return materialized;
91109
}
92110

93-
Object getPtr() {
111+
public Object getPtr() {
94112
return ptr;
95113
}
96114

115+
public int getElementSize() {
116+
return elementSize;
117+
}
118+
119+
public boolean isAsciiOnly() {
120+
return asciiOnly;
121+
}
122+
97123
@Override
98124
public String toString() {
99125
CompilerAsserts.neverPartOfCompilation();

0 commit comments

Comments
 (0)