Skip to content

Commit 001f12b

Browse files
committed
[GR-19758] Support native string subclasses.
PullRequest: graalpython/755
2 parents 8e16bd1 + aac3102 commit 001f12b

34 files changed

+1831
-1129
lines changed

graalpython/com.oracle.graal.python.cext/src/unicodeobject.c

Lines changed: 169 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,109 @@ static Py_ssize_t unicode_aswidechar(PyObject *unicode, wchar_t *w, Py_ssize_t s
109109
}
110110
}
111111

112+
/* Raw field accessors for native unicode objects.
   Each macro casts `op` to the struct type named in its expansion and expands
   to the member itself (not a copy), so the result can be read or assigned.
   No type check is performed on `op`. */
#define _PyUnicode_UTF8(op) \
113+
(((PyCompactUnicodeObject*)(op))->utf8)
114+
#define _PyUnicode_UTF8_LENGTH(op) \
115+
(((PyCompactUnicodeObject*)(op))->utf8_length)
116+
#define _PyUnicode_WSTR(op) \
117+
(((PyASCIIObject*)(op))->wstr)
118+
#define _PyUnicode_WSTR_LENGTH(op) \
119+
(((PyCompactUnicodeObject*)(op))->wstr_length)
120+
#define _PyUnicode_LENGTH(op) \
121+
(((PyASCIIObject *)(op))->length)
122+
#define _PyUnicode_STATE(op) \
123+
(((PyASCIIObject *)(op))->state)
124+
#define _PyUnicode_DATA_ANY(op) \
125+
(((PyUnicodeObject*)(op))->data.any)
126+
127+
/* NOTE(review): presumably this is what provides polyglot_from_PyUnicodeObject,
   which unicode_subtype_new calls on its result - confirm against polyglot.h. */
POLYGLOT_DECLARE_TYPE(PyUnicodeObject);
128+
129+
/*
 * Allocate an instance of `type` and fill it with a private copy of the
 * character data of `unicode`.
 *
 * The new object is marked ready and non-compact, takes the kind, length and
 * ascii flag of `unicode`, and owns a malloc'ed buffer of (length + 1)
 * characters copied from `unicode` (data.any points at it). The utf8 pointer
 * aliases that buffer for 1-byte strings whose maximum character is below
 * 128; the wstr pointer aliases it when the character width equals
 * sizeof(wchar_t).
 *
 * Reference handling: one reference to `unicode` is released on every path
 * on which `unicode` is non-NULL, both on success and on failure.
 *
 * Returns the new object passed through polyglot_from_PyUnicodeObject, or
 * NULL on failure. A NULL `unicode` yields NULL without this function
 * setting an exception.
 */
PyUnicodeObject* unicode_subtype_new(PyTypeObject *type, PyObject *unicode) {
    PyObject *self;
    Py_ssize_t length, char_size, nbytes;
    int share_wstr = 0;
    int share_utf8 = 0;
    unsigned int kind;
    void *data;

    if (unicode == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }

    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) {
            share_utf8 = 1;
        }
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2) {
            share_wstr = 1;
        }
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4) {
            share_wstr = 1;
        }
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        Py_DECREF(unicode);
        return NULL;
    }

    /* The character buffer is allocated before the object so that no failure
       path ever has to tear down a partially initialised instance. */
    nbytes = (length + 1) * char_size;
    data = malloc(nbytes);
    if (data == NULL) {
        PyErr_NoMemory();
        Py_DECREF(unicode);
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        free(data);
        Py_DECREF(unicode);
        return NULL;
    }

    _PyUnicode_LENGTH(self) = length;
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = data;

    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* Copy exactly the number of bytes that was allocated (the extra
       character beyond `length` is copied along with the data). */
    memcpy(data, PyUnicode_DATA(unicode), nbytes);
    assert(_PyUnicode_CheckConsistency(self, 1));
    Py_DECREF(unicode);
    return (PyUnicodeObject*) polyglot_from_PyUnicodeObject((PyUnicodeObject*)self);
}
214+
112215
PyObject* PyUnicode_FromString(const char* o) {
113216
return to_sulong(polyglot_from_string(o, SRC_CS));
114217
}
@@ -245,9 +348,8 @@ PyObject* PyUnicode_FromObject(PyObject* o) {
245348
return UPCALL_CEXT_O(_jls_PyUnicode_FromObject, native_to_java(o));
246349
}
247350

248-
UPCALL_ID(PyUnicode_GetLength);
249351
/* Return the length of `unicode` as reported by PyUnicode_GET_LENGTH.
   The superseded upcall-based return has been dropped; only the
   replacement statement remains. No type or NULL check is performed here. */
Py_ssize_t PyUnicode_GetLength(PyObject *unicode) {
    return PyUnicode_GET_LENGTH(unicode);
}
252354

253355
UPCALL_ID(PyUnicode_Concat);
@@ -305,7 +407,7 @@ PyObject * PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *err
305407
PyObject *result;
306408
void *jerrors = errors != NULL ? polyglot_from_string(errors, SRC_CS) : NULL;
307409
int bo = byteorder != NULL ? *byteorder : 0;
308-
return polyglot_invoke(PY_TRUFFLE_CEXT, "PyTruffle_Unicode_DecodeUTF32", s, size, native_to_java(jerrors), bo, NULL);
410+
return polyglot_invoke(PY_TRUFFLE_CEXT, "PyTruffle_Unicode_DecodeUTF32", polyglot_from_i8_array(s, size), size, native_to_java(jerrors), bo, NULL);
309411
}
310412

311413
Py_ssize_t PyUnicode_AsWideChar(PyObject *unicode, wchar_t *w, Py_ssize_t size) {
@@ -525,3 +627,67 @@ UPCALL_ID(PyUnicode_Replace);
525627
/* Forward a replace request through the UPCALL_CEXT_O upcall identified by
   _jls_PyUnicode_Replace. `str`, `substr` and `replstr` are each passed
   through native_to_java; `maxcount` is passed unchanged. The upcall's result
   is returned as is.
   NOTE(review): presumably maxcount follows the CPython convention
   (-1 = replace all) - confirm against the upcall's implementation. */
PyObject * PyUnicode_Replace(PyObject *str, PyObject *substr, PyObject *replstr, Py_ssize_t maxcount) {
526628
return UPCALL_CEXT_O(_jls_PyUnicode_Replace, native_to_java(str), native_to_java(substr), native_to_java(replstr), maxcount);
527629
}
630+
631+
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names. begin and end are
   pointers to the source characters and should be of type "from_type *";
   the half-open range [begin, end) is converted. to is a pointer of type
   "to_type *" and points to the buffer the result characters are written to;
   it must have room for (end - begin) elements.
   Each character is converted with a plain cast to to_type. Every argument
   is evaluated exactly once, and all macro-local names carry a leading
   underscore so they do not shadow names at the expansion site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        while (_iter < _end) {                                       \
            *_to++ = (to_type) *_iter++;                             \
        }                                                            \
    } while (0)
654+
655+
656+
POLYGLOT_DECLARE_TYPE(Py_UCS4);
657+
658+
/* used from Java only to decode a native unicode object */
659+
void* native_unicode_as_string(PyObject *string) {
660+
Py_UCS4 *target = NULL;
661+
int kind = 0;
662+
void *data = NULL;
663+
void *result = NULL;
664+
Py_ssize_t len;
665+
if (PyUnicode_READY(string) == -1) {
666+
PyErr_Format(PyExc_TypeError, "provided unicode object is not ready");
667+
return NULL;
668+
}
669+
kind = PyUnicode_KIND(string);
670+
data = PyUnicode_DATA(string);
671+
len = PyUnicode_GET_LENGTH(string);
672+
if (kind == PyUnicode_1BYTE_KIND) {
673+
Py_UCS1 *start = (Py_UCS1 *) data;
674+
if (PyUnicode_IS_COMPACT_ASCII(string)) {
675+
return polyglot_from_string_n((const char *)data, sizeof(Py_UCS1) * len, "ascii");
676+
}
677+
return polyglot_from_string_n((const char *)data, sizeof(Py_UCS1) * len, "latin1");
678+
}
679+
else if (kind == PyUnicode_2BYTE_KIND) {
680+
Py_UCS2 *start = (Py_UCS2 *) data;
681+
target = PyMem_New(Py_UCS4, len);
682+
if (!target) {
683+
PyErr_NoMemory();
684+
return NULL;
685+
}
686+
_PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
687+
result = polyglot_from_string_n((const char *)target, sizeof(Py_UCS4) * len, "UTF-32");
688+
free(target);
689+
return result;
690+
}
691+
assert(kind == PyUnicode_4BYTE_KIND);
692+
return polyglot_from_string_n((const char *)data, sizeof(Py_UCS4) * len, "UTF-32");
693+
}

0 commit comments

Comments
 (0)