@@ -109,6 +109,109 @@ static Py_ssize_t unicode_aswidechar(PyObject *unicode, wchar_t *w, Py_ssize_t s
109
109
}
110
110
}
111
111
112
/*
 * Raw field accessors for native unicode objects.
 *
 * Each macro casts `op` to the struct type that declares the field and
 * expands to the member access itself, so the result can be read or used
 * as an assignment target.  No type or NULL checking is performed.
 *
 * Note: the parameter list must follow the macro name without any
 * whitespace; otherwise the preprocessor defines an object-like macro.
 */

/* `utf8` member of `op` viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* `utf8_length` member of `op` viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* `wstr` member of `op` viewed as a PyASCIIObject. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
/* `wstr_length` member of `op` viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* `length` member of `op` viewed as a PyASCIIObject. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
/* `state` member (interned/kind/compact/ascii/ready flags) of `op` viewed as a PyASCIIObject. */
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
/* `data.any` member of `op` viewed as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
126
+
127
+ POLYGLOT_DECLARE_TYPE (PyUnicodeObject );
128
+
129
+ PyUnicodeObject * unicode_subtype_new (PyTypeObject * type , PyObject * unicode ) {
130
+ PyObject * self ;
131
+ Py_ssize_t length , char_size ;
132
+ int share_wstr , share_utf8 ;
133
+ unsigned int kind ;
134
+ void * data ;
135
+
136
+ if (unicode == NULL )
137
+ return NULL ;
138
+ assert (_PyUnicode_CHECK (unicode ));
139
+ if (PyUnicode_READY (unicode ) == -1 ) {
140
+ Py_DECREF (unicode );
141
+ return NULL ;
142
+ }
143
+
144
+ self = type -> tp_alloc (type , 0 );
145
+ if (self == NULL ) {
146
+ Py_DECREF (unicode );
147
+ return NULL ;
148
+ }
149
+ kind = PyUnicode_KIND (unicode );
150
+ length = PyUnicode_GET_LENGTH (unicode );
151
+
152
+ _PyUnicode_LENGTH (self ) = length ;
153
+ _PyUnicode_STATE (self ).interned = 0 ;
154
+ _PyUnicode_STATE (self ).kind = kind ;
155
+ _PyUnicode_STATE (self ).compact = 0 ;
156
+ _PyUnicode_STATE (self ).ascii = _PyUnicode_STATE (unicode ).ascii ;
157
+ _PyUnicode_STATE (self ).ready = 1 ;
158
+ _PyUnicode_WSTR (self ) = NULL ;
159
+ _PyUnicode_UTF8_LENGTH (self ) = 0 ;
160
+ _PyUnicode_UTF8 (self ) = NULL ;
161
+ _PyUnicode_WSTR_LENGTH (self ) = 0 ;
162
+ _PyUnicode_DATA_ANY (self ) = NULL ;
163
+
164
+ share_utf8 = 0 ;
165
+ share_wstr = 0 ;
166
+ if (kind == PyUnicode_1BYTE_KIND ) {
167
+ char_size = 1 ;
168
+ if (PyUnicode_MAX_CHAR_VALUE (unicode ) < 128 )
169
+ share_utf8 = 1 ;
170
+ }
171
+ else if (kind == PyUnicode_2BYTE_KIND ) {
172
+ char_size = 2 ;
173
+ if (sizeof (wchar_t ) == 2 )
174
+ share_wstr = 1 ;
175
+ }
176
+ else {
177
+ assert (kind == PyUnicode_4BYTE_KIND );
178
+ char_size = 4 ;
179
+ if (sizeof (wchar_t ) == 4 )
180
+ share_wstr = 1 ;
181
+ }
182
+
183
+ /* Ensure we won't overflow the length. */
184
+ if (length > (PY_SSIZE_T_MAX / char_size - 1 )) {
185
+ PyErr_NoMemory ();
186
+ // Py_DECREF(unicode);
187
+ // Py_DECREF(self);
188
+ return NULL ;
189
+ }
190
+ data = malloc ((length + 1 ) * char_size );
191
+ if (data == NULL ) {
192
+ PyErr_NoMemory ();
193
+ // Py_DECREF(unicode);
194
+ // Py_DECREF(self);
195
+ return NULL ;
196
+ }
197
+
198
+ _PyUnicode_DATA_ANY (self ) = data ;
199
+ if (share_utf8 ) {
200
+ _PyUnicode_UTF8_LENGTH (self ) = length ;
201
+ _PyUnicode_UTF8 (self ) = data ;
202
+ }
203
+ if (share_wstr ) {
204
+ _PyUnicode_WSTR_LENGTH (self ) = length ;
205
+ _PyUnicode_WSTR (self ) = (wchar_t * )data ;
206
+ }
207
+
208
+ memcpy (data , PyUnicode_DATA (unicode ),
209
+ kind * (length + 1 ));
210
+ assert (_PyUnicode_CheckConsistency (self , 1 ));
211
+ Py_DECREF (unicode );
212
+ return (PyUnicodeObject * ) polyglot_from_PyUnicodeObject ((PyUnicodeObject * )self );
213
+ }
214
+
112
215
PyObject * PyUnicode_FromString (const char * o ) {
113
216
return to_sulong (polyglot_from_string (o , SRC_CS ));
114
217
}
@@ -245,9 +348,8 @@ PyObject* PyUnicode_FromObject(PyObject* o) {
245
348
return UPCALL_CEXT_O (_jls_PyUnicode_FromObject , native_to_java (o ));
246
349
}
247
350
248
- UPCALL_ID (PyUnicode_GetLength );
249
351
Py_ssize_t PyUnicode_GetLength (PyObject * unicode ) {
250
- return UPCALL_CEXT_L ( _jls_PyUnicode_GetLength , native_to_java ( unicode ) );
352
+ return PyUnicode_GET_LENGTH ( unicode );
251
353
}
252
354
253
355
UPCALL_ID (PyUnicode_Concat );
@@ -305,7 +407,7 @@ PyObject * PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *err
305
407
PyObject * result ;
306
408
void * jerrors = errors != NULL ? polyglot_from_string (errors , SRC_CS ) : NULL ;
307
409
int bo = byteorder != NULL ? * byteorder : 0 ;
308
- return polyglot_invoke (PY_TRUFFLE_CEXT , "PyTruffle_Unicode_DecodeUTF32" , s , size , native_to_java (jerrors ), bo , NULL );
410
+ return polyglot_invoke (PY_TRUFFLE_CEXT , "PyTruffle_Unicode_DecodeUTF32" , polyglot_from_i8_array ( s , size ) , size , native_to_java (jerrors ), bo , NULL );
309
411
}
310
412
311
413
Py_ssize_t PyUnicode_AsWideChar (PyObject * unicode , wchar_t * w , Py_ssize_t size ) {
@@ -525,3 +627,67 @@ UPCALL_ID(PyUnicode_Replace);
525
627
/*
 * Forward a replace request to the managed implementation via an upcall.
 * The three object arguments are converted with native_to_java();
 * `maxcount` is passed through unchanged.
 */
PyObject *PyUnicode_Replace(PyObject *str, PyObject *substr, PyObject *replstr, Py_ssize_t maxcount) {
    return UPCALL_CEXT_O(
        _jls_PyUnicode_Replace,
        native_to_java(str),
        native_to_java(substr),
        native_to_java(replstr),
        maxcount
    );
}
630
+
631
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast to to_type.  The
   bulk of the range is processed four elements per iteration; the
   remaining 0-3 elements are handled by the trailing loop.  The caller
   must provide room for (end - begin) elements at `to`.

   All locals carry a leading underscore to keep them apart from names
   used in the macro arguments, and the source pointers are cast to
   "const from_type *" so that const-qualified inputs are not stripped
   of their qualifier. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        Py_ssize_t _n = (_end) - (_iter);                            \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                      \
        while (_iter < (_unrolled_end)) {                            \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
            _iter += 4; _to += 4;                                    \
        }                                                            \
        while (_iter < (_end))                                       \
            *_to++ = (to_type) *_iter++;                             \
    } while (0)
654
+
655
+
656
+ POLYGLOT_DECLARE_TYPE (Py_UCS4 );
657
+
658
+ /* used from Java only to decode a native unicode object */
659
+ void * native_unicode_as_string (PyObject * string ) {
660
+ Py_UCS4 * target = NULL ;
661
+ int kind = 0 ;
662
+ void * data = NULL ;
663
+ void * result = NULL ;
664
+ Py_ssize_t len ;
665
+ if (PyUnicode_READY (string ) == -1 ) {
666
+ PyErr_Format (PyExc_TypeError , "provided unicode object is not ready" );
667
+ return NULL ;
668
+ }
669
+ kind = PyUnicode_KIND (string );
670
+ data = PyUnicode_DATA (string );
671
+ len = PyUnicode_GET_LENGTH (string );
672
+ if (kind == PyUnicode_1BYTE_KIND ) {
673
+ Py_UCS1 * start = (Py_UCS1 * ) data ;
674
+ if (PyUnicode_IS_COMPACT_ASCII (string )) {
675
+ return polyglot_from_string_n ((const char * )data , sizeof (Py_UCS1 ) * len , "ascii" );
676
+ }
677
+ return polyglot_from_string_n ((const char * )data , sizeof (Py_UCS1 ) * len , "latin1" );
678
+ }
679
+ else if (kind == PyUnicode_2BYTE_KIND ) {
680
+ Py_UCS2 * start = (Py_UCS2 * ) data ;
681
+ target = PyMem_New (Py_UCS4 , len );
682
+ if (!target ) {
683
+ PyErr_NoMemory ();
684
+ return NULL ;
685
+ }
686
+ _PyUnicode_CONVERT_BYTES (Py_UCS2 , Py_UCS4 , start , start + len , target );
687
+ result = polyglot_from_string_n ((const char * )target , sizeof (Py_UCS4 ) * len , "UTF-32" );
688
+ free (target );
689
+ return result ;
690
+ }
691
+ assert (kind == PyUnicode_4BYTE_KIND );
692
+ return polyglot_from_string_n ((const char * )data , sizeof (Py_UCS4 ) * len , "UTF-32" );
693
+ }
0 commit comments