11
11
12
12
/* --- Internal Unicode Operations ---------------------------------------- */
13
13
14
- #ifndef USE_UNICODE_WCHAR_CACHE
15
- # define USE_UNICODE_WCHAR_CACHE 1
16
- #endif /* USE_UNICODE_WCHAR_CACHE */
17
-
18
- /* Since splitting on whitespace is an important use case, and
19
- whitespace in most situations is solely ASCII whitespace, we
20
- optimize for the common case by using a quick look-up table
21
- _Py_ascii_whitespace (see below) with an inlined check.
22
-
23
- */
24
- #define Py_UNICODE_ISSPACE (ch ) \
25
- ((Py_UCS4)(ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
26
-
27
- #define Py_UNICODE_ISLOWER (ch ) _PyUnicode_IsLowercase(ch)
28
- #define Py_UNICODE_ISUPPER (ch ) _PyUnicode_IsUppercase(ch)
29
- #define Py_UNICODE_ISTITLE (ch ) _PyUnicode_IsTitlecase(ch)
30
- #define Py_UNICODE_ISLINEBREAK (ch ) _PyUnicode_IsLinebreak(ch)
31
-
32
- #define Py_UNICODE_TOLOWER (ch ) _PyUnicode_ToLowercase(ch)
33
- #define Py_UNICODE_TOUPPER (ch ) _PyUnicode_ToUppercase(ch)
34
- #define Py_UNICODE_TOTITLE (ch ) _PyUnicode_ToTitlecase(ch)
35
-
36
- #define Py_UNICODE_ISDECIMAL (ch ) _PyUnicode_IsDecimalDigit(ch)
37
- #define Py_UNICODE_ISDIGIT (ch ) _PyUnicode_IsDigit(ch)
38
- #define Py_UNICODE_ISNUMERIC (ch ) _PyUnicode_IsNumeric(ch)
39
- #define Py_UNICODE_ISPRINTABLE (ch ) _PyUnicode_IsPrintable(ch)
14
+ // Static inline functions to work with surrogates
15
+ static inline int Py_UNICODE_IS_SURROGATE (Py_UCS4 ch ) {
16
+ return (0xD800 <= ch && ch <= 0xDFFF );
17
+ }
18
+ static inline int Py_UNICODE_IS_HIGH_SURROGATE (Py_UCS4 ch ) {
19
+ return (0xD800 <= ch && ch <= 0xDBFF );
20
+ }
21
+ static inline int Py_UNICODE_IS_LOW_SURROGATE (Py_UCS4 ch ) {
22
+ return (0xDC00 <= ch && ch <= 0xDFFF );
23
+ }
40
24
41
- #define Py_UNICODE_TODECIMAL (ch ) _PyUnicode_ToDecimalDigit(ch)
42
- #define Py_UNICODE_TODIGIT (ch ) _PyUnicode_ToDigit(ch)
43
- #define Py_UNICODE_TONUMERIC (ch ) _PyUnicode_ToNumeric(ch)
25
+ // Join two surrogate characters and return a single Py_UCS4 value.
26
+ static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES (Py_UCS4 high , Py_UCS4 low ) {
27
+ assert (Py_UNICODE_IS_HIGH_SURROGATE (high ));
28
+ assert (Py_UNICODE_IS_LOW_SURROGATE (low ));
29
+ return 0x10000 + (((high & 0x03FF ) << 10 ) | (low & 0x03FF ));
30
+ }
44
31
45
- #define Py_UNICODE_ISALPHA (ch ) _PyUnicode_IsAlpha(ch)
32
+ // High surrogate = top 10 bits added to 0xD800.
33
+ // The character must be in the range [U+10000; U+10ffff].
34
+ static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE (Py_UCS4 ch ) {
35
+ assert (0x10000 <= ch && ch <= 0x10ffff );
36
+ return (0xD800 - (0x10000 >> 10 ) + (ch >> 10 ));
37
+ }
46
38
47
- #define Py_UNICODE_ISALNUM (ch ) \
48
- (Py_UNICODE_ISALPHA(ch) || \
49
- Py_UNICODE_ISDECIMAL (ch) || \
50
- Py_UNICODE_ISDIGIT(ch) || \
51
- Py_UNICODE_ISNUMERIC(ch))
52
-
53
- /* macros to work with surrogates */
54
- #define Py_UNICODE_IS_SURROGATE (ch ) (0xD800 <= (ch) && (ch) <= 0xDFFF )
55
- #define Py_UNICODE_IS_HIGH_SURROGATE (ch ) (0xD800 <= (ch) && (ch) <= 0xDBFF )
56
- #define Py_UNICODE_IS_LOW_SURROGATE (ch ) (0xDC00 <= (ch) && (ch) <= 0xDFFF )
57
- /* Join two surrogate characters and return a single Py_UCS4 value. */
58
- #define Py_UNICODE_JOIN_SURROGATES (high, low ) \
59
- (((((Py_UCS4)(high) & 0x03FF ) << 10 ) | \
60
- ((Py_UCS4)(low) & 0x03FF )) + 0x10000 )
61
- /* high surrogate = top 10 bits added to D800 */
62
- #define Py_UNICODE_HIGH_SURROGATE (ch ) (0xD800 - (0x10000 >> 10 ) + ((ch) >> 10 ))
63
- /* low surrogate = bottom 10 bits added to DC00 */
64
- #define Py_UNICODE_LOW_SURROGATE (ch ) (0xDC00 + ((ch) & 0x3FF ))
39
+ // Low surrogate = bottom 10 bits added to 0xDC00.
40
+ // The character must be in the range [U+10000; U+10ffff].
41
+ static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE (Py_UCS4 ch ) {
42
+ assert (0x10000 <= ch && ch <= 0x10ffff );
43
+ return (0xDC00 + (ch & 0x3FF ));
44
+ }
65
45
66
46
/* --- Unicode Type ------------------------------------------------------- */
67
47
@@ -160,16 +140,12 @@ typedef struct {
160
140
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
161
141
set, use the PyASCIIObject structure. */
162
142
unsigned int ascii :1 ;
163
- /* The ready flag indicates whether the object layout is initialized
164
- completely. This means that this is either a compact object, or
165
- the data pointer is filled out. The bit is redundant, and helps
166
- to minimize the test in PyUnicode_IS_READY(). */
167
- unsigned int ready:1 ;
143
+ /* The object is statically allocated. */
144
+ unsigned int statically_allocated :1 ;
168
145
/* Padding to ensure that PyUnicode_DATA() is always aligned to
169
146
4 bytes (see issue #19537 on m68k). */
170
147
unsigned int :24 ;
171
148
} state ;
172
- wchar_t *wstr; /* wchar_t representation (null-terminated) */
173
149
} PyASCIIObject ;
174
150
175
151
/* Non-ASCII strings allocated through PyUnicode_New use the
@@ -180,8 +156,6 @@ typedef struct {
180
156
Py_ssize_t utf8_length ; /* Number of bytes in utf8, excluding the
181
157
* terminating \0. */
182
158
char * utf8 ; /* UTF-8 representation (null-terminated) */
183
- Py_ssize_t wstr_length; /* Number of code points in wstr, possible
184
- * surrogates count as two code points. */
185
159
} PyCompactUnicodeObject ;
186
160
187
161
/* Object format for Unicode subclasses. */
@@ -510,114 +484,12 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
510
484
Py_ssize_t start ,
511
485
Py_ssize_t end );
512
486
513
- /* --- Legacy deprecated API ---------------------------------------------- */
514
-
515
- /* Return a read-only pointer to the Unicode object's internal
516
- Py_UNICODE buffer.
517
- If the wchar_t/Py_UNICODE representation is not yet available, this
518
- function will calculate it. */
519
- Py_DEPRECATED (3.3 ) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
520
- PyObject *unicode /* Unicode object */
521
- );
522
-
523
- /* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
524
- contains null characters. */
525
- PyAPI_FUNC (const Py_UNICODE *) _PyUnicode_AsUnicode(
526
- PyObject *unicode /* Unicode object */
527
- );
528
-
529
- /* Fast access macros */
530
-
531
- Py_DEPRECATED (3.3 )
532
- static inline Py_ssize_t PyUnicode_WSTR_LENGTH(PyObject *op)
533
- {
534
- if (PyUnicode_IS_COMPACT_ASCII (op)) {
535
- return _PyASCIIObject_CAST (op)->length ;
536
- }
537
- else {
538
- return _PyCompactUnicodeObject_CAST (op)->wstr_length ;
539
- }
540
- }
541
- #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
542
- # define PyUnicode_WSTR_LENGTH (op ) PyUnicode_WSTR_LENGTH(_PyObject_CAST(op))
543
- #endif
544
-
545
- /* Returns the deprecated Py_UNICODE representation's size in code units
546
- (this includes surrogate pairs as 2 units).
547
- If the Py_UNICODE representation is not available, it will be computed
548
- on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
549
-
550
- Py_DEPRECATED (3.3 )
551
- static inline Py_ssize_t PyUnicode_GET_SIZE(PyObject *op)
552
- {
553
- _Py_COMP_DIAG_PUSH
554
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
555
- if (_PyASCIIObject_CAST (op)->wstr == _Py_NULL) {
556
- (void )PyUnicode_AsUnicode (op);
557
- assert (_PyASCIIObject_CAST (op)->wstr != _Py_NULL);
558
- }
559
- return PyUnicode_WSTR_LENGTH (op);
560
- _Py_COMP_DIAG_POP
561
- }
562
- #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
563
- # define PyUnicode_GET_SIZE (op ) PyUnicode_GET_SIZE(_PyObject_CAST(op))
564
- #endif
565
-
566
- Py_DEPRECATED (3.3 )
567
- static inline Py_ssize_t PyUnicode_GET_DATA_SIZE(PyObject *op)
568
- {
569
- _Py_COMP_DIAG_PUSH
570
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
571
- return PyUnicode_GET_SIZE (op) * Py_UNICODE_SIZE;
572
- _Py_COMP_DIAG_POP
573
- }
574
- #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
575
- # define PyUnicode_GET_DATA_SIZE (op ) PyUnicode_GET_DATA_SIZE(_PyObject_CAST(op))
576
- #endif
577
-
578
- /* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
579
- representation on demand. Using this macro is very inefficient now,
580
- try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
581
- use PyUnicode_WRITE() and PyUnicode_READ(). */
582
-
583
- Py_DEPRECATED (3.3 )
584
- static inline Py_UNICODE* PyUnicode_AS_UNICODE(PyObject *op)
585
- {
586
- wchar_t *wstr = _PyASCIIObject_CAST (op)->wstr ;
587
- if (wstr != _Py_NULL) {
588
- return wstr;
589
- }
590
-
591
- _Py_COMP_DIAG_PUSH
592
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
593
- return PyUnicode_AsUnicode (op);
594
- _Py_COMP_DIAG_POP
595
- }
596
- #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
597
- # define PyUnicode_AS_UNICODE (op ) PyUnicode_AS_UNICODE(_PyObject_CAST(op))
598
- #endif
599
-
600
- Py_DEPRECATED (3.3 )
601
- static inline const char* PyUnicode_AS_DATA(PyObject *op)
602
- {
603
- _Py_COMP_DIAG_PUSH
604
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
605
- Py_UNICODE *data = PyUnicode_AS_UNICODE (op);
606
- // In C++, casting directly PyUnicode* to const char* is not valid
607
- return _Py_STATIC_CAST (const char *, _Py_STATIC_CAST (const void *, data));
608
- _Py_COMP_DIAG_POP
609
- }
610
- #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
611
- # define PyUnicode_AS_DATA (op ) PyUnicode_AS_DATA(_PyObject_CAST(op))
612
- #endif
613
-
614
-
615
487
/* --- _PyUnicodeWriter API ----------------------------------------------- */
616
488
617
489
typedef struct {
618
490
PyObject * buffer ;
619
491
void * data ;
620
- enum PyUnicode_Kind kind;
492
+ int kind ;
621
493
Py_UCS4 maxchar ;
622
494
Py_ssize_t size ;
623
495
Py_ssize_t pos ;
@@ -668,16 +540,15 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
668
540
669
541
Return 0 on success, raise an exception and return -1 on error. */
670
542
#define _PyUnicodeWriter_PrepareKind (WRITER , KIND ) \
671
- (assert((KIND) != PyUnicode_WCHAR_KIND), \
672
- (KIND) <= (WRITER)->kind \
543
+ ((KIND) <= (WRITER)->kind \
673
544
? 0 \
674
545
: _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
675
546
676
547
/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
677
548
macro instead. */
678
549
PyAPI_FUNC (int )
679
550
_PyUnicodeWriter_PrepareKindInternal (_PyUnicodeWriter * writer ,
680
- enum PyUnicode_Kind kind);
551
+ int kind );
681
552
682
553
/* Append a Unicode character.
683
554
Return 0 on success, raise an exception and return -1 on error. */
0 commit comments