-
-
Notifications
You must be signed in to change notification settings - Fork 33.2k
gh-120196: Faster ascii_decode and find_max_char implementations #120212
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
a54bf86
71ff457
98a6449
10527c6
f0f4139
849a068
fad19a0
f04bb2c
cd0fc5e
8f0fd56
37aee7a
a6fc417
104ca62
d465517
1ce308e
48f1e84
21de804
1ec2113
0258ae0
ec76b74
845eb4e
f8cc68d
89ab2c9
002b7ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| Improve performance of ASCII decoding and maximum character checking | ||
| by allowing vectorization by the compiler on suitable platforms. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,23 +20,45 @@ Py_LOCAL_INLINE(Py_UCS4) | |
| STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) | ||
| { | ||
| const unsigned char *p = (const unsigned char *) begin; | ||
| const unsigned char *_end = (const unsigned char *)end; | ||
| const size_t *aligned_end = (const size_t *)(_end - SIZEOF_SIZE_T); | ||
| const size_t *unrolled_end = aligned_end - 3; | ||
| unsigned char accumulator = 0; | ||
| /* Do not test each character individually, but use bitwise OR and test | ||
| all characters at once. */ | ||
| while (p < _end && !_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { | ||
| accumulator |= *p; | ||
| p += 1; | ||
| } | ||
| if (accumulator & 0x80) { | ||
| return 255; | ||
| } else if (p == end) { | ||
| return 127; | ||
| } | ||
|
|
||
| while (p < end) { | ||
| if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { | ||
| /* Help register allocation */ | ||
| const unsigned char *_p = p; | ||
| while (_p + SIZEOF_SIZE_T <= end) { | ||
| size_t value = *(const size_t *) _p; | ||
| if (value & UCS1_ASCII_CHAR_MASK) | ||
| return 255; | ||
| _p += SIZEOF_SIZE_T; | ||
| } | ||
| p = _p; | ||
| if (p == end) | ||
| break; | ||
| } | ||
| if (*p++ & 0x80) | ||
| /* On 64-bit platforms with 128-bit vectors (x86-64, arm64) the | ||
| compiler can load 4 size_t values into two 16-byte vectors and do a | ||
| vector bitwise OR. */ | ||
| const size_t *_p = (const size_t *)p; | ||
| while (_p < unrolled_end) { | ||
| size_t value = _p[0] | _p[1] | _p[2] | _p[3]; | ||
| if (value & UCS1_ASCII_CHAR_MASK) { | ||
| return 255; | ||
| } | ||
| _p += 4; | ||
| } | ||
| size_t value = 0; | ||
| while (_p < aligned_end) { | ||
| value |= *_p; | ||
| _p += 1; | ||
| } | ||
| p = (const unsigned char *)_p; | ||
| while (p < _end) { | ||
|
||
| value |= *p; | ||
| p += 1; | ||
| } | ||
|
||
| if (value & UCS1_ASCII_CHAR_MASK) { | ||
| return 255; | ||
| } | ||
| return 127; | ||
| } | ||
|
|
@@ -69,13 +91,15 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) | |
| Py_UCS4 mask; | ||
| Py_ssize_t n = end - begin; | ||
| const STRINGLIB_CHAR *p = begin; | ||
| const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4); | ||
| const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 8); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As in the other PR, maybe the 4/8 choice can be chosen at compile time depending on the architecture. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 99% of production work where performance matters will be done on ARM64 (with 16-byte NEON vectors) and x86-64 (with SSE2 vectors); other platforms will not be hurt by this decision. I think there is no reason to complicate the build with choices like this. |
||
| Py_UCS4 max_char; | ||
|
|
||
| max_char = MAX_CHAR_ASCII; | ||
| mask = MASK_ASCII; | ||
| while (p < unrolled_end) { | ||
| STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; | ||
| /* Loading 8 values at once allows platforms that have 16-byte vectors | ||
| to do a vector load and vector bitwise OR. */ | ||
| STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7]; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and here, you would have some #if arch == ... to choose between 4 or 8 values. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel that those vector loads could be macros, for clarity purposes. They would then be optimized by the compiler, but having them as macros might be helpful for future work. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer not to have a macro; the code is good as it is. |
||
| if (bits & mask) { | ||
| if (mask == mask_limit) { | ||
| /* Limit reached */ | ||
|
|
@@ -94,7 +118,7 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) | |
| /* We check the new mask on the same chars in the next iteration */ | ||
| continue; | ||
| } | ||
| p += 4; | ||
| p += 8; | ||
| } | ||
| while (p < end) { | ||
| if (p[0] & mask) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4700,6 +4700,8 @@ static Py_ssize_t | |
| ascii_decode(const char *start, const char *end, Py_UCS1 *dest) | ||
| { | ||
| const char *p = start; | ||
| const char *size_t_end = end - SIZEOF_SIZE_T; | ||
| const char *unrolled_end = end - (4 * SIZEOF_SIZE_T); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO, we should prefer (There is no C API WG formal guideline yet; this is a personal recommendation. If anyone wants a wider discussion, do that in Discourse.) |
||
|
|
||
| #if SIZEOF_SIZE_T <= SIZEOF_VOID_P | ||
|
||
| if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T) | ||
|
|
@@ -4710,7 +4712,25 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest) | |
| /* Help allocation */ | ||
| const char *_p = p; | ||
| Py_UCS1 * q = dest; | ||
| while (_p + SIZEOF_SIZE_T <= end) { | ||
| while (_p <= unrolled_end) { | ||
| const size_t *restrict __p = (const size_t *)_p; | ||
|
||
| size_t value0 = __p[0]; | ||
| size_t value1 = __p[1]; | ||
| size_t value2 = __p[2]; | ||
| size_t value3 = __p[3]; | ||
| size_t value = value0 | value1 | value2 | value3; | ||
| if (value & ASCII_CHAR_MASK) { | ||
| break; | ||
| } | ||
| size_t *restrict _q = (size_t *)q; | ||
| _q[0] = value0; | ||
| _q[1] = value1; | ||
| _q[2] = value2; | ||
| _q[3] = value3; | ||
| _p += (4 * SIZEOF_SIZE_T); | ||
| q += (4 * SIZEOF_SIZE_T); | ||
| } | ||
| while (_p <= size_t_end) { | ||
|
||
| size_t value = *(const size_t *) _p; | ||
| if (value & ASCII_CHAR_MASK) | ||
| break; | ||
|
|
@@ -4733,7 +4753,15 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest) | |
| if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { | ||
| /* Help allocation */ | ||
| const char *_p = p; | ||
| while (_p + SIZEOF_SIZE_T <= end) { | ||
| while (_p <= unrolled_end) { | ||
| const size_t *restrict __p = (const size_t *)_p; | ||
| size_t value = __p[0] | __p[1] | __p[2] | __p[3]; | ||
| if (value & ASCII_CHAR_MASK) { | ||
| break; | ||
| } | ||
| _p += (4 * SIZEOF_SIZE_T); | ||
| } | ||
| while (_p <= size_t_end) { | ||
|
||
| size_t value = *(const size_t *) _p; | ||
| if (value & ASCII_CHAR_MASK) | ||
| break; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just checking whether I understand correctly:
`aligned_end` is not aligned, but good enough to serve as the end of the loop over aligned values. To make it really aligned we would need to do something like `aligned_end = _Py_SIZE_ROUND_DOWN(_end, SIZEOF_SIZE_T)`?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that is the gist of it. I tried something similar to _Py_SIZE_ROUND_DOWN and got segfaults, so I opted for this simpler, tried-and-true solution. The name
`aligned_end` is probably not the best, but I can't think of a better name now.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be something like
Would be interested in understanding why you got a segfault though.