Skip to content

Commit ec25996

Browse files
committed
Use memcpy to avoid unaligned read in find_first_nonascii
1 parent 5eb7fd4 commit ec25996

File tree

1 file changed

+12
-49
lines changed

1 file changed

+12
-49
lines changed

Objects/unicodeobject.c

Lines changed: 12 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5018,53 +5018,6 @@ ctz(size_t v)
50185018
#define HAVE_CTZ 0
50195019
#endif
50205020

5021-
#if HAVE_CTZ && PY_LITTLE_ENDIAN
5022-
// Load p[0]..p[size-1] into a size_t without unaligned access or reading past the end.
5023-
static size_t
5024-
load_unaligned(const unsigned char *p, size_t size)
5025-
{
5026-
union {
5027-
size_t s;
5028-
unsigned char b[SIZEOF_SIZE_T];
5029-
} u;
5030-
u.s = 0;
5031-
// This switch statement assumes little endian because:
5032-
// * a union is faster than bitwise OR and shift.
5033-
// * big endian machines are rare and hard to maintain.
5034-
switch (size) {
5035-
default:
5036-
#if SIZEOF_SIZE_T == 8
5037-
case 8:
5038-
u.b[7] = p[7];
5039-
_Py_FALLTHROUGH;
5040-
case 7:
5041-
u.b[6] = p[6];
5042-
_Py_FALLTHROUGH;
5043-
case 6:
5044-
u.b[5] = p[5];
5045-
_Py_FALLTHROUGH;
5046-
case 5:
5047-
u.b[4] = p[4];
5048-
_Py_FALLTHROUGH;
5049-
#endif
5050-
case 4:
5051-
u.b[3] = p[3];
5052-
_Py_FALLTHROUGH;
5053-
case 3:
5054-
u.b[2] = p[2];
5055-
_Py_FALLTHROUGH;
5056-
case 2:
5057-
u.b[1] = p[1];
5058-
_Py_FALLTHROUGH;
5059-
case 1:
5060-
u.b[0] = p[0];
5061-
break;
5062-
case 0:
5063-
break;
5064-
}
5065-
return u.s;
5066-
}
5067-
#endif
50685021

50695022
/*
50705023
* Find the first non-ASCII character in a byte sequence.
@@ -5077,12 +5030,17 @@ load_unaligned(const unsigned char *p, size_t size)
50775030
static Py_ssize_t
50785031
find_first_nonascii(const unsigned char *start, const unsigned char *end)
50795032
{
5033+
// The search is done in `size_t` chunks.
5034+
// The start and end might not be aligned at `size_t` boundaries,
5035+
// so they're handled specially.
5036+
50805037
const unsigned char *p = start;
50815038

50825039
if (end - start >= SIZEOF_SIZE_T) {
50835040
const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
50845041
#if PY_LITTLE_ENDIAN && HAVE_CTZ
50855042
if (p < p2) {
5043+
// Avoid unaligned read.
50865044
size_t u;
50875045
memcpy(&u, p, sizeof(size_t));
50885046
u &= ASCII_CHAR_MASK;
@@ -5114,9 +5072,14 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
51145072
p += SIZEOF_SIZE_T;
51155073
}
51165074
}
5075+
5076+
// At most SIZEOF_SIZE_T bytes are left.
5077+
assert(end - p <= SIZEOF_SIZE_T);
51175078
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5118-
// we can not use *(const size_t*)p to avoid buffer overrun.
5119-
size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5079+
// Avoid unaligned read and read ahead.
5080+
size_t u = 0;
5081+
memcpy(&u, p, end - p);
5082+
u &= ASCII_CHAR_MASK;
51205083
if (u) {
51215084
return p - start + (ctz(u) - 7) / 8;
51225085
}

0 commit comments

Comments
 (0)