@@ -5077,28 +5077,32 @@ load_unaligned(const unsigned char *p, size_t size)
50775077static Py_ssize_t
50785078find_first_nonascii (const unsigned char * start , const unsigned char * end )
50795079{
5080+ // The search is done in `size_t` chunks.
5081+ // The start and end might not be aligned at `size_t` boundaries,
5082+ // so they're handled specially.
5083+
50805084 const unsigned char * p = start ;
50815085
50825086 if (end - start >= SIZEOF_SIZE_T ) {
5083- const unsigned char * p2 = _Py_ALIGN_UP ( p , SIZEOF_SIZE_T );
5087+ // Avoid unaligned read.
50845088#if PY_LITTLE_ENDIAN && HAVE_CTZ
5085- if (p < p2 ) {
5086- size_t u ;
5087- memcpy (& u , p , sizeof (size_t ));
5088- u &= ASCII_CHAR_MASK ;
5089- if (u ) {
5090- return (ctz (u ) - 7 ) / 8 ;
5091- }
5092- p = p2 ;
5089+ size_t u ;
5090+ memcpy (& u , p , sizeof (size_t ));
5091+ u &= ASCII_CHAR_MASK ;
5092+ if (u ) {
5093+ return (ctz (u ) - 7 ) / 8 ;
50935094 }
5095+ p = _Py_ALIGN_DOWN (p + SIZEOF_SIZE_T , SIZEOF_SIZE_T );
50945096#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5097+ const unsigned char * p2 = _Py_ALIGN_UP (p , SIZEOF_SIZE_T );
50955098 while (p < p2 ) {
50965099 if (* p & 0x80 ) {
50975100 return p - start ;
50985101 }
50995102 p ++ ;
51005103 }
51015104#endif
5105+
51025106 const unsigned char * e = end - SIZEOF_SIZE_T ;
51035107 while (p <= e ) {
51045108 size_t u = (* (const size_t * )p ) & ASCII_CHAR_MASK ;
@@ -5115,6 +5119,7 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
51155119 }
51165120 }
51175121#if PY_LITTLE_ENDIAN && HAVE_CTZ
5122+ assert ((end - p ) < SIZEOF_SIZE_T );
51185123 // we can not use *(const size_t*)p to avoid buffer overrun.
51195124 size_t u = load_unaligned (p , end - p ) & ASCII_CHAR_MASK ;
51205125 if (u ) {
0 commit comments