@@ -39,11 +39,11 @@ void *memmove(void *dest, const void *src, size_t n) {
#ifdef __wasm_simd128__

__attribute__((weak))
-int memcmp(const void *v1, const void *v2, size_t n) {
+int memcmp(const void *vl, const void *vr, size_t n) {
  // Scalar algorithm.
  if (n < sizeof(v128_t)) {
-    const unsigned char *u1 = (unsigned char *)v1;
-    const unsigned char *u2 = (unsigned char *)v2;
+    const unsigned char *u1 = (unsigned char *)vl;
+    const unsigned char *u2 = (unsigned char *)vr;
    while (n--) {
      if (*u1 != *u2) return *u1 - *u2;
      u1++;
@@ -56,32 +56,32 @@ int memcmp(const void *v1, const void *v2, size_t n) {
  // Find the first different character in the objects.
  // Unaligned loads handle the case where the objects
  // have mismatching alignments.
-  const v128_t *w1 = (v128_t *)v1;
-  const v128_t *w2 = (v128_t *)v2;
+  const v128_t *v1 = (v128_t *)vl;
+  const v128_t *v2 = (v128_t *)vr;
  while (n) {
-    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(w1), wasm_v128_load(w2));
+    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(v1), wasm_v128_load(v2));
    // Bitmask is slow on AArch64, all_true is much faster.
    if (!wasm_i8x16_all_true(cmp)) {
      // Find the offset of the first zero bit (little-endian).
      size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
-      const unsigned char *u1 = (unsigned char *)w1 + ctz;
-      const unsigned char *u2 = (unsigned char *)w2 + ctz;
+      const unsigned char *u1 = (unsigned char *)v1 + ctz;
+      const unsigned char *u2 = (unsigned char *)v2 + ctz;
      // This may help the compiler if the function is inlined.
      __builtin_assume(*u1 - *u2 != 0);
      return *u1 - *u2;
    }
    // This makes n a multiple of sizeof(v128_t)
    // for every iteration except the first.
    size_t align = (n - 1) % sizeof(v128_t) + 1;
-    w1 = (v128_t *)((char *)w1 + align);
-    w2 = (v128_t *)((char *)w2 + align);
+    v1 = (v128_t *)((char *)v1 + align);
+    v2 = (v128_t *)((char *)v2 + align);
    n -= align;
  }
  return 0;
}

__attribute__((weak))
-void *memchr(const void *v, int c, size_t n) {
+void *memchr(const void *s, int c, size_t n) {
  // When n is zero, a function that locates a character finds no occurrence.
  // Otherwise, decrement n to ensure sub_overflow overflows
  // when n would go equal-to-or-below zero.
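The `(n - 1) % sizeof(v128_t) + 1` step in the memcmp loop above is the subtle part: the first iteration advances by the ragged remainder, so every later load is a full, overlapping 16-byte block and n reaches zero exactly. A minimal scalar sketch of that arithmetic (the 37-byte size is an arbitrary made-up example):

#include <assert.h>
#include <stddef.h>

int main(void) {
  const size_t vec = 16;  // stand-in for sizeof(v128_t)
  size_t n = 37;          // arbitrary object size
  size_t compares = 0;
  while (n) {
    // One overlapping 16-byte compare happens here in the real memcmp.
    compares++;
    size_t step = (n - 1) % vec + 1;  // 5 on the first pass, then 16
    n -= step;
  }
  assert(compares == 3);  // 37 bytes covered by 3 overlapping compares
  return 0;
}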
@@ -92,12 +92,13 @@ void *memchr(const void *v, int c, size_t n) {
  // memchr must behave as if it reads characters sequentially
  // and stops as soon as a match is found.
  // Aligning ensures loads beyond the first match are safe.
-  uintptr_t align = (uintptr_t)v % sizeof(v128_t);
-  const v128_t *w = (v128_t *)((char *)v - align);
-  const v128_t wc = wasm_i8x16_splat(c);
+  // Volatile avoids compiler tricks around out of bounds loads.
+  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
+  const volatile v128_t *v = (v128_t *)((char *)s - align);
+  const v128_t vc = wasm_i8x16_splat(c);

  for (;;) {
-    const v128_t cmp = wasm_i8x16_eq(*w, wc);
+    const v128_t cmp = wasm_i8x16_eq(*v, vc);
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Clear the bits corresponding to alignment (little-endian)
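Because the load is aligned down to the start of the containing 16-byte block, any match bits that fall before s have to be discarded; that is what the `>> align << align` masking does. A small check of that step (the mask value and alignment are made up):

#include <assert.h>

int main(void) {
  // Bit i set means byte i of the 16-byte block compared equal to c.
  unsigned mask = 0x0013;            // made-up matches at offsets 0, 1 and 4
  unsigned align = 3;                // s starts 3 bytes into the block
  mask = mask >> align << align;     // drop matches before s: 0x0010
  assert(__builtin_ctz(mask) == 4);  // first usable match is at offset 4
  return 0;
}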
@@ -113,36 +114,36 @@ void *memchr(const void *v, int c, size_t n) {
        // That's a match, unless it is beyond the end of the object.
        // Recall that we decremented n, so less-than-or-equal-to is correct.
        size_t ctz = __builtin_ctz(mask);
-        return ctz - align <= n ? (char *)w + ctz : NULL;
+        return ctz - align <= n ? (char *)v + ctz : NULL;
      }
    }
    // Decrement n; if it overflows we're done.
    if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
      return NULL;
    }
    align = 0;
-    w++;
+    v++;
  }
}

__attribute__((weak))
-void *memrchr(const void *v, int c, size_t n) {
+void *memrchr(const void *s, int c, size_t n) {
  // memrchr is allowed to read up to n bytes from the object.
  // Search backward for the last matching character.
-  const v128_t *w = (v128_t *)((char *)v + n);
-  const v128_t wc = wasm_i8x16_splat(c);
+  const v128_t *v = (v128_t *)((char *)s + n);
+  const v128_t vc = wasm_i8x16_splat(c);
  for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
-    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(--w), wc);
+    const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(--v), vc);
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Find the offset of the last one bit (little-endian).
      size_t clz = __builtin_clz(wasm_i8x16_bitmask(cmp)) - 15;
-      return (char *)(w + 1) - clz;
+      return (char *)(v + 1) - clz;
    }
  }

  // Scalar algorithm.
-  const char *a = (char *)w;
+  const char *a = (char *)v;
  while (n--) {
    if (*(--a) == (char)c) return (char *)a;
  }
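In memrchr the compare bitmask is at most 16 bits wide but sits in a 32-bit int, so `__builtin_clz(mask) - 15` turns the position of the highest set bit into a distance back from the end of the 16-byte block, which is what `(char *)(v + 1) - clz` consumes. A tiny check of that arithmetic (the mask value is made up):

#include <assert.h>

int main(void) {
  unsigned mask = 0x0210;  // made-up matches at offsets 4 and 9; highest is k = 9
  unsigned k = 9;
  // clz counts leading zeros of a 32-bit value, so clz(mask) == 31 - k,
  // and clz(mask) - 15 == 16 - k: the offset measured from the block's end.
  assert(__builtin_clz(mask) - 15 == 16 - k);
  return 0;
}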
@@ -154,12 +155,13 @@ size_t strlen(const char *s) {
  // strlen must stop as soon as it finds the terminator.
  // Aligning ensures loads beyond the terminator are safe.
  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
+  const volatile v128_t *v = (v128_t *)(s - align);

  for (;;) {
+    const v128_t vv = *v;
    // Bitmask is slow on AArch64, all_true is much faster.
-    if (!wasm_i8x16_all_true(*w)) {
-      const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
+    if (!wasm_i8x16_all_true(vv)) {
+      const v128_t cmp = wasm_i8x16_eq(vv, (v128_t){});
      // Clear the bits corresponding to alignment (little-endian)
      // so we can count trailing zeros.
      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
@@ -170,11 +172,11 @@ size_t strlen(const char *s) {
      // it's as if we didn't find anything.
      if (mask) {
        // Find the offset of the first one bit (little-endian).
-        return (char *)w - s + __builtin_ctz(mask);
+        return (char *)v - s + __builtin_ctz(mask);
      }
    }
    align = 0;
-    w++;
+    v++;
  }
}

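Taken together, the strlen loop only ever loads whole aligned 16-byte blocks, so it never straddles a page boundary even though it touches bytes before s and past the terminator; the volatile qualifier keeps the compiler from exploiting those technically out-of-bounds reads, mirroring the comment in the diff. A scalar model of the same block-and-mask idea, using 8-byte blocks and a plain bitmask in place of v128_t (illustrative only, not how the SIMD version is built):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Scalar model of the block-wise strlen above: 8-byte "blocks" and an
// unsigned bitmask stand in for v128_t and wasm_i8x16_bitmask.
static size_t strlen_model(const char *s) {
  const size_t B = 8;
  uintptr_t align = (uintptr_t)s % B;
  const char *block = s - align;  // aligned start of the containing block
  for (;;) {
    unsigned mask = 0;
    for (size_t i = 0; i < B; i++)  // bit i set when byte i is the terminator
      if (block[i] == '\0') mask |= 1u << i;
    mask = mask >> align << align;  // ignore bytes that precede s
    if (mask) return (size_t)(block - s + __builtin_ctz(mask));
    align = 0;
    block += B;
  }
}

int main(void) {
  char buf[32] = "hello, wasm";
  // Offset by 8 so the model never reads before the start of buf.
  assert(strlen_model(buf + 8) == strlen(buf + 8));
  return 0;
}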
@@ -268,12 +270,14 @@ int strncmp(const char *s1, const char *s2, size_t n) {
static char *__strchrnul(const char *s, int c) {
  // strchrnul must stop as soon as it finds the terminator.
  // Aligning ensures loads beyond the terminator are safe.
+  // Volatile avoids compiler tricks around out of bounds loads.
  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
-  const v128_t wc = wasm_i8x16_splat(c);
+  const volatile v128_t *v = (v128_t *)(s - align);
+  const v128_t vc = wasm_i8x16_splat(c);

  for (;;) {
-    const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
+    const v128_t vv = *v;
+    const v128_t cmp = wasm_i8x16_eq(vv, (v128_t){}) | wasm_i8x16_eq(vv, vc);
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Clear the bits corresponding to alignment (little-endian)
@@ -286,11 +290,11 @@ static char *__strchrnul(const char *s, int c) {
      // it's as if we didn't find anything.
      if (mask) {
        // Find the offset of the first one bit (little-endian).
-        return (char *)w + __builtin_ctz(mask);
+        return (char *)v + __builtin_ctz(mask);
      }
    }
    align = 0;
-    w++;
+    v++;
  }
}

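__strchrnul folds two searches into one pass: OR-ing the compare against zero with the compare against c makes the first set bit point at whichever of the terminator or c appears first. A scalar model of that behaviour, plus the usual way strchr can be layered on top of it (shown only as an illustration with hypothetical helper names; this file's actual strchr is outside the diff):

#include <assert.h>
#include <stddef.h>

// Scalar model of __strchrnul: stop at c or at the terminator,
// whichever comes first, and return a pointer to that byte.
static const char *strchrnul_model(const char *s, int c) {
  while (*s && *s != (char)c) s++;
  return s;
}

// strchr is then a thin wrapper: NULL when the walk stopped at the
// terminator instead of c (and strchr(s, 0) still finds the terminator).
static const char *strchr_model(const char *s, int c) {
  const char *r = strchrnul_model(s, c);
  return *r == (char)c ? r : NULL;
}

int main(void) {
  assert(*strchrnul_model("wasm", 's') == 's');
  assert(*strchrnul_model("wasm", 'z') == '\0');
  assert(strchr_model("wasm", 'z') == NULL);
  return 0;
}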
@@ -371,14 +375,15 @@ __attribute__((weak))
size_t strspn(const char *s, const char *c) {
  // strspn must stop as soon as it finds the terminator.
  // Aligning ensures loads beyond the terminator are safe.
+  // Volatile avoids compiler tricks around out of bounds loads.
  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
+  const volatile v128_t *v = (v128_t *)(s - align);

  if (!c[0]) return 0;
  if (!c[1]) {
-    const v128_t wc = wasm_i8x16_splat(*c);
+    const v128_t vc = wasm_i8x16_splat(*c);
    for (;;) {
-      const v128_t cmp = wasm_i8x16_eq(*w, wc);
+      const v128_t cmp = wasm_i8x16_eq(*v, vc);
      // Bitmask is slow on AArch64, all_true is much faster.
      if (!wasm_i8x16_all_true(cmp)) {
        // Clear the bits corresponding to alignment (little-endian)
@@ -391,11 +396,11 @@ size_t strspn(const char *s, const char *c) {
        // it's as if we didn't find anything.
        if (mask) {
          // Find the offset of the first one bit (little-endian).
-          return (char *)w - s + __builtin_ctz(mask);
+          return (char *)v - s + __builtin_ctz(mask);
        }
      }
      align = 0;
-      w++;
+      v++;
    }
  }

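For a one-character set, strspn reduces to measuring the leading run of that character, which is all the fast path above vectorizes. A scalar equivalent (illustrative only):

#include <assert.h>
#include <stddef.h>

// Scalar equivalent of the single-character fast path: count the leading
// run of ch. The caller has already ruled out ch == '\0', so the loop
// always stops at or before the terminator.
static size_t strspn_one(const char *s, char ch) {
  size_t i = 0;
  while (s[i] == ch) i++;
  return i;
}

int main(void) {
  assert(strspn_one("aaab", 'a') == 3);
  assert(strspn_one("baaa", 'a') == 0);
  return 0;
}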
@@ -407,7 +412,7 @@ size_t strspn(const char *s, const char *c) {
  }

  for (;;) {
-    const v128_t cmp = __wasm_v128_chkbits(bitmap, *w);
+    const v128_t cmp = __wasm_v128_chkbits(bitmap, *v);
    // Bitmask is slow on AArch64, all_true is much faster.
    if (!wasm_i8x16_all_true(cmp)) {
      // Clear the bits corresponding to alignment (little-endian)
@@ -420,11 +425,11 @@ size_t strspn(const char *s, const char *c) {
      // it's as if we didn't find anything.
      if (mask) {
        // Find the offset of the first one bit (little-endian).
-        return (char *)w - s + __builtin_ctz(mask);
+        return (char *)v - s + __builtin_ctz(mask);
      }
    }
    align = 0;
-    w++;
+    v++;
  }
}

@@ -434,8 +439,9 @@ size_t strcspn(const char *s, const char *c) {

  // strcspn must stop as soon as it finds the terminator.
  // Aligning ensures loads beyond the terminator are safe.
+  // Volatile avoids compiler tricks around out of bounds loads.
  uintptr_t align = (uintptr_t)s % sizeof(v128_t);
-  const v128_t *w = (v128_t *)(s - align);
+  const volatile v128_t *v = (v128_t *)(s - align);

  __wasm_v128_bitmap256_t bitmap = {};

@@ -445,7 +451,7 @@ size_t strcspn(const char *s, const char *c) {
  } while (*c++);

  for (;;) {
-    const v128_t cmp = __wasm_v128_chkbits(bitmap, *w);
+    const v128_t cmp = __wasm_v128_chkbits(bitmap, *v);
    // Bitmask is slow on AArch64, any_true is much faster.
    if (wasm_v128_any_true(cmp)) {
      // Clear the bits corresponding to alignment (little-endian)
@@ -458,11 +464,11 @@ size_t strcspn(const char *s, const char *c) {
      // it's as if we didn't find anything.
      if (mask) {
        // Find the offset of the first one bit (little-endian).
-        return (char *)w - s + __builtin_ctz(mask);
+        return (char *)v - s + __builtin_ctz(mask);
      }
    }
    align = 0;
-    w++;
+    v++;
  }
}

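With a multi-character set, strspn and strcspn classify all 16 bytes at once against a 256-bit bitmap (one bit per possible byte value) via __wasm_v128_chkbits, whose definition lies outside this diff. A scalar model of just the membership idea, with hypothetical helper names rather than the file's actual API:

#include <assert.h>
#include <stdint.h>

// One bit per possible byte value; a scalar stand-in for the file's
// __wasm_v128_bitmap256_t, which the SIMD code checks 16 bytes at a time.
typedef struct { uint32_t w[8]; } bitmap256_model;

static void bitmap_set(bitmap256_model *b, unsigned char ch) {
  b->w[ch >> 5] |= 1u << (ch & 31);
}

static int bitmap_test(const bitmap256_model *b, unsigned char ch) {
  return (b->w[ch >> 5] >> (ch & 31)) & 1;
}

int main(void) {
  // Build the set once; every scanned byte then becomes a constant-time
  // lookup, regardless of how many characters are in c.
  bitmap256_model set = {{0}};
  for (const char *c = "abc"; *c; c++) bitmap_set(&set, (unsigned char)*c);
  assert(bitmap_test(&set, 'b'));
  assert(!bitmap_test(&set, 'z'));
  return 0;
}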