@@ -40,27 +40,32 @@ void *memmove(void *dest, const void *src, size_t n) {
 
 // SIMD versions of some string.h functions.
 //
-// These assume aligned v128_t reads can't fail,
-// and so can't unaligned reads up to the last
+// These assume aligned v128_t loads can't fail,
+// and neither can unaligned loads up to the last
 // aligned address less than memory size.
 //
 // These also assume unaligned access is not painfully slow,
 // but that bitmask extraction is really slow on AArch64.
 
 __attribute__((weak))
 int memcmp(const void *v1, const void *v2, size_t n) {
+  // memcmp can read up to n bytes from each object.
+  // Using unaligned loads to handle the case where
+  // the objects have mismatching alignments.
   const v128_t *w1 = v1;
   const v128_t *w2 = v2;
   for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
     w1++;
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (n--) {
     if (*u1 != *u2) return *u1 - *u2;
     u1++;
@@ -71,24 +76,40 @@ int memcmp(const void *v1, const void *v2, size_t n) {
 
 __attribute__((weak))
 void *memchr(const void *v, int c, size_t n) {
+  // When n is zero, a function that locates a character finds no occurrence.
+  // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+  // when n would reach or drop below zero.
   if (n-- == 0) {
     return NULL;
   }
 
+  // memchr must behave as if it reads characters sequentially
+  // and stops as soon as a match is found.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)v % sizeof(v128_t);
   const v128_t *w = (void *)(v - align);
   const v128_t wc = wasm_i8x16_splat(c);
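+  // wc holds the search byte in every one of the 16 lanes.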
 
   while (true) {
     const v128_t cmp = wasm_i8x16_eq(*w, wc);
+    // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
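+      // Shifting right then left zeroes the low align bits.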
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
+      // If the mask is zero because of alignment,
+      // it's as if we didn't find anything.
       if (mask) {
+        // We found a match, unless it is beyond the end of the object.
+        // Recall that we decremented n, so less-than-or-equal-to is correct.
         size_t ctz = __builtin_ctz(mask);
         return ctz <= n + align ? (void *)w + ctz : NULL;
       }
     }
+    // Decrement n; if it "overflows" we're done.
     if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
       return NULL;
     }
@@ -99,13 +120,20 @@ void *memchr(const void *v, int c, size_t n) {
 
 __attribute__((weak))
 size_t strlen(const char *s) {
+  // strlen must stop as soon as it finds the terminator.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
   const v128_t *w = (void *)(s - align);
 
   while (true) {
+    // Bitmask is slow on AArch64, all_true is much faster.
     if (!wasm_i8x16_all_true(*w)) {
       const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
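+      // (v128_t){} is all zeroes, so equal lanes mark NUL bytes.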
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
       if (mask) {
         return (char *)w - s + __builtin_ctz(mask);
@@ -117,24 +145,33 @@ size_t strlen(const char *s) {
 }
 
 static int __strcmp(const char *s1, const char *s2) {
+  // Set limit to the largest possible valid v128_t pointer.
+  // Unsigned modular arithmetic gives the correct result
+  // unless memory size is zero, in which case all pointers are invalid.
   const v128_t *const limit =
       (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
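+  // __builtin_wasm_memory_size counts 64 KiB pages, so multiplying by
+  // PAGESIZE gives the memory size in bytes; pointer arithmetic then
+  // scales the -1 by sizeof(v128_t).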
 
+  // Using unaligned loads to handle the case where
+  // the strings have mismatching alignments.
   const v128_t *w1 = (void *)s1;
   const v128_t *w2 = (void *)s2;
   while (w1 <= limit && w2 <= limit) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
+    // All bytes are equal.
+    // If any byte is zero (it is zero in both strings), the strings are equal.
     if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
       return 0;
     }
     w1++;
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (true) {
     if (*u1 != *u2) return *u1 - *u2;
     if (*u1 == 0) break;
@@ -146,6 +183,8 @@ static int __strcmp(const char *s1, const char *s2) {
 
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
+  // Use strncmp when comparing against literal strings.
+  // If the literal is small, the vector search will be skipped.
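+  // With a string literal, strlen(s2) constant-folds,
+  // so this branch is resolved at compile time.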
   if (__builtin_constant_p(strlen(s2))) {
     return strncmp(s1, s2, strlen(s2) + 1);
   }
@@ -154,24 +193,33 @@ int strcmp(const char *s1, const char *s2) {
 
 __attribute__((weak))
 int strncmp(const char *s1, const char *s2, size_t n) {
+  // Set limit to the largest possible valid v128_t pointer.
+  // Unsigned modular arithmetic gives the correct result
+  // unless memory size is zero, in which case all pointers are invalid.
   const v128_t *const limit =
       (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
+  // Using unaligned loads to handle the case where
+  // the strings have mismatching alignments.
   const v128_t *w1 = (void *)s1;
   const v128_t *w2 = (void *)s2;
   for (; w1 <= limit && w2 <= limit && n >= sizeof(v128_t); n -= sizeof(v128_t)) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
+    // All bytes are equal.
+    // If any byte is zero (it is zero in both strings), the strings are equal.
     if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
       return 0;
     }
     w1++;
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (n--) {
     if (*u1 != *u2) return *u1 - *u2;
     if (*u1 == 0) break;
@@ -182,14 +230,21 @@ int strncmp(const char *s1, const char *s2, size_t n) {
 }
 
 static char *__strchrnul(const char *s, int c) {
+  // strchrnul must stop as soon as a match is found.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
   const v128_t *w = (void *)(s - align);
   const v128_t wc = wasm_i8x16_splat(c);
 
   while (true) {
     const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
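+    // A lane matches if the byte is either NUL or c.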
+    // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
       if (mask) {
         return (char *)w + __builtin_ctz(mask);
@@ -202,6 +257,7 @@ static char *__strchrnul(const char *s, int c) {
 
 __attribute__((weak, always_inline))
 char *strchrnul(const char *s, int c) {
+  // For finding the terminator, strlen is faster.
   if (__builtin_constant_p(c) && (char)c == 0) {
     return (char *)s + strlen(s);
   }
@@ -210,6 +266,7 @@ char *strchrnul(const char *s, int c) {
 
 __attribute__((weak, always_inline))
 char *strchr(const char *s, int c) {
+  // For finding the terminator, strlen is faster.
   if (__builtin_constant_p(c) && (char)c == 0) {
     return (char *)s + strlen(s);
   }
@@ -220,13 +277,16 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static
+  static  // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
 
   if (!c[0]) return 0;
   if (!c[1]) {
+    // Set limit to the largest possible valid v128_t pointer.
+    // Unsigned modular arithmetic gives the correct result
+    // unless memory size is zero, in which case all pointers are invalid.
     const v128_t *const limit =
         (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
@@ -244,33 +304,63 @@ size_t strspn(const char *s, const char *c) {
     return s - a;
   }
 
+#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
+
+  // Unoptimized version.
+  memset(byteset, 0, sizeof(byteset));
+  while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
+  while (byteset[*(unsigned char *)s]) s++;
+
+#else
+
+  // This is faster than memset.
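+  // (volatile stores keep the compiler from turning this loop back into memset)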
   volatile v128_t *w = (void *)byteset;
-#pragma unroll
+  #pragma unroll
   for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
+
+  // Keeping byteset[0] = 0 avoids the other loop having to test for it.
+  while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
+  #pragma unroll 4
+  while (byteset[*(unsigned char *)s]) s++;
+
+#endif
 
-  while (*c && (byteset[*(uint8_t *)c] = 1)) c++;
-#pragma unroll 4
-  while (byteset[*(uint8_t *)s]) s++;
   return s - a;
 }
 
 __attribute__((weak))
 size_t strcspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static
+  static  // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
 
   if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
 
+#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
+
+  // Unoptimized version.
+  memset(byteset, 0, sizeof(byteset));
+  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
+  while (!byteset[*(unsigned char *)s]) s++;
+
+#else
+
+  // This is faster than memset.
   volatile v128_t *w = (void *)byteset;
-#pragma unroll
+  #pragma unroll
   for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
+
+  // Setting byteset[0] = 1 avoids the other loop having to test for it.
+  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
+  #pragma unroll 4
+  while (!byteset[*(unsigned char *)s]) s++;
+
+#endif
 
-  while ((byteset[*(uint8_t *)c] = 1) && *c) c++;
-#pragma unroll 4
-  while (!byteset[*(uint8_t *)s]) s++;
   return s - a;
 }
 