
Commit 9ea7099

Size optimized versions.
1 parent 29aa365 commit 9ea7099

File tree

1 file changed: +108 -18 lines changed


sqlite3/libc/string.h

Lines changed: 108 additions & 18 deletions
@@ -40,27 +40,32 @@ void *memmove(void *dest, const void *src, size_t n) {
 
 // SIMD versions of some string.h functions.
 //
-// These assume aligned v128_t reads can't fail,
-// and so can't unaligned reads up to the last
+// These assume aligned v128_t loads can't fail,
+// and so can't unaligned loads up to the last
 // aligned address less than memory size.
 //
 // These also assume unaligned access is not painfully slow,
 // but that bitmask extraction is really slow on AArch64.
 
 __attribute__((weak))
 int memcmp(const void *v1, const void *v2, size_t n) {
+  // memcmp can read up to n bytes from each object.
+  // Using unaligned loads to handle the case where
+  // the objects have mismatching alignments.
   const v128_t *w1 = v1;
   const v128_t *w2 = v2;
   for (; n >= sizeof(v128_t); n -= sizeof(v128_t)) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
     w1++;
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (n--) {
     if (*u1 != *u2) return *u1 - *u2;
     u1++;
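
For illustration only (not part of the diff): the XOR test in the loop above relies on a ^ b being all-zero exactly when every byte of the two 16-byte blocks matches, so wasm_v128_any_true fires on the first block that differs anywhere. A rough scalar equivalent:

static int blocks_differ(const unsigned char a[16], const unsigned char b[16]) {
  unsigned acc = 0;
  for (int i = 0; i < 16; i++) acc |= a[i] ^ b[i];  // non-zero iff any byte differs
  return acc != 0;  // mirrors wasm_v128_any_true(wasm_v128_load(a) ^ wasm_v128_load(b))
}
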
@@ -71,24 +76,40 @@ int memcmp(const void *v1, const void *v2, size_t n) {
 
 __attribute__((weak))
 void *memchr(const void *v, int c, size_t n) {
+  // When n is zero, a function that locates a character finds no occurrence.
+  // Otherwise, decrement n to ensure __builtin_sub_overflow "overflows"
+  // when n would go equal-to-or-below zero.
   if (n-- == 0) {
     return NULL;
   }
 
+  // memchr must behave as if it reads characters sequentially
+  // and stops as soon as a match is found.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)v % sizeof(v128_t);
   const v128_t *w = (void *)(v - align);
   const v128_t wc = wasm_i8x16_splat(c);
 
   while (true) {
     const v128_t cmp = wasm_i8x16_eq(*w, wc);
+    // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
+      // If the mask is zero because of alignment,
+      // it's as if we didn't find anything.
       if (mask) {
+        // We found a match, unless it is beyond the end of the object.
+        // Recall that we decremented n, so less-than-or-equal-to is correct.
         size_t ctz = __builtin_ctz(mask);
         return ctz <= n + align ? (void *)w + ctz : NULL;
       }
     }
+    // Decrement n; if it "overflows" we're done.
     if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
       return NULL;
     }
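
To make the bound check concrete (illustrative numbers, not from the diff): suppose v == 0x1003, so align == 3 and the first aligned load covers 0x1000..0x100f, and n was 5 before the decrement, 4 after it. A match at 0x1007 gives ctz == 7, and 7 <= n + align == 7, so it is returned; a match at 0x1008 gives ctz == 8 > 7 and is rightly rejected as lying past the end of the object. When no in-bounds match is found, the first iteration consumes sizeof(v128_t) - align == 13 bytes of n, and __builtin_sub_overflow ends the search as soon as n would wrap below zero.
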
@@ -99,13 +120,20 @@ void *memchr(const void *v, int c, size_t n) {
 
 __attribute__((weak))
 size_t strlen(const char *s) {
+  // strlen must stop as soon as it finds the terminator.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
   const v128_t *w = (void *)(s - align);
 
   while (true) {
+    // Bitmask is slow on AArch64, all_true is much faster.
     if (!wasm_i8x16_all_true(*w)) {
       const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){});
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
       if (mask) {
         return (char *)w - s + __builtin_ctz(mask);
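
A quick worked example of the return expression (made-up address, not from the diff): with s == 0x1003, align == 3 and w starts at 0x1000; if the terminator is byte 7 of that block, __builtin_ctz(mask) == 7 and (char *)w - s + 7 == -3 + 7 == 4, the length of the string occupying 0x1003..0x1006.
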
@@ -117,24 +145,33 @@ size_t strlen(const char *s) {
 }
 
 static int __strcmp(const char *s1, const char *s2) {
+  // Set limit to the largest possible valid v128_t pointer.
+  // Unsigned modular arithmetic gives the correct result
+  // unless memory size is zero, in which case all pointers are invalid.
   const v128_t *const limit =
       (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
+  // Using unaligned loads to handle the case where
+  // the strings have mismatching alignments.
   const v128_t *w1 = (void *)s1;
   const v128_t *w2 = (void *)s2;
   while (w1 <= limit && w2 <= limit) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
+    // All bytes are equal.
+    // If any byte is zero (on both strings) the strings are equal.
     if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
       return 0;
     }
     w1++;
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (true) {
     if (*u1 != *u2) return *u1 - *u2;
     if (*u1 == 0) break;
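
For a concrete sense of the limit computation (illustrative numbers only): with a two-page linear memory, __builtin_wasm_memory_size(0) * PAGESIZE is 131072, so limit is (v128_t *)131072 - 1, i.e. address 131056, the last address at which a whole 16-byte load still ends inside memory. With a memory size of zero the arithmetic wraps, but then, as the comment says, no pointer is valid anyway.
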
@@ -146,6 +183,8 @@ static int __strcmp(const char *s1, const char *s2) {
 
 __attribute__((weak, always_inline))
 int strcmp(const char *s1, const char *s2) {
+  // Use strncmp when comparing against literal strings.
+  // If the literal is small, the vector search will be skipped.
   if (__builtin_constant_p(strlen(s2))) {
     return strncmp(s1, s2, strlen(s2));
   }
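
As a usage sketch (hypothetical call, not from the diff): for strcmp(name, "temp"), strlen("temp") folds to the constant 4, so __builtin_constant_p is true and the comparison is dispatched to strncmp with a small constant bound; since 4 < sizeof(v128_t), strncmp's vector loop never runs and only its byte loop is emitted.
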
@@ -154,24 +193,33 @@ int strcmp(const char *s1, const char *s2) {
 
 __attribute__((weak))
 int strncmp(const char *s1, const char *s2, size_t n) {
+  // Set limit to the largest possible valid v128_t pointer.
+  // Unsigned modular arithmetic gives the correct result
+  // unless memory size is zero, in which case all pointers are invalid.
   const v128_t *const limit =
       (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
+  // Using unaligned loads to handle the case where
+  // the strings have mismatching alignments.
   const v128_t *w1 = (void *)s1;
   const v128_t *w2 = (void *)s2;
   for (; w1 <= limit && w2 <= limit && n >= sizeof(v128_t); n -= sizeof(v128_t)) {
+    // Find any single bit difference.
     if (wasm_v128_any_true(wasm_v128_load(w1) ^ wasm_v128_load(w2))) {
       break;
     }
+    // All bytes are equal.
+    // If any byte is zero (on both strings) the strings are equal.
     if (!wasm_i8x16_all_true(wasm_v128_load(w1))) {
       return 0;
     }
     w1++;
     w2++;
   }
 
-  const uint8_t *u1 = (void *)w1;
-  const uint8_t *u2 = (void *)w2;
+  // Continue byte-by-byte.
+  const unsigned char *u1 = (void *)w1;
+  const unsigned char *u2 = (void *)w2;
   while (n--) {
     if (*u1 != *u2) return *u1 - *u2;
     if (*u1 == 0) break;
@@ -182,14 +230,21 @@ int strncmp(const char *s1, const char *s2, size_t n) {
 }
 
 static char *__strchrnul(const char *s, int c) {
+  // strchrnul must stop as soon as a match is found.
+  // Aligning ensures loads can't fail.
   uintptr_t align = (uintptr_t)s % sizeof(v128_t);
   const v128_t *w = (void *)(s - align);
   const v128_t wc = wasm_i8x16_splat(c);
 
   while (true) {
     const v128_t cmp = wasm_i8x16_eq(*w, (v128_t){}) | wasm_i8x16_eq(*w, wc);
+    // Bitmask is slow on AArch64, any_true is much faster.
     if (wasm_v128_any_true(cmp)) {
+      // Clear the bits corresponding to alignment
+      // so we can count trailing zeros.
       int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      // At least one bit will be set, unless we cleared them.
+      // Knowing this helps the compiler.
       __builtin_assume(mask || align);
       if (mask) {
         return (char *)w + __builtin_ctz(mask);
@@ -202,6 +257,7 @@ static char *__strchrnul(const char *s, int c) {
 
 __attribute__((weak, always_inline))
 char *strchrnul(const char *s, int c) {
+  // For finding the terminator, strlen is faster.
   if (__builtin_constant_p(c) && (char)c == 0) {
     return (char *)s + strlen(s);
   }
@@ -210,6 +266,7 @@ char *strchrnul(const char *s, int c) {
 
 __attribute__((weak, always_inline))
 char *strchr(const char *s, int c) {
+  // For finding the terminator, strlen is faster.
   if (__builtin_constant_p(c) && (char)c == 0) {
     return (char *)s + strlen(s);
   }
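
A small note on the constant-folded path above (illustrative): a call such as strchr(path, '\0') asks for the address of the terminator, which is exactly path + strlen(path), so neither strchr nor strchrnul needs the splat-and-compare loop in __strchrnul for that case.
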
@@ -220,13 +277,16 @@ char *strchr(const char *s, int c) {
 __attribute__((weak))
 size_t strspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static
+  static // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
 
   if (!c[0]) return 0;
   if (!c[1]) {
+    // Set limit to the largest possible valid v128_t pointer.
+    // Unsigned modular arithmetic gives the correct result
+    // unless memory size is zero, in which case all pointers are invalid.
     const v128_t *const limit =
         (v128_t *)(__builtin_wasm_memory_size(0) * PAGESIZE) - 1;
 
@@ -244,33 +304,63 @@ size_t strspn(const char *s, const char *c) {
     return s - a;
   }
 
+#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
+
+  // Unoptimized version.
+  memset(byteset, 0, sizeof(byteset));
+  while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
+  while (byteset[*(unsigned char *)s]) s++;
+
+#else
+
+  // This is faster than memset.
   volatile v128_t *w = (void *)byteset;
-#pragma unroll
+#pragma unroll
   for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
+
+  // Keeping byteset[0] = 0 avoids the other loop having to test for it.
+  while (*c && (byteset[*(unsigned char *)c] = 1)) c++;
+#pragma unroll 4
+  while (byteset[*(unsigned char *)s]) s++;
+
+#endif
 
-  while (*c && (byteset[*(uint8_t *)c] = 1)) c++;
-#pragma unroll 4
-  while (byteset[*(uint8_t *)s]) s++;
   return s - a;
 }
 
 __attribute__((weak))
 size_t strcspn(const char *s, const char *c) {
 #ifndef _REENTRANT
-  static
+  static // Avoid the stack for builds without threads.
 #endif
   char byteset[UCHAR_MAX + 1];
   const char *const a = s;
 
   if (!c[0] || !c[1]) return __strchrnul(s, *c) - s;
 
+#if defined(__OPTIMIZE_SIZE__) || !defined(__OPTIMIZE__)
+
+  // Unoptimized version.
+  memset(byteset, 0, sizeof(byteset));
+  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
+  while (!byteset[*(unsigned char *)s]) s++;
+
+#else
+
+  // This is faster than memset.
   volatile v128_t *w = (void *)byteset;
-#pragma unroll
+#pragma unroll
   for (size_t i = sizeof(byteset) / sizeof(v128_t); i--;) w[i] = (v128_t){};
+  static_assert(sizeof(byteset) % sizeof(v128_t) == 0);
+
+  // Setting byteset[0] = 1 avoids the other loop having to test for it.
+  while ((byteset[*(unsigned char *)c] = 1) && *c) c++;
+#pragma unroll 4
+  while (!byteset[*(unsigned char *)s]) s++;
+
+#endif
 
-  while ((byteset[*(uint8_t *)c] = 1) && *c) c++;
-#pragma unroll 4
-  while (!byteset[*(uint8_t *)s]) s++;
   return s - a;
 }
 
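
A minimal usage sketch of the two table-driven scanners (hypothetical test program, not part of the commit): byteset is a 256-entry table of accepted (strspn) or rejected (strcspn) bytes, built from c in one pass and then consulted once per byte of s.

#include <assert.h>
#include <string.h>

int main(void) {
  // strspn counts the leading run of bytes drawn from the set " \t".
  assert(strspn("  \tindent", " \t") == 3);
  // strcspn counts the leading run of bytes *not* in the set "=".
  assert(strcspn("key=value", "=") == 3);
  return 0;
}
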