Skip to content

Commit 8a9fe5a

Browse files
committed
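perf: sidestep out-of-bounds (OOB) handling by ensuring enough space is available, so bounded memory.copy calls can be replaced with fixed-width stores. ~21% performance increase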
1 parent d3150e4 commit 8a9fe5a

File tree

2 files changed

+44
-26
lines changed

2 files changed

+44
-26
lines changed

assembly/serialize/simd/string.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ export function serializeString_SIMD(src: string): void {
4040
store<u16>(bs.offset, 34); // "
4141
bs.offset += 2;
4242

43-
while (srcStart <= srcEnd16) {
43+
while (srcStart < srcEnd16) {
4444
const block = v128.load(srcStart);
4545
v128.store(bs.offset, block);
4646

@@ -84,15 +84,15 @@ export function serializeString_SIMD(src: string): void {
8484
const dstIdx = bs.offset + laneIdx;
8585
store<u64>(dstIdx, U00_MARKER);
8686
store<u32>(dstIdx, escaped, 8);
87-
memory.copy(dstIdx + 12, srcIdx + 2, 14 - laneIdx);
88-
// store<v128>(dstIdx, load<v128>(srcIdx, 2), 12); // unsafe. can overflow here
87+
// memory.copy(dstIdx + 12, srcIdx + 2, 14 - laneIdx);
88+
store<v128>(dstIdx, load<v128>(srcIdx, 2), 12); // unsafe. can overflow here
8989
bs.offset += 10;
9090
} else {
9191
bs.growSize(2);
9292
const dstIdx = bs.offset + laneIdx;
9393
store<u32>(dstIdx, escaped);
94-
// store<v128>(dstIdx, load<v128>(srcIdx, 2), 4);
95-
memory.copy(dstIdx + 4, srcIdx + 2, 14 - laneIdx);
94+
store<v128>(dstIdx, load<v128>(srcIdx, 2), 4);
95+
// memory.copy(dstIdx + 4, srcIdx + 2, 14 - laneIdx);
9696
bs.offset += 2;
9797
}
9898
continue;
@@ -121,8 +121,8 @@ export function serializeString_SIMD(src: string): void {
121121
const dstIdx = bs.offset + laneIdx - 1;
122122
store<u32>(dstIdx, U_MARKER); // \u
123123
store<u64>(dstIdx, load<u64>(changetype<usize>(code.toString(16))), 4);
124-
memory.copy(dstIdx + 12, srcIdx + 1, 15 - laneIdx);
125-
// store<v128>(dstIdx, load<v128>(srcIdx, 1), 12);
124+
// memory.copy(dstIdx + 12, srcIdx + 1, 15 - laneIdx);
125+
store<v128>(dstIdx, load<v128>(srcIdx, 1), 12);
126126
bs.offset += 10;
127127
} while (mask !== 0);
128128
}

assembly/serialize/swar/string.ts

Lines changed: 37 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,22 @@ import { mask_to_string } from "../../util/masks";
88
@lazy const U00_MARKER = 13511005048209500;
99
// @ts-ignore: decorator allowed
1010
@lazy const U_MARKER = 7667804;
11+
// @ts-ignore: decorator allowed
12+
@lazy const LOW_MASK = 0x00FF_00FF_00FF_00FF;
13+
// @ts-ignore: decorator allowed
14+
@lazy @inline const FILTER_0020 = 0x0020_0020_0020_0020;
15+
// @ts-ignore: decorator allowed
16+
@lazy @inline const FILTER_0022 = 0x0022_0022_0022_0022;
17+
// @ts-ignore: decorator allowed
18+
@lazy @inline const FILTER_005C = 0x005C_005C_005C_005C;
19+
// @ts-ignore: decorator allowed
20+
@lazy @inline const FILTER_0001 = 0x0001_0001_0001_0001;
21+
// @ts-ignore: decorator allowed
22+
@lazy @inline const FILTER_0080 = 0x0080_0080_0080_0080;
23+
// @ts-ignore: decorator allowed
24+
@lazy @inline const FILTER_0100 = 0x0100_0100_0100_0100;
25+
// @ts-ignore: decorator allowed
26+
@lazy @inline const FILTER_8000 = 0x8000_8000_8000_8000;
1127

1228
export function serializeString_SWAR(src: string): void {
1329
let srcStart = changetype<usize>(src);
@@ -31,27 +47,27 @@ export function serializeString_SWAR(src: string): void {
3147
store<u16>(bs.offset, 34); // "
3248
bs.offset += 2;
3349

34-
while (srcStart <= srcEnd8) {
50+
while (srcStart < srcEnd8) {
3551
let block = load<u64>(srcStart);
3652
store<u64>(bs.offset, block);
3753

38-
const lo = block & 0x00FF_00FF_00FF_00FF;
54+
const lo = block & LOW_MASK;
3955
const ascii_mask = (
40-
((lo - 0x0020_0020_0020_0020) |
41-
((lo ^ 0x0022_0022_0022_0022) - 0x0001_0001_0001_0001) |
42-
((lo ^ 0x005C_005C_005C_005C) - 0x0001_0001_0001_0001))
43-
& (0x0080_0080_0080_0080 & ~lo)
56+
((lo - FILTER_0020) |
57+
((lo ^ FILTER_0022) - FILTER_0001) |
58+
((lo ^ FILTER_005C) - FILTER_0001))
59+
& (FILTER_0080 & ~lo)
4460
);
45-
const hi_mask = ((block - 0x0100010001000100) & ~block & 0x8000800080008000) ^ 0x8000800080008000;
61+
const hi_mask = ((block - FILTER_0100) & ~block & FILTER_8000) ^ FILTER_8000;
4662
let mask = (ascii_mask & (~hi_mask >> 8)) | hi_mask;
4763

48-
if (mask === 0) {
49-
srcStart += 8;
50-
bs.offset += 8;
51-
continue;
52-
}
64+
// if (mask === 0) {
65+
// srcStart += 8;
66+
// bs.offset += 8;
67+
// continue;
68+
// }
5369

54-
do {
70+
while (mask !== 0) {
5571
const laneIdx = usize(ctz(mask) >> 3);
5672
mask &= mask - 1;
5773
// Even (0 2 4 6) -> Confirmed ASCII Escape
@@ -66,15 +82,15 @@ export function serializeString_SWAR(src: string): void {
6682
const dstIdx = bs.offset + laneIdx;
6783
store<u64>(dstIdx, U00_MARKER);
6884
store<u32>(dstIdx, escaped, 8);
69-
memory.copy(dstIdx + 12, srcIdx + 2, 6 - laneIdx);
70-
// store<u64>(dstIdx, load<u64>(srcIdx, 2), 12); // unsafe. can overflow here
85+
// memory.copy(dstIdx + 12, srcIdx + 2, 6 - laneIdx);
86+
store<u64>(dstIdx, load<u64>(srcIdx, 2), 12); // unsafe. can overflow here
7187
bs.offset += 10;
7288
} else {
7389
bs.growSize(2);
7490
const dstIdx = bs.offset + laneIdx;
7591
store<u32>(dstIdx, escaped);
76-
// store<u64>(dstIdx, load<u64>(srcIdx, 2), 4);
77-
memory.copy(dstIdx + 4, srcIdx + 2, 6 - laneIdx);
92+
store<u64>(dstIdx, load<u64>(srcIdx, 2), 4);
93+
// memory.copy(dstIdx + 4, srcIdx + 2, 6 - laneIdx);
7894
bs.offset += 2;
7995
}
8096
continue;
@@ -87,13 +103,15 @@ export function serializeString_SWAR(src: string): void {
87103
// console.log("c->" + code.toString(16));
88104
if (code < 0xD800 || code > 0xDFFF) continue;
89105

90-
if (code <= 0xDBFF && srcIdx + 1 <= srcEnd - 2) {
106+
if (code <= 0xDBFF && srcIdx + 2 < srcEnd) {
107+
// if (srcIdx + 3 <= srcEnd) {
91108
const next = load<u16>(srcIdx, 1);
92109
if (next >= 0xDC00 && next <= 0xDFFF) {
93110
// paired surrogate
94111
mask &= mask - 1;
95112
continue;
96113
}
114+
// }
97115
}
98116

99117
bs.growSize(10);
@@ -103,7 +121,7 @@ export function serializeString_SWAR(src: string): void {
103121
store<u64>(dstIdx, load<u64>(changetype<usize>(code.toString(16))), 4);
104122
store<u64>(dstIdx, load<u64>(srcIdx, 1), 12);
105123
bs.offset += 10;
106-
} while (mask !== 0);
124+
}
107125

108126
srcStart += 8;
109127
bs.offset += 8;

0 commit comments

Comments
 (0)