Skip to content

Commit 5b7c3a9

Browse files
committed
Optimize a scan of non-state-changing bytes with SSE2 instructions
This commit optimizes the scan of non-state-changing bytes using SSE2 instructions. A [_mm_cmpestri](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri) operation appears to be quite slow compared to an alternative approach that applies [_mm_shuffle_epi8](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) to the low/high nibbles of the input and bitwise-ANDs the two results to get 16 bytes of LUT output in one go (it also involves a bunch of other SSE2 operations which all have nice latency/throughput properties). The resulting 16 LUT bytes can be analyzed (also vectorized) to get the index of the first byte (if any) that changes the state. That is done by finding the first byte that the LUT maps to zero. The tricky part here is the following: ``` Find A, B arrays (uint8_t[16]) such that * `A[i] & B[j] == 0` if `LUT[i | (j << 4)] == 0` * `A[i] & B[j] != 0` if `LUT[i | (j << 4)] != 0` // Note we don't need any specific non-zero value for all i,j = 0..15. ``` To find `A` and `B` satisfying the above conditions, the [Z3](https://github.com/Z3Prover/z3) library is used. The npm package that wraps z3 for use in TypeScript is not particularly friendly to the author of this change, so another package (synckit) was required to handle the async API of the z3 wrapper. Using llhttp as a benchmark framework, this change yields the following improvements: ``` Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz http: "seanmonstar/httparse" (C) BEFORE: 8192.00 mb | 1456.72 mb/s | 2172811.81 ops/sec | 5.62 s AFTER: 8192.00 mb | 1752.90 mb/s | 2614577.82 ops/sec | 4.67 s ~20% improvement http: "nodejs/http-parser" (C) BEFORE: 8192.00 mb | 1050.60 mb/s | 2118535.14 ops/sec | 7.80 s AFTER: 8192.00 mb | 1167.42 mb/s | 2354101.76 ops/sec | 7.02 s ~11% improvement ``` For more header-field-heavy messages the numbers might be even more convincing.
1 parent 4d7e352 commit 5b7c3a9

File tree

4 files changed

+155
-9
lines changed

4 files changed

+155
-9
lines changed

package.json

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "llparse",
3-
"version": "7.1.1",
3+
"version": "7.2.0",
44
"description": "Compile incremental parsers to C code",
55
"main": "lib/api.js",
66
"types": "lib/api.d.ts",
@@ -34,7 +34,7 @@
3434
"devDependencies": {
3535
"@types/debug": "^4.1.5",
3636
"@types/mocha": "^8.0.3",
37-
"@types/node": "^14.11.8",
37+
"@types/node": "^14.14.14",
3838
"esm": "^3.2.25",
3939
"llparse-test-fixture": "^5.0.1",
4040
"mocha": "^9.2.2",
@@ -44,6 +44,8 @@
4444
},
4545
"dependencies": {
4646
"debug": "^4.2.0",
47-
"llparse-frontend": "^3.0.0"
47+
"llparse-frontend": "^3.0.0",
48+
"z3-solver": "^4.12.2",
49+
"synckit": "^0.8.5"
4850
}
4951
}

src/implementation/c/compilation.ts

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,17 @@ const BLOB_GROUP_SIZE = 11;
1919

2020
type WrappedNode = frontend.IWrap<frontend.node.Node>;
2121

22+
/**
 * Instruction-set families the C generator can target for vectorized blobs.
 *
 * The string value of each member is spliced between double underscores to
 * form the preprocessor guard emitted around a blob (`__SSE2__`,
 * `__SSE4_2__`), so the values must stay valid macro name fragments.
 */
export enum SseFamily {
  SSE2 = 'SSE2',
  SSE4_2 = 'SSE4_2',
}
27+
2228
/** Book-keeping record for one static byte array emitted into the C output. */
interface IBlob {
  /**
   * Requested alignment. When truthy, the emitted array is wrapped in an
   * `#ifdef __<sseFamily>__` / `#endif` pair.
   */
  readonly alignment: number | undefined;

  /** The buffer this record was registered with. */
  readonly buffer: Buffer;

  /** Identifier of the emitted `static const unsigned char` array. */
  readonly name: string;

  /**
   * Family whose macro guards the array. Deliberately not `readonly`:
   * `Compilation.blob()` overwrites it when the same buffer is requested
   * again for a different family.
   */
  sseFamily: SseFamily;
}
2734

2835
// TODO(indutny): deduplicate
@@ -78,7 +85,7 @@ export class Compilation {
7885
}
7986

8087
if (blob.alignment) {
81-
out.push('#ifdef __SSE4_2__');
88+
out.push(`#ifdef __${blob.sseFamily.toString()}__`);
8289
}
8390
out.push(`static const unsigned char${align} ${blob.name}[] = {`);
8491

@@ -107,7 +114,7 @@ export class Compilation {
107114

108115
out.push(`};`);
109116
if (blob.alignment) {
110-
out.push('#endif /* __SSE4_2__ */');
117+
out.push(`#endif /* __${blob.sseFamily.toString()}__ */`);
111118
}
112119
}
113120
out.push('');
@@ -320,16 +327,25 @@ export class Compilation {
320327
return JSON.stringify(value);
321328
}
322329

323-
public blob(value: Buffer, alignment?: number): string {
330+
/**
 * Registers `value` as a static byte array of the generated C source and
 * returns the identifier of that array.
 *
 * Buffers are looked up by object identity in `this.blobs`; registering the
 * same buffer object twice returns the name allocated the first time.
 *
 * @param value      Bytes of the array.
 * @param alignment  Optional alignment stored with the blob.
 * @param sseFamily  Instruction-set family whose macro guards the blob.
 *                   Defaults to `SseFamily.SSE4_2`.
 * @returns The name of the (new or already registered) array.
 */
public blob(value: Buffer, alignment?: number, sseFamily?: SseFamily): string {
  // Explicit ordering of the families, lowest requirement first. Comparing
  // the enum's string values directly would only be correct as long as their
  // lexicographic order happened to match the hardware order.
  const requirementRank: Record<SseFamily, number> = {
    [SseFamily.SSE2]: 0,
    [SseFamily.SSE4_2]: 1,
  };
  const family = sseFamily || SseFamily.SSE4_2;

  const existing = this.blobs.get(value);
  if (existing !== undefined) {
    // The blob must be visible to its least demanding user, so keep the
    // family with the lowest requirement.
    // NOTE(review): a differing `alignment` on a repeated registration is
    // ignored, as before - confirm that callers never rely on it.
    if (requirementRank[existing.sseFamily] > requirementRank[family]) {
      existing.sseFamily = family;
    }
    return existing.name;
  }

  const name = BLOB_PREFIX + this.blobs.size;
  this.blobs.set(value, {
    alignment,
    buffer: value,
    name,
    sseFamily: family,
  });
  return name;
}

src/implementation/c/node/table-lookup.ts

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import * as assert from 'assert';
22
import * as frontend from 'llparse-frontend';
3+
import { createSyncFn } from 'synckit'
34

4-
import { Compilation } from '../compilation';
5+
import { Compilation, SseFamily } from '../compilation';
56
import { Node } from './base';
67

78
const MAX_CHAR = 0xff;
@@ -65,7 +66,86 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
6566
out.push('}');
6667
}
6768

68-
private buildSSE(out: string[]): boolean {
69+
/**
 * Tries to emit a vectorized fast path that skips 16 input bytes at a time
 * while every byte belongs to the single self-looping edge of this node.
 *
 * Two 16-entry tables (one indexed by the low nibble, one by the high nibble)
 * are obtained from the `./z3-lookup-solver` worker; a byte is accepted when
 * the bitwise AND of its two table entries is non-zero. The generated code
 * looks both entries up with `_mm_shuffle_epi8`, finds the first byte whose
 * AND is zero and continues at this node's `otherwise` edge from there.
 *
 * `_mm_shuffle_epi8` is an SSSE3 instruction, so the generated code is
 * guarded by `__SSSE3__`. The tables are still registered under
 * `SseFamily.SSE2`; with GCC/Clang `__SSE2__` is defined whenever
 * `__SSSE3__` is, so they are always visible to the guarded code.
 * NOTE(review): the `#include` that declares the intrinsics is emitted
 * outside this block - confirm it is also active for SSSE3-only builds.
 *
 * @param out  Receives the generated C lines.
 * @returns `true` when the fast path was emitted, `false` when the node
 *          shape is unsupported or the worker returned no tables.
 */
private buildSSE2(out: string[]): boolean {
  const ctx = this.compilation;

  // Transformation is not supported atm
  if (this.ref.transform && this.ref.transform.ref.name !== 'id') {
    return false;
  }

  // Only a node with exactly one edge that loops back to itself qualifies.
  if (this.ref.edges.length !== 1) {
    return false;
  }

  const edge = this.ref.edges[0];
  if (edge.node.ref !== this.ref) {
    return false;
  }

  // 1 marks a byte that keeps the parser in this node, 0 one that leaves it.
  const initialLut = new Array<number>(256).fill(0);
  for (const key of edge.keys) {
    initialLut[key] = 1;
  }

  // The worker path must be absolute.
  const resolveNibbleTables = createSyncFn(
    require.resolve('./z3-lookup-solver'),
    {
      // Used by synckit when the worker is a TypeScript source file.
      tsRunner: 'ts-node',
    },
  );

  // Blocks until the worker has answered.
  const result = resolveNibbleTables(initialLut);
  if (!result) {
    return false;
  }

  const blob1 = ctx.blob(Buffer.from(result[0]), SSE_ALIGNMENT, SseFamily.SSE2);
  const blob2 = ctx.blob(Buffer.from(result[1]), SSE_ALIGNMENT, SseFamily.SSE2);

  out.push('#ifdef __SSSE3__');
  out.push(`if (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`);
  out.push(' int index;');
  out.push(' __m128i lut_tlo;');
  out.push(' __m128i lut_thi;');
  out.push(` lut_tlo = _mm_load_si128((__m128i const*) ${blob1});`);
  out.push(` lut_thi = _mm_load_si128((__m128i const*) ${blob2});`);
  out.push('');
  out.push(` for( ;${ctx.endPosArg()} - ${ctx.posArg()} >= 16; ${ctx.posArg()} += 16) {`);

  out.push(' __m128i lut_res_lo;');
  out.push(' __m128i lut_res_hi;');
  out.push(' __m128i input;');
  out.push(` input = _mm_loadu_si128((__m128i const*) ${ctx.posArg()});`);
  out.push('');
  out.push(' lut_res_lo = _mm_shuffle_epi8(lut_tlo, _mm_and_si128(input, _mm_set1_epi8(0x0F)));');
  out.push(' lut_res_hi = _mm_shuffle_epi8(lut_thi, _mm_srli_epi16(_mm_and_si128(input, _mm_set1_epi8(0xF0)), 4));');
  out.push('');
  // A zero AND marks a byte that leaves the node; the mask has one bit per
  // such byte and its lowest set bit is the first one.
  out.push(' input = _mm_cmpeq_epi8(_mm_and_si128(lut_res_lo, lut_res_hi), _mm_setzero_si128());');
  out.push(' index = _mm_movemask_epi8(input);');
  out.push(' if( 0 != index )');
  out.push(' {');
  // Use the compilation's position argument instead of a hard-coded `p`.
  out.push(` ${ctx.posArg()} += __builtin_ctz(index);`);
  const otherwiseTail: string[] = [];
  this.tailTo(otherwiseTail, this.ref.otherwise!);
  ctx.indent(out, otherwiseTail, ' ');
  out.push(' }');

  out.push(' }');

  // Fewer than 16 bytes are left: re-enter this node without advancing.
  const loopTail: string[] = [];
  assert.strictEqual(edge.noAdvance, false);
  this.tailTo(loopTail, {
    noAdvance: true,
    node: edge.node,
  });
  ctx.indent(out, loopTail, ' ');
  out.push('}');
  out.push('#endif /* __SSSE3__ */');
  return true;
}
147+
148+
private buildSSE42(out: string[]): boolean {
69149
const ctx = this.compilation;
70150

71151
// Transformation is not supported atm
@@ -114,6 +194,7 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
114194
}
115195

116196
out.push('#ifdef __SSE4_2__');
197+
out.push('// ${}');
117198
out.push(`if (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`);
118199
out.push(' __m128i ranges;');
119200
out.push(' __m128i input;');
@@ -166,6 +247,13 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
166247
return true;
167248
}
168249

250+
/**
 * Emits a vectorized fast path for this node, preferring the nibble-table
 * variant and falling back to the SSE4.2 variant when the first declines.
 *
 * @param out  Receives the generated C lines.
 * @returns `true` when either variant emitted code.
 */
private buildSSE(out: string[]): boolean {
  // `||` short-circuits, so the SSE4.2 builder only runs when the first
  // builder returned `false`.
  return this.buildSSE2(out) || this.buildSSE42(out);
}
256+
169257
private buildTable(): ITable {
170258
const table: number[] = new Array(MAX_CHAR + 1).fill(0);
171259

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
const z3 = require('z3-solver');
import { runAsWorker } from 'synckit'

/**
 * synckit worker that splits a 256-entry byte predicate into two 16-entry
 * nibble tables.
 *
 * Given `byteLookupTable`, it asks Z3 for byte arrays `TLO` and `THI` with
 *
 *   (TLO[b & 0xf] & THI[b >> 4]) != 0   when byteLookupTable[b] > 0
 *   (TLO[b & 0xf] & THI[b >> 4]) == 0   otherwise
 *
 * for every byte value `b` in 0..255.
 *
 * Resolves to `[TLO, THI]` (two arrays of 16 numbers), or to `null` when the
 * input is not a 256-entry array or the solver does not report `sat`.
 */
runAsWorker(async (
  byteLookupTable: readonly number[],
): Promise<[number[], number[]] | null> => {
  // Reject malformed input before paying for the Z3 start-up.
  if (!Array.isArray(byteLookupTable) || byteLookupTable.length !== 256) {
    return null;
  }

  const { Context } = await z3.init();
  // Alias Z3's `Array` so that it does not shadow the global `Array`.
  const { BitVec, Solver, Int, Array: Z3Array, Select } = new Context('main');

  const tlo = Z3Array.const('TLO', Int.sort(), BitVec.sort(8));
  const thi = Z3Array.const('THI', Int.sort(), BitVec.sort(8));
  const zero = BitVec.val(0, 8);

  const solver = new Solver();

  for (let byte = 0; byte < 256; byte++) {
    const combined = Select(tlo, byte & 0xf).and(Select(thi, byte >> 4));
    // Only zero vs. non-zero matters, so constrain the AND directly.
    solver.add(
      byteLookupTable[byte] > 0 ? combined.neq(zero) : combined.eq(zero));
  }

  // `check()` can also answer 'unknown'; a model exists only for 'sat'.
  const verdict = await solver.check();
  if (verdict !== 'sat') {
    return null;
  }

  const model = solver.model();

  const lowNibbleTable: number[] = [];
  const highNibbleTable: number[] = [];

  for (let nibble = 0; nibble < 16; nibble++) {
    lowNibbleTable.push(Number(model.eval(Select(tlo, nibble)).value()));
    highNibbleTable.push(Number(model.eval(Select(thi, nibble)).value()));
  }

  return [lowNibbleTable, highNibbleTable];
})

0 commit comments

Comments
 (0)