From 5b7c3a93e3a97c39cb2ff6af50c3f81434606329 Mon Sep 17 00:00:00 2001 From: ngrodzitski Date: Mon, 9 Oct 2023 22:32:43 +0200 Subject: [PATCH 1/2] Optimize a scan of non-state-changing bytes with SSE2 instructions This commit optimizes the scan of non-state-changing bytes using SSE2 instructions. The [_mm_cmpestri](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri) operation appears to be quite slow compared to an alternative approach that applies [_mm_shuffle_epi8](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8) to the low/high nibbles of the input and combines the results with a bitwise AND to get 16 bytes of LUT in one go (it also involves a bunch of other SSE2 operations, all of which have nice latency/throughput properties). The resulting 16 bytes of LUT can then be analyzed (also vectorized) to get the index of the first byte (if any) that changes the state. That is done by finding the first byte that maps to zero in the LUT. The tricky part here is the following: ``` Find A, B arrays (uint8_t[16]) such that * `A[i] & B[j] == 0` if `LUT[i | (j <<4)] == 0` * `A[i] & B[j] != 0` if `LUT[i | (j <<4)] != 0` // Note we don't need any specific non-zero value for all i,j = 0..15. ``` To find `A` and `B` satisfying the above conditions, the [Z3](https://github.com/Z3Prover/z3) library is used. The npm package that wraps z3 for use in TypeScript exposes only an async API, which was not particularly friendly to the author of this change, so another package (synckit) was required to call the z3 wrapper synchronously. 
Using llhttp as a benchmark framework, this change yields the following improvements: ``` Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz http: "seanmonstar/httparse" (C) BEFORE: 8192.00 mb | 1456.72 mb/s | 2172811.81 ops/sec | 5.62 s AFTER: 8192.00 mb | 1752.90 mb/s | 2614577.82 ops/sec | 4.67 s ~20% improvement http: "nodejs/http-parser" (C) BEFORE: 8192.00 mb | 1050.60 mb/s | 2118535.14 ops/sec | 7.80 s AFTER: 8192.00 mb | 1167.42 mb/s | 2354101.76 ops/sec | 7.02 s ~11% improvement ``` For more header-field-heavy messages, the numbers might be even more convincing. --- package.json | 8 +- src/implementation/c/compilation.ts | 24 ++++- src/implementation/c/node/table-lookup.ts | 92 ++++++++++++++++++- src/implementation/c/node/z3-lookup-solver.ts | 40 ++++++++ 4 files changed, 155 insertions(+), 9 deletions(-) create mode 100644 src/implementation/c/node/z3-lookup-solver.ts diff --git a/package.json b/package.json index 344153e..4d0eef5 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "llparse", - "version": "7.1.1", + "version": "7.2.0", "description": "Compile incremental parsers to C code", "main": "lib/api.js", "types": "lib/api.d.ts", @@ -34,7 +34,7 @@ "devDependencies": { "@types/debug": "^4.1.5", "@types/mocha": "^8.0.3", - "@types/node": "^14.11.8", + "@types/node": "^14.14.14", "esm": "^3.2.25", "llparse-test-fixture": "^5.0.1", "mocha": "^9.2.2", @@ -44,6 +44,8 @@ }, "dependencies": { "debug": "^4.2.0", - "llparse-frontend": "^3.0.0" + "llparse-frontend": "^3.0.0", + "z3-solver": "^4.12.2", + "synckit": "^0.8.5" } } diff --git a/src/implementation/c/compilation.ts b/src/implementation/c/compilation.ts index 4df05a6..2a8c4b4 100644 --- a/src/implementation/c/compilation.ts +++ b/src/implementation/c/compilation.ts @@ -19,10 +19,17 @@ const BLOB_GROUP_SIZE = 11; type WrappedNode = frontend.IWrap; +// The SSE versions in use with the generator. 
+export enum SseFamily { + SSE2 = 'SSE2', + SSE4_2 = 'SSE4_2', +} + interface IBlob { readonly alignment: number | undefined; readonly buffer: Buffer; readonly name: string; + sseFamily: SseFamily; } // TODO(indutny): deduplicate @@ -78,7 +85,7 @@ export class Compilation { } if (blob.alignment) { - out.push('#ifdef __SSE4_2__'); + out.push(`#ifdef __${blob.sseFamily.toString()}__`); } out.push(`static const unsigned char${align} ${blob.name}[] = {`); @@ -107,7 +114,7 @@ export class Compilation { out.push(`};`); if (blob.alignment) { - out.push('#endif /* __SSE4_2__ */'); + out.push(`#endif /* __${blob.sseFamily.toString()}__ */`); } } out.push(''); @@ -320,9 +327,17 @@ export class Compilation { return JSON.stringify(value); } - public blob(value: Buffer, alignment?: number): string { + public blob(value: Buffer, alignment?: number, sseFamily?: SseFamily): string { + if(!sseFamily) { + sseFamily = SseFamily.SSE4_2 + } if (this.blobs.has(value)) { - return this.blobs.get(value)!.name; + let b = this.blobs.get(value)!; + if( b.sseFamily > sseFamily ) { + b.sseFamily = sseFamily; + } + + return b.name; } const res = BLOB_PREFIX + this.blobs.size; @@ -330,6 +345,7 @@ export class Compilation { alignment, buffer: value, name: res, + sseFamily: sseFamily, }); return res; } diff --git a/src/implementation/c/node/table-lookup.ts b/src/implementation/c/node/table-lookup.ts index 6a400a3..95fb887 100644 --- a/src/implementation/c/node/table-lookup.ts +++ b/src/implementation/c/node/table-lookup.ts @@ -1,7 +1,8 @@ import * as assert from 'assert'; import * as frontend from 'llparse-frontend'; +import { createSyncFn } from 'synckit' -import { Compilation } from '../compilation'; +import { Compilation, SseFamily } from '../compilation'; import { Node } from './base'; const MAX_CHAR = 0xff; @@ -65,7 +66,86 @@ export class TableLookup extends Node { out.push('}'); } - private buildSSE(out: string[]): boolean { + private buildSSE2(out: string[]): boolean { + // return false; + 
const ctx = this.compilation; + + // Transformation is not supported atm + if (this.ref.transform && this.ref.transform.ref.name !== 'id') { + return false; + } + + if (this.ref.edges.length !== 1) { + return false; + } + + const edge = this.ref.edges[0]; + if (edge.node.ref !== this.ref) { + return false; + } + + let initial_lut = new Array(256).fill(0); + edge.keys.forEach(i => { initial_lut[i] = 1;}); + + // the worker path must be absolute + const lutLowNibbleHighNibbleResolver = createSyncFn(require.resolve('./z3-lookup-solver'), { + tsRunner: 'ts-node', // optional, can be `'ts-node' | 'esbuild-register' | 'esbuild-runner' | 'tsx'` + }) + + // do whatever you want, you will get the result synchronously! + const result = lutLowNibbleHighNibbleResolver(initial_lut) + + if (!result) { + return false; + } + + const blob1 = ctx.blob(Buffer.from(result[0]), SSE_ALIGNMENT, SseFamily.SSE2); + const blob2 = ctx.blob(Buffer.from(result[1]), SSE_ALIGNMENT, SseFamily.SSE2); + + out.push('#ifdef __SSE2__'); + out.push(`if (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`); + out.push(' int index;'); + out.push(' __m128i lut_tlo;'); + out.push(' __m128i lut_thi;'); + out.push(` lut_tlo = _mm_load_si128((__m128i const*) ${blob1});`); + out.push(` lut_thi = _mm_load_si128((__m128i const*) ${blob2});`); + out.push(''); + out.push(` for( ;${ctx.endPosArg()} - ${ctx.posArg()} >= 16; ${ctx.posArg()} += 16) {`); + + out.push(' __m128i lut_res_lo;'); + out.push(' __m128i lut_res_hi;'); + out.push(' __m128i input;'); + out.push(` input = _mm_loadu_si128((__m128i const*) ${ctx.posArg()});`); + out.push(''); + out.push(' lut_res_lo = _mm_shuffle_epi8(lut_tlo, _mm_and_si128(input, _mm_set1_epi8(0x0F)));'); + out.push(' lut_res_hi = _mm_shuffle_epi8(lut_thi, _mm_srli_epi16(_mm_and_si128(input, _mm_set1_epi8(0xF0)), 4));'); + out.push(''); + out.push(' input = _mm_cmpeq_epi8(_mm_and_si128(lut_res_lo, lut_res_hi), _mm_setzero_si128());'); + out.push(' index = _mm_movemask_epi8(input);'); + 
out.push(' if( 0 != index )'); + out.push(' {'); + out.push(' p += __builtin_ctz(index);'); + { + const tmp: string[] = []; + this.tailTo(tmp, this.ref.otherwise!); + ctx.indent(out, tmp, ' '); + } + out.push(' }'); + + out.push(' }'); + const tmp: string[] = []; + assert.strictEqual(edge.noAdvance, false); + this.tailTo(tmp, { + noAdvance: true, + node: edge.node, + }); + ctx.indent(out, tmp, ' '); + out.push('}'); + out.push('#endif /* __SSE2__ */'); + return true; + } + + private buildSSE42(out: string[]): boolean { const ctx = this.compilation; // Transformation is not supported atm @@ -114,6 +194,7 @@ export class TableLookup extends Node { } out.push('#ifdef __SSE4_2__'); + out.push('// ${}'); out.push(`if (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`); out.push(' __m128i ranges;'); out.push(' __m128i input;'); @@ -166,6 +247,13 @@ export class TableLookup extends Node { return true; } + private buildSSE(out: string[]): boolean { + if (this.buildSSE2(out)){ + return true; + } + return this.buildSSE42(out); + } + private buildTable(): ITable { const table: number[] = new Array(MAX_CHAR + 1).fill(0); diff --git a/src/implementation/c/node/z3-lookup-solver.ts b/src/implementation/c/node/z3-lookup-solver.ts new file mode 100644 index 0000000..9f63ab8 --- /dev/null +++ b/src/implementation/c/node/z3-lookup-solver.ts @@ -0,0 +1,40 @@ +const z3 = require('z3-solver'); +import { runAsWorker } from 'synckit' + +runAsWorker(async (byte_lookup_table: Array) => { + const { Context } = await z3.init(); + const { BitVec, Solver, Int, Array, Select } = new Context('main'); + + const tlo = Array.const('TLO', Int.sort(), BitVec.sort(8)); + const thi = Array.const('THI', Int.sort(), BitVec.sort(8)); + const lut = Array.const('LUT', Int.sort(), BitVec.sort(8)); + + const solver = new Solver(); + + for (let i = 0; i < 256; i++) { + if (byte_lookup_table[i] > 0) { + solver.add(Select(lut, i).neq(BitVec.val(0, 8))); + } else { + solver.add(Select(lut, i).eq(BitVec.val(0, 8))); + 
} + + solver.add(Select(tlo, i & 0xf).and(Select(thi, i >> 4)).eq(Select(lut, i))); + } + + const solved = await solver.check(); + if (solved === 'unsat') { + return null; + } + + const model = await solver.model(); + + let aa = []; + let bb = []; + + for (let i = 0; i < 16; i++) { + aa.push(Number(model.eval(Select(tlo, i)).value())); + bb.push(Number(model.eval(Select(thi, i)).value())); + } + + return [ aa, bb]; +}) \ No newline at end of file From 7602ea1a26b60e48c88b9a7ec1fc42e41e98ee1f Mon Sep 17 00:00:00 2001 From: ngrodzitski Date: Tue, 10 Oct 2023 21:48:24 +0200 Subject: [PATCH 2/2] Fix SSE families The previous commit actually uses SSSE3 instruction. --- src/implementation/c/compilation.ts | 2 +- src/implementation/c/node/table-lookup.ts | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/implementation/c/compilation.ts b/src/implementation/c/compilation.ts index 2a8c4b4..ae0fe77 100644 --- a/src/implementation/c/compilation.ts +++ b/src/implementation/c/compilation.ts @@ -21,7 +21,7 @@ type WrappedNode = frontend.IWrap; // The SSE versions in use with the generator. 
export enum SseFamily { - SSE2 = 'SSE2', + SSSE3 = 'SSSE3', SSE4_2 = 'SSE4_2', } diff --git a/src/implementation/c/node/table-lookup.ts b/src/implementation/c/node/table-lookup.ts index 95fb887..84e2892 100644 --- a/src/implementation/c/node/table-lookup.ts +++ b/src/implementation/c/node/table-lookup.ts @@ -66,7 +66,7 @@ export class TableLookup extends Node { out.push('}'); } - private buildSSE2(out: string[]): boolean { + private buildSSSE3(out: string[]): boolean { // return false; const ctx = this.compilation; @@ -99,10 +99,10 @@ export class TableLookup extends Node { return false; } - const blob1 = ctx.blob(Buffer.from(result[0]), SSE_ALIGNMENT, SseFamily.SSE2); - const blob2 = ctx.blob(Buffer.from(result[1]), SSE_ALIGNMENT, SseFamily.SSE2); + const blob1 = ctx.blob(Buffer.from(result[0]), SSE_ALIGNMENT, SseFamily.SSSE3); + const blob2 = ctx.blob(Buffer.from(result[1]), SSE_ALIGNMENT, SseFamily.SSSE3); - out.push('#ifdef __SSE2__'); + out.push('#ifdef __SSSE3__'); out.push(`if (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`); out.push(' int index;'); out.push(' __m128i lut_tlo;'); @@ -141,7 +141,7 @@ export class TableLookup extends Node { }); ctx.indent(out, tmp, ' '); out.push('}'); - out.push('#endif /* __SSE2__ */'); + out.push('#endif /* __SSSE3__ */'); return true; } @@ -248,7 +248,7 @@ export class TableLookup extends Node { } private buildSSE(out: string[]): boolean { - if (this.buildSSE2(out)){ + if (this.buildSSSE3(out)){ return true; } return this.buildSSE42(out);