Skip to content

Commit 25d4378

Browse files
committed
c: vectorize table lookups
Use SSE4.2 vector instructions to speed up matching. When a node has a lot of characters to match and loops to itself, the table lookups can be replaced with a call to `_mm_cmpestri`, skipping all matching characters in the input stream. PR-URL: #24
1 parent 2a53b39 commit 25d4378

File tree

5 files changed

+169
-11
lines changed

5 files changed

+169
-11
lines changed

src/implementation/c/compilation.ts

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ const BLOB_GROUP_SIZE = 11;
1919

2020
type WrappedNode = frontend.IWrap<frontend.node.Node>;
2121

22+
interface IBlob {
23+
readonly alignment: number | undefined;
24+
readonly buffer: Buffer;
25+
readonly name: string;
26+
}
27+
2228
// TODO(indutny): deduplicate
2329
export interface ICompilationOptions {
2430
readonly debug?: string;
@@ -32,7 +38,7 @@ export interface ICompilationProperty {
3238

3339
export class Compilation {
3440
private readonly stateMap: Map<string, ReadonlyArray<string>> = new Map();
35-
private readonly blobs: Map<Buffer, string> = new Map();
41+
private readonly blobs: Map<Buffer, IBlob> = new Map();
3642
private readonly codeMap: Map<string, Code<frontend.code.Code>> = new Map();
3743
private readonly matchSequence:
3844
Map<string, MatchSequence> = new Map();
@@ -64,14 +70,20 @@ export class Compilation {
6470
return;
6571
}
6672

67-
for (const [ blob, name ] of this.blobs) {
68-
out.push(`static const unsigned char ${name}[] = {`);
73+
for (const blob of this.blobs.values()) {
74+
const buffer = blob.buffer;
75+
let align = '';
76+
if (blob.alignment) {
77+
align = ` ALIGN(${blob.alignment})`;
78+
}
6979

70-
for (let i = 0; i < blob.length; i += BLOB_GROUP_SIZE) {
71-
const limit = Math.min(blob.length, i + BLOB_GROUP_SIZE);
80+
out.push(`static const unsigned char${align} ${blob.name}[] = {`);
81+
82+
for (let i = 0; i < buffer.length; i += BLOB_GROUP_SIZE) {
83+
const limit = Math.min(buffer.length, i + BLOB_GROUP_SIZE);
7284
const hex: string[] = [];
7385
for (let j = i; j < limit; j++) {
74-
const value = blob[j] as number;
86+
const value = buffer[j] as number;
7587

7688
const ch = String.fromCharCode(value);
7789
// `'`, `\`
@@ -84,7 +96,7 @@ export class Compilation {
8496
}
8597
}
8698
let line = ' ' + hex.join(', ');
87-
if (limit !== blob.length) {
99+
if (limit !== buffer.length) {
88100
line += ',';
89101
}
90102
out.push(line);
@@ -304,12 +316,17 @@ export class Compilation {
304316
return JSON.stringify(value);
305317
}
306318

307-
public blob(value: Buffer): string {
319+
public blob(value: Buffer, alignment?: number): string {
308320
if (this.blobs.has(value)) {
309-
return this.blobs.get(value)!;
321+
return this.blobs.get(value)!.name;
310322
}
323+
311324
const res = BLOB_PREFIX + this.blobs.size;
312-
this.blobs.set(value, res);
325+
this.blobs.set(value, {
326+
alignment,
327+
buffer: value,
328+
name: res,
329+
});
313330
return res;
314331
}
315332
}

src/implementation/c/index.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,27 @@ export class CCompiler {
3535
out.push('#include <stdlib.h>');
3636
out.push('#include <stdint.h>');
3737
out.push('#include <string.h>');
38+
out.push('');
39+
40+
// NOTE: Inspired by https://github.com/h2o/picohttpparser
41+
// TODO(indutny): Windows support for SSE4.2.
42+
// See: https://github.com/nodejs/llparse/pull/24#discussion_r299789676
43+
// (There is no `__SSE4_2__` define for MSVC)
44+
out.push('#ifdef __SSE4_2__');
45+
out.push(' #ifdef _MSC_VER');
46+
out.push(' #include <nmmintrin.h>');
47+
out.push(' #else /* !_MSC_VER */');
48+
out.push(' #include <x86intrin.h>');
49+
out.push(' #endif /* _MSC_VER */');
50+
out.push('#endif /* __SSE4_2__ */');
51+
out.push('');
52+
53+
out.push('#ifdef _MSC_VER');
54+
out.push(' #define ALIGN(n) _declspec(align(n))');
55+
out.push('#else /* !_MSC_VER */');
56+
out.push(' #define ALIGN(n) __attribute__((aligned(n)))');
57+
out.push('#endif /* _MSC_VER */');
58+
3859
out.push('');
3960
out.push(`#include "${this.options.header || info.prefix}.h"`);
4061
out.push(``);

src/implementation/c/node/table-lookup.ts

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ import { Node } from './base';
77
const MAX_CHAR = 0xff;
88
const TABLE_GROUP = 16;
99

10+
// _mm_cmpestri takes up to 8 ranges, i.e. 16 bytes (one low/high pair each)
11+
const SSE_RANGES_LEN = 16;
12+
const MAX_SSE_CALLS = 2;
13+
const SSE_ALIGNMENT = 16;
14+
1015
interface ITable {
1116
readonly name: string;
1217
readonly declaration: ReadonlyArray<string>;
@@ -24,8 +29,13 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
2429
this.prologue(out);
2530

2631
const transform = ctx.unwrapTransform(this.ref.transform!);
27-
const current = transform.build(ctx, `*${ctx.posArg()}`);
2832

33+
// Try to vectorize nodes matching characters and looping to themselves
34+
// NOTE: `switch` below triggers when there are not enough characters in the
35+
// stream for vectorized processing.
36+
this.buildSSE(out);
37+
38+
const current = transform.build(ctx, `*${ctx.posArg()}`);
2939
out.push(`switch (${table.name}[(uint8_t) ${current}]) {`);
3040

3141
for (const [ index, edge ] of this.ref.edges.entries()) {
@@ -53,6 +63,102 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
5363
out.push('}');
5464
}
5565

66+
private buildSSE(out: string[]): boolean {
67+
const ctx = this.compilation;
68+
69+
// Transformations are not supported at the moment
70+
if (this.ref.transform && this.ref.transform.ref.name !== 'id') {
71+
return false;
72+
}
73+
74+
if (this.ref.edges.length !== 1) {
75+
return false;
76+
}
77+
78+
const edge = this.ref.edges[0];
79+
if (edge.node.ref !== this.ref) {
80+
return false;
81+
}
82+
83+
// NOTE: keys are sorted
84+
let ranges: number[] = [];
85+
let first: number | undefined;
86+
let last: number | undefined;
87+
for (const key of edge.keys) {
88+
if (first === undefined) {
89+
first = key;
90+
}
91+
if (last === undefined) {
92+
last = key;
93+
}
94+
95+
if (key - last > 1) {
96+
ranges.push(first, last);
97+
first = key;
98+
}
99+
last = key;
100+
}
101+
if (first !== undefined && last !== undefined) {
102+
ranges.push(first, last);
103+
}
104+
105+
if (ranges.length === 0) {
106+
return false;
107+
}
108+
109+
// Way too many calls would be required
110+
if (ranges.length > MAX_SSE_CALLS * SSE_RANGES_LEN) {
111+
return false;
112+
}
113+
114+
out.push('#ifdef __SSE4_2__');
115+
out.push(`if (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`);
116+
out.push(' __m128i ranges;');
117+
out.push(' __m128i input;');
118+
out.push(' int avail;');
119+
out.push(' int match_len;');
120+
out.push('');
121+
out.push(' /* Load input */');
122+
out.push(` input = _mm_loadu_si128((__m128i const*) ${ctx.posArg()});`);
123+
for (let off = 0; off < ranges.length; off += SSE_RANGES_LEN) {
124+
const subRanges = ranges.slice(off, off + SSE_RANGES_LEN);
125+
126+
const blob = ctx.blob(Buffer.from(subRanges), SSE_ALIGNMENT);
127+
out.push(` ranges = _mm_loadu_si128((__m128i const*) ${blob});`);
128+
out.push('');
129+
130+
out.push(' /* Find first character that does not match `ranges` */');
131+
out.push(` match_len = _mm_cmpestri(ranges, ${subRanges.length},`);
132+
out.push(' input, 16,');
133+
out.push(' _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES |');
134+
out.push(' _SIDD_NEGATIVE_POLARITY);');
135+
out.push('');
136+
out.push(' if (match_len != 0) {');
137+
out.push(` ${ctx.posArg()} += match_len;`);
138+
139+
const tmp: string[] = [];
140+
assert.strictEqual(edge.noAdvance, false);
141+
this.tailTo(tmp, {
142+
noAdvance: true,
143+
node: edge.node,
144+
});
145+
ctx.indent(out, tmp, ' ');
146+
147+
out.push(' }');
148+
}
149+
150+
{
151+
const tmp: string[] = [];
152+
this.tailTo(tmp, this.ref.otherwise!);
153+
ctx.indent(out, tmp, ' ');
154+
}
155+
out.push('}');
156+
157+
out.push('#endif /* __SSE4_2__ */');
158+
159+
return true;
160+
}
161+
56162
// TODO(indutny): reduce copy-paste between `C` and `bitcode` implementations
57163
private buildTable(): ITable {
58164
const table: number[] = new Array(MAX_CHAR + 1).fill(0);

test/compiler-test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,19 @@ describe('llparse/Compiler', () => {
200200
const binary = build(p, start, 'escape-char');
201201
await binary.check('\\\'', 'off=1\noff=2\n');
202202
});
203+
204+
it('should hit SSE4.2 optimization for table-lookup', async () => {
205+
const start = p.node('start');
206+
207+
start
208+
.match(ALPHA, start)
209+
.skipTo(printOff(p, start));
210+
211+
// TODO(indutny): validate compilation result?
212+
const binary = build(p, start, 'match-bit-check-sse');
213+
await binary.check('abcdabcdabcdabcdabcdabcdabcd.abcd.',
214+
'off=29\noff=34\n');
215+
});
203216
});
204217

205218
describe('`.peek()`', () => {

test/fixtures/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export { ERROR_PAUSE } from 'llparse-test-fixture';
99
const fixtures = new Fixture({
1010
buildDir: path.join(__dirname, '..', 'tmp'),
1111
extra: [
12+
'-msse4.2',
1213
'-DLLPARSE__TEST_INIT=llparse__test_init',
1314
path.join(__dirname, 'extra.c'),
1415
],

0 commit comments

Comments
 (0)