@@ -7,6 +7,11 @@ import { Node } from './base';
// Largest byte value handled by the lookup table (the table is allocated with
// `MAX_CHAR + 1` entries).
77const MAX_CHAR = 0xff ;
// NOTE(review): not referenced in the visible part of this file; presumably
// the number of table entries grouped per emitted row — confirm.
88const TABLE_GROUP = 16 ;
99
10+ // _mm_cmpestri compares against at most 8 ranges, i.e. 16 bytes of
// (low, high) bounds. This is the number of range bytes passed per call.
11+ const SSE_RANGES_LEN = 16 ;
// Upper bound on the number of `_mm_cmpestri` calls emitted for one node;
// nodes needing more than `MAX_SSE_CALLS * SSE_RANGES_LEN` range bytes are
// not vectorized.
12+ const MAX_SSE_CALLS = 2 ;
// Alignment requested for the blob that holds the range bytes.
13+ const SSE_ALIGNMENT = 16 ;
14+
1015interface ITable {
1116 readonly name : string ;
1217 readonly declaration : ReadonlyArray < string > ;
@@ -24,8 +29,13 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
2429 this . prologue ( out ) ;
2530
2631 const transform = ctx . unwrapTransform ( this . ref . transform ! ) ;
27- const current = transform . build ( ctx , `*${ ctx . posArg ( ) } ` ) ;
2832
33+ // Try to vectorize nodes matching characters and looping to themselves
34+ // NOTE: `switch` below triggers when there is not enough characters in the
35+ // stream for vectorized processing.
36+ this . buildSSE ( out ) ;
37+
38+ const current = transform . build ( ctx , `*${ ctx . posArg ( ) } ` ) ;
2939 out . push ( `switch (${ table . name } [(uint8_t) ${ current } ]) {` ) ;
3040
3141 for ( const [ index , edge ] of this . ref . edges . entries ( ) ) {
@@ -53,6 +63,102 @@ export class TableLookup extends Node<frontend.node.TableLookup> {
5363 out . push ( '}' ) ;
5464 }
5565
/**
 * Emit an optional SSE4.2 fast path for a node that matches a set of
 * characters and loops back to itself.
 *
 * The fast path is only generated when:
 *  - the node has no transform, or the transform is named `id`;
 *  - the node has exactly one edge;
 *  - that edge points back at this very node.
 *
 * The edge keys are collapsed into inclusive `[first, last]` byte ranges and
 * handed to `_mm_cmpestri` in chunks of `SSE_RANGES_LEN` bytes. The emitted C
 * is wrapped in `#ifdef __SSE4_2__` and only runs when at least 16 input
 * bytes are available; otherwise control falls through to whatever the caller
 * emits after this method.
 *
 * NOTE(review): the emitted C declares `int avail;` but no line emitted by
 * this method references it.
 *
 * @param out  Output lines of generated C code; appended to in place.
 * @returns `true` if the vectorized code was emitted, `false` if the node is
 *          not eligible (nothing is written to `out` in that case).
 */
private buildSSE(out: string[]): boolean {
  const ctx = this.compilation;

  // Transformation is not supported atm
  if (this.ref.transform && this.ref.transform.ref.name !== 'id') {
    return false;
  }

  // Only the "single edge looping to itself" shape is vectorized
  if (this.ref.edges.length !== 1) {
    return false;
  }

  const edge = this.ref.edges[0];
  if (edge.node.ref !== this.ref) {
    return false;
  }

  // Collapse runs of consecutive keys into inclusive (first, last) pairs.
  // NOTE: keys are sorted
  const ranges: number[] = [];
  let first: number | undefined;
  let last: number | undefined;
  for (const key of edge.keys) {
    if (first === undefined) {
      first = key;
    }
    if (last === undefined) {
      last = key;
    }

    // A gap between keys closes the current range and opens a new one
    if (key - last > 1) {
      ranges.push(first, last);
      first = key;
    }
    last = key;
  }
  if (first !== undefined && last !== undefined) {
    ranges.push(first, last);
  }

  if (ranges.length === 0) {
    return false;
  }

  // Way too many calls would be required
  if (ranges.length > MAX_SSE_CALLS * SSE_RANGES_LEN) {
    return false;
  }

  out.push('#ifdef __SSE4_2__');
  out.push(`if (${ctx.endPosArg()} - ${ctx.posArg()} >= 16) {`);
  out.push(' __m128i ranges;');
  out.push(' __m128i input;');
  out.push(' int avail;');
  out.push(' int match_len;');
  out.push('');
  out.push(' /* Load input */');
  out.push(` input = _mm_loadu_si128((__m128i const*) ${ctx.posArg()} );`);
  for (let off = 0; off < ranges.length; off += SSE_RANGES_LEN) {
    const subRanges = ranges.slice(off, off + SSE_RANGES_LEN);

    // `_mm_loadu_si128` always reads a full 16 bytes, so a shorter chunk must
    // be zero-padded to avoid reading past the end of the blob. The padding
    // is never compared: `_mm_cmpestri` receives the real length explicitly.
    const padded = Buffer.alloc(SSE_RANGES_LEN);
    Buffer.from(subRanges).copy(padded);

    const blob = ctx.blob(padded, SSE_ALIGNMENT);
    out.push(` ranges = _mm_loadu_si128((__m128i const*) ${blob} );`);
    out.push('');

    out.push(' /* Find first character that does not match `ranges` */');
    out.push(` match_len = _mm_cmpestri(ranges, ${subRanges.length} ,`);
    out.push(' input, 16,');
    out.push(' _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES |');
    out.push(' _SIDD_NEGATIVE_POLARITY);');
    out.push('');
    out.push(' if (match_len != 0) {');
    out.push(` ${ctx.posArg()} += match_len;`);

    // The position was already advanced by `match_len` above, so re-enter
    // the same node without advancing again.
    const tmp: string[] = [];
    assert.strictEqual(edge.noAdvance, false);
    this.tailTo(tmp, {
      noAdvance: true,
      node: edge.node,
    });
    ctx.indent(out, tmp, ' ');

    out.push(' }');
  }

  // No chunk matched the first input byte: take the `otherwise` edge
  {
    const tmp: string[] = [];
    this.tailTo(tmp, this.ref.otherwise!);
    ctx.indent(out, tmp, ' ');
  }
  out.push('}');

  out.push('#endif /* __SSE4_2__ */');

  return true;
}
161+
56162 // TODO(indutny): reduce copy-paste between `C` and `bitcode` implementations
57163 private buildTable ( ) : ITable {
58164 const table : number [ ] = new Array ( MAX_CHAR + 1 ) . fill ( 0 ) ;
0 commit comments