@@ -65,8 +65,6 @@ Idle mem after: 120 MB
 
 */
 
-var runtimeId = 1;
-
 var filterDict = {};
 var filterDictFrozenCount = 0;
 var filterIndex = {};
@@ -82,7 +80,6 @@ var reToken = /[%0-9A-Za-z]{2,}/g;
 /******************************************************************************/
 
 var FilterEntry = function(token) {
-    this.id = runtimeId++;
     this.token = token;
     this.prefix = '';
     this.suffix = '';
@@ -101,7 +98,6 @@ FilterEntry.prototype.matchString = function(s, tokenBeg, tokenEnd) {
 // Reset all, thus reducing to a minimum memory footprint of the context.
 
 var reset = function() {
-    runtimeId = 1;
     filterDict = {};
     filterDictFrozenCount = 0;
     filterIndex = {};
@@ -110,9 +106,10 @@ var reset = function() {
 /******************************************************************************/
 
 // Given a string, find a good token. Tokens which are too generic, i.e. very
-// common while likely to be false positives, are not good, if possible.
-// These are collated manually. This has a *significant* positive impact on
+// common with a high probability of ending up as a false positive, are not
+// good. Avoid if possible. This has a *significant* positive impact on
 // performance.
+// These "bad tokens" are collated manually.
 
 var badTokens = {
     'com': true,
@@ -207,136 +204,34 @@ var add = function(s) {
     var suffixKey = suffix.length > 0 ? suffix.charAt(0) : '0';
 
     var fidx = filterIndex;
-    if ( fidx[token] === undefined ) {
-        fidx[token] = {};
-    }
-    var listkey = prefixKey + suffixKey;
-    if ( fidx[token][listkey] === undefined ) {
-        fidx[token][listkey] = [filter.id];
+    var tokenKey = prefixKey + token + suffixKey;
+    var tokenEntry = fidx[tokenKey];
+    if ( tokenEntry === undefined ) {
+        fidx[tokenKey] = filter;
+    } else if ( tokenEntry instanceof FilterEntry ) {
+        fidx[tokenKey] = [tokenEntry, filter];
     } else {
-        fidx[token][listkey].push(filter.id);
+        tokenEntry.push(filter);
     }
 
     return true;
 };
 
-
-/******************************************************************************/
-
-var mergeSubdict = function(token) {
-    var tokenEntry = filterIndex[token];
-    if ( tokenEntry === undefined ) {
-        return;
-    }
-    var list = [];
-    var value;
-    for ( var key in tokenEntry ) {
-        if ( !tokenEntry.hasOwnProperty(key) ) {
-            continue;
-        }
-        value = tokenEntry[key];
-        if ( typeof value === 'number' ) {
-            list.push(value);
-        } else {
-            list = list.concat(value);
-        }
-    }
-    filterIndex[token] = list.join(' ');
-};
-
 /******************************************************************************/
 
 var freeze = function() {
-    // TODO: find out if JS engine translate the stringified id into
-    // a number internally. I would think not, but if so, than there might
-    // be a performance hit. The JS array results in a smaller memory
-    // footprint... Need to evaluate the optimal representation.
-    var farr = [];
-    var fdict = filterDict;
-
-    var f;
-    for ( var s in fdict ) {
-        if ( !fdict.hasOwnProperty(s) ) {
-            continue;
-        }
-        f = fdict[s];
-        farr[f.id] = f;
-    }
-    filterDict = farr;
-
-    var tokenEntry;
-    var key, value;
-    var lastKey;
-    var kCount, vCount, vCountTotal;
-    var tokenCountMax, kCountMax, vCountMax = 0;
-    for ( var token in filterIndex ) {
-        if ( !filterIndex.hasOwnProperty(token) ) {
-            continue;
-        }
-        tokenEntry = filterIndex[token];
-        kCount = vCount = vCountTotal = 0;
-        for ( key in tokenEntry ) {
-            if ( !tokenEntry.hasOwnProperty(key) ) {
-                continue;
-            }
-            // No need to mutate to a string if there is only one
-            // element in the array.
-            lastKey = key;
-            value = tokenEntry[key];
-            kCount += 1;
-            vCount = value.length;
-            vCountTotal += vCount;
-            if ( vCount < 2 ) {
-                tokenEntry[key] = value[0];
-            } else {
-                tokenEntry[key] = value.join(' ');
-            }
-            if ( vCount > vCountMax ) {
-                tokenCountMax = token;
-                kCountMax = key;
-                vCountMax = vCount;
-            }
-        }
-        // Merge all sub-dicts into a single one at token dict level, if there
-        // is not enough keys or values to justify the overhead.
-        // Also, no need for a sub-dict if there is only one key.
-        if ( kCount < 2 ) {
-            filterIndex[token] = tokenEntry[lastKey];
-            continue;
-        }
-        if ( vCountTotal < 4 ) {
-            mergeSubdict(token);
-            continue;
-        }
-    }
-
-    filterDictFrozenCount = farr.length;
-
-    // console.log('Dict stats:');
-    // console.log('\tToken count:', Object.keys(filterIndex).length);
-    // console.log('\tLargest list: "%s %s" has %d ids', tokenCountMax, kCountMax, vCountMax);
+    filterDictFrozenCount = Object.keys(filterDict).length;
+    filterDict = null;
 };
 
 /******************************************************************************/
 
-var matchFromFilterIndex = function(s, tokenBeg, tokenEnd, index) {
-    return filterDict[index].matchString(s, tokenBeg, tokenEnd);
-};
-
-/******************************************************************************/
-
-var matchFromFilterIndices = function(s, tokenBeg, tokenEnd, indices) {
-    var indicesEnd = indices.length;
-    var indexBeg = 0, indexEnd;
-    while ( indexBeg < indicesEnd ) {
-        indexEnd = indices.indexOf(' ', indexBeg);
-        if ( indexEnd < 0 ) {
-            indexEnd = indicesEnd;
-        }
-        if ( filterDict[indices.slice(indexBeg, indexEnd)].matchString(s, tokenBeg, tokenEnd) ) {
+var matchFromFilterArray = function(s, tokenBeg, tokenEnd, filters) {
+    var i = filters.length;
+    while ( i-- ) {
+        if ( filters[i].matchString(s, tokenBeg, tokenEnd) ) {
             return true;
         }
-        indexBeg = indexEnd + 1;
     }
     return false;
 };
@@ -347,54 +242,54 @@ var matchFromSomething = function(s, tokenBeg, tokenEnd, something) {
     if ( something === undefined ) {
         return false;
     }
-    if ( typeof something === 'number' ) {
-        return filterDict[something].matchString(s, tokenBeg, tokenEnd);
-    }
-    if ( typeof something === 'string' ) {
-        return matchFromFilterIndices(s, tokenBeg, tokenEnd, something);
-    }
     if ( something instanceof FilterEntry ) {
         return something.matchString(s, tokenBeg, tokenEnd);
     }
-    return false;
+    return matchFromFilterArray(s, tokenBeg, tokenEnd, something);
 };
 
 /******************************************************************************/
 
 var matchString = function(s) {
-    if ( filterDictFrozenCount === 0 ) {
-        return false;
-    }
-
+    var sLen = s.length;
     var matches;
-    var token, tokenEntry;
+    var token;
     var tokenBeg, tokenEnd;
     var prefixKey, suffixKey;
+    var fidx = filterIndex;
 
     reToken.lastIndex = 0;
     while ( matches = reToken.exec(s) ) {
         token = matches[0];
-        tokenEntry = filterIndex[token];
-        if ( tokenEntry === undefined ) {
-            continue;
-        }
         tokenBeg = matches.index;
         tokenEnd = reToken.lastIndex;
-        if ( typeof tokenEntry !== 'object' ) {
-            if ( matchFromSomething(s, tokenBeg, tokenEnd, tokenEntry) ) {
+        prefixKey = tokenBeg > 0 ? s.charAt(matches.index - 1) : '0';
+        suffixKey = tokenEnd < s.length ? s.charAt(tokenEnd) : '0';
+
+        if ( tokenBeg > 0 && tokenEnd < sLen ) {
+            if ( matchFromSomething(s, tokenBeg, tokenEnd, fidx[prefixKey + token + suffixKey]) ||
+                 matchFromSomething(s, tokenBeg, tokenEnd, fidx[prefixKey + token + '0']) ||
+                 matchFromSomething(s, tokenBeg, tokenEnd, fidx['0' + token + suffixKey]) ||
+                 matchFromSomething(s, tokenBeg, tokenEnd, fidx['0' + token + '0']) ) {
                 return true;
             }
             continue;
         }
-        prefixKey = tokenBeg > 0 ? s.charAt(matches.index - 1) : '0';
-        suffixKey = tokenEnd < s.length ? s.charAt(tokenEnd) : '0';
-        if ( matchFromSomething(s, tokenBeg, tokenEnd, tokenEntry[prefixKey + suffixKey]) ) {
-            return true;
+        if ( tokenBeg > 0 ) {
+            if ( matchFromSomething(s, tokenBeg, tokenEnd, fidx[prefixKey + token + '0']) ||
+                 matchFromSomething(s, tokenBeg, tokenEnd, fidx['0' + token + '0']) ) {
+                return true;
+            }
+            continue;
         }
-        if ( matchFromSomething(s, tokenBeg, tokenEnd, tokenEntry[prefixKey + '0']) ) {
-            return true;
+        if ( tokenEnd < sLen ) {
+            if ( matchFromSomething(s, tokenBeg, tokenEnd, fidx['0' + token + suffixKey]) ||
+                 matchFromSomething(s, tokenBeg, tokenEnd, fidx['0' + token + '0']) ) {
+                return true;
+            }
+            continue;
         }
-        if ( matchFromSomething(s, tokenBeg, tokenEnd, tokenEntry['0' + suffixKey]) ) {
+        if ( matchFromSomething(s, tokenBeg, tokenEnd, fidx['0' + token + '0']) ) {
             return true;
         }
     }
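
Summary of the data-structure change shown above: the former two-level index (token, then a prefix+suffix sub-dictionary, then a string of numeric filter ids) is collapsed into one flat dictionary keyed by prefixKey + token + suffixKey, whose values are either a single FilterEntry or an array of FilterEntry objects, with '0' standing in for "no prefix/suffix constraint". The sketch below is illustrative only and not part of the commit; the helper names indexFilter and lookup are hypothetical, and a plain Array check stands in for the FilterEntry instanceof test used in add().

var fidx = {};  // flat index: one key per (prefix, token, suffix) triple

// Register a filter under its composite key. A single filter is stored
// directly; a second filter under the same key promotes the slot to an array.
var indexFilter = function(prefixKey, token, suffixKey, filter) {
    var tokenKey = prefixKey + token + suffixKey;
    var entry = fidx[tokenKey];
    if ( entry === undefined ) {
        fidx[tokenKey] = filter;
    } else if ( entry instanceof Array ) {
        entry.push(filter);
    } else {
        fidx[tokenKey] = [entry, filter];
    }
};

// Look up a token found in a URL: probe the fully keyed entry first, then the
// three '0'-wildcard fallbacks, mirroring the four probes in matchString().
var lookup = function(prefixKey, token, suffixKey) {
    return fidx[prefixKey + token + suffixKey] ||
           fidx[prefixKey + token + '0'] ||
           fidx['0' + token + suffixKey] ||
           fidx['0' + token + '0'];
};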