@@ -97,6 +97,9 @@ def logger_debug(*args):
     return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))


+MAX_TOKENS_PER_LINE = 100
+
+
 def build_query(location=None, query_string=None, idx=None):
     """
     Return a Query built from location or query string given an index.
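
The new MAX_TOKENS_PER_LINE constant caps how many token ids one chunk may carry, so a very long line is split into several fixed-size slices. A minimal sketch of that slicing idiom, outside the diff (the chunk_line helper name is hypothetical; the real code inlines this as a list comprehension in token_slices_by_line):

    def chunk_line(line_tokens, tokens_per_line=100):
        # split one line's token ids into chunks of at most tokens_per_line ids
        return [line_tokens[i:i + tokens_per_line]
                for i in xrange(0, len(line_tokens), tokens_per_line)]

    # a 250-token line yields three chunks of 100, 100 and 50 token ids
    assert map(len, chunk_line(range(250))) == [100, 100, 50]
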
@@ -183,7 +186,7 @@ def __init__(self, location=None, query_string=None, idx=None,
         if _test_mode:
             return

-        self.tokenize_and_build_runs(self.tokens_by_line(tokenizer=tokenizer), line_threshold=line_threshold)
+        self.tokenize_and_build_chunked_runs(self.token_slices_by_line(tokenizer=tokenizer), line_threshold=line_threshold)

         # sets of integers initialized after query tokenization
         len_junk = idx.len_junk
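
From a caller's perspective nothing changes: Query still takes a location or query string plus an index, and only the internal tokenization path is swapped. A hedged usage sketch, assuming idx is an already-built index exposing a dictionary and len_junk:

    # tokens and query_runs are now populated via the chunked path
    qry = Query(query_string='licensed under the GPL', idx=idx)
    print(qry.tokens)       # known token ids only
    print(qry.query_runs)   # runs built from per-line token slices
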
@@ -275,6 +278,56 @@ def tokens_by_line(self, tokenizer=query_tokenizer):
         # for intersection with the query span for scoring matches
         self.unknowns_span = Span(unknowns_pos)

+    def token_slices_by_line(self, tokenizer=query_tokenizer, tokens_per_line=MAX_TOKENS_PER_LINE):
+        """
+        Yield a list of token sequence chunks for each line in this query.
+        Populate the query `line_by_pos`, `unknowns_by_pos` and
+        `shorts_and_digits_pos` as a side effect.
+        """
+        # bind frequently called functions to local scope
+        line_by_pos_append = self.line_by_pos.append
+        self_unknowns_by_pos = self.unknowns_by_pos
+        unknowns_pos = set()
+        unknowns_pos_add = unknowns_pos.add
+        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
+        dic_get = self.idx.dictionary.get
+
+        # note: positions start at zero
+        # this is the absolute position, including the unknown tokens
+        abs_pos = -1
+        # lines start at one
+        line_start = 1
+
+        # this is a relative position, excluding the unknown tokens
+        known_pos = -1
+
+        started = False
+        for lnum, line in enumerate(query_lines(self.location, self.query_string), line_start):
+            line_tokens = []
+            line_tokens_append = line_tokens.append
+            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
+                tid = dic_get(token)
+                if tid is not None:
+                    known_pos += 1
+                    started = True
+                    line_by_pos_append(lnum)
+                    if len(token) == 1 or token.isdigit():
+                        self_shorts_and_digits_pos_add(known_pos)
+                else:
+                    # we have not yet started
+                    if not started:
+                        self_unknowns_by_pos[-1] += 1
+                    else:
+                        self_unknowns_by_pos[known_pos] += 1
+                        unknowns_pos_add(known_pos)
+                line_tokens_append(tid)
+            yield [line_tokens[i:i + tokens_per_line] for i in xrange(0, len(line_tokens), tokens_per_line)]
+
+        # finally create a Span of positions followed by unknowns, used
+        # for intersection with the query span for scoring matches
+        self.unknowns_span = Span(unknowns_pos)
+
+
     def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
         """
         Tokenize this query and populate tokens and query_runs at each break point.
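
To see how the known_pos/unknowns_by_pos bookkeeping in token_slices_by_line behaves, here is a small self-contained sketch; the dictionary and token stream are made up, and a plain dict stands in for what is presumably a defaultdict(int) on the query:

    # hypothetical dictionary mapping known tokens to integer ids
    dictionary = {'gpl': 5, 'license': 12}
    stream = ['random', 'gpl', 'license', 'xyzzy', 'gpl']

    known_pos = -1
    unknowns_by_pos = {}
    for token in stream:
        if dictionary.get(token) is not None:
            known_pos += 1
        else:
            # unknowns seen before any known token accumulate under key -1
            key = known_pos if known_pos >= 0 else -1
            unknowns_by_pos[key] = unknowns_by_pos.get(key, 0) + 1

    # 'random' precedes all known tokens; 'xyzzy' follows known position 1
    assert unknowns_by_pos == {-1: 1, 1: 1}
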
@@ -345,6 +398,77 @@ def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
             map(print, self.query_runs)


+    def tokenize_and_build_chunked_runs(self, token_slices_by_line, line_threshold=4):
+        """
+        Tokenize this query and populate tokens and query_runs at each break point.
+        Only keep known token ids but consider unknown token ids to break a query in
+        runs.
+
+        `token_slices_by_line` is the output of the self.token_slices_by_line() method.
+        `line_threshold` is the number of empty or junk lines that triggers a new run.
+        """
+        len_junk = self.idx.len_junk
+
+        # initial query run
+        query_run = QueryRun(query=self, start=0)
+
+        # break into runs based on a threshold of lines that are either empty, all
+        # unknown or all low id/junk tokens.
+        empty_lines = 0
+
+        # token positions start at zero
+        pos = 0
+
+        # bind frequently called functions to local scope
+        tokens_append = self.tokens.append
+        query_runs_append = self.query_runs.append
+
+        for token_slice in token_slices_by_line:
+            for tokens in token_slice:
+                # have we reached a run break point?
+                if len(query_run) > 0 and empty_lines >= line_threshold:
+                    # start a new query run
+                    query_runs_append(query_run)
+                    query_run = QueryRun(query=self, start=pos)
+                    empty_lines = 0
+
+                if len(query_run) == 0:
+                    query_run.start = pos
+
+                if not tokens:
+                    empty_lines += 1
+                    continue
+
+                line_has_known_tokens = False
+                line_has_good_tokens = False
+
+                for token_id in tokens:
+                    if token_id is not None:
+                        tokens_append(token_id)
+                        line_has_known_tokens = True
+                        if token_id >= len_junk:
+                            line_has_good_tokens = True
+                        query_run.end = pos
+                        pos += 1
+
+                if not line_has_known_tokens:
+                    empty_lines += 1
+                    continue
+
+                if line_has_good_tokens:
+                    empty_lines = 0
+                else:
+                    empty_lines += 1
+
+        # append the final run if any
+        if len(query_run) > 0:
+            self.query_runs.append(query_run)
+
+        if TRACE:
+            logger_debug('Query runs for query:', self.location)
+            map(print, self.query_runs)
+
+
 class QueryRun(object):
     """
     A query run is a slice of query tokens identified by a start and end positions
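
The run-breaking policy above can be read in isolation: a new run starts only after line_threshold consecutive chunks that are empty, all unknown, or junk-only. A simplified standalone sketch of that policy, ignoring QueryRun position bookkeeping (break_runs is a hypothetical helper, not part of the codebase):

    def break_runs(chunks, len_junk, line_threshold=4):
        # chunks is an iterable of token-id lists; None marks an unknown token
        runs = [[]]
        empty = 0
        for tokens in chunks:
            # break point: current run is non-empty and enough dead chunks seen
            if runs[-1] and empty >= line_threshold:
                runs.append([])
                empty = 0
            known = [t for t in tokens if t is not None]
            if not known:
                empty += 1
                continue
            runs[-1].extend(known)
            # a chunk with at least one good (non-junk) id resets the counter
            empty = 0 if any(t >= len_junk for t in known) else empty + 1
        return [r for r in runs if r]

    # with len_junk=10, ids below 10 are junk; four consecutive empty or
    # junk-only chunks force a break before the [1] chunk
    chunks = [[20, 21], [None], [], [2], [None, 3], [1], [25]]
    assert break_runs(chunks, len_junk=10) == [[20, 21, 2, 3], [1, 25]]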