@@ -97,9 +97,6 @@ def logger_debug(*args):
     return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))


-MAX_TOKENS_PER_LINE = 100
-
-
 def build_query(location=None, query_string=None, idx=None):
     """
     Return a Query built from location or query string given an index.
@@ -186,7 +183,7 @@ def __init__(self, location=None, query_string=None, idx=None,
         if _test_mode:
             return

-        self.tokenize_and_build_chunked_runs(self.token_slices_by_line(tokenizer=tokenizer), line_threshold=line_threshold)
+        self.tokenize_and_build_runs(self.tokens_by_line(tokenizer=tokenizer), line_threshold=line_threshold)

         # sets of integers initialized after query tokenization
         len_junk = idx.len_junk
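
Note: this hunk reverts `Query.__init__` from the chunked pipeline back to the plain per-line one. `tokens_by_line()` yields one list of token ids per line, with `None` standing in for tokens missing from the index dictionary, and `tokenize_and_build_runs()` consumes those lists. A minimal sketch of that per-line shape, using a toy `dictionary` and tokenizer rather than the real index:

```python
# Toy illustration of the per-line token-id shape; the dictionary and the
# regex tokenizer are stand-ins, not the scancode-toolkit API.
import re

dictionary = {'the': 0, 'gpl': 5, 'license': 6}  # token -> id; low ids are junk

def tokens_by_line(text):
    # One list per line: known tokens map to their ids, unknown tokens to None.
    for line in text.splitlines():
        yield [dictionary.get(tok) for tok in re.findall(r'[a-z0-9]+', line.lower())]

print(list(tokens_by_line('The GPL license\n\nfoo bar')))
# [[0, 5, 6], [], [None, None]]
```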
@@ -278,56 +275,6 @@ def tokens_by_line(self, tokenizer=query_tokenizer):
         # for intersection with the query span for scoring matches
         self.unknowns_span = Span(unknowns_pos)

-    def token_slices_by_line(self, tokenizer=query_tokenizer, tokens_per_line=MAX_TOKENS_PER_LINE):
-        """
-        Yield a list of token chunks for each line in this query.
-        Populate the query `line_by_pos`, `unknowns_by_pos`, `unknowns_span`
-        and `shorts_and_digits_pos` as a side effect.
-        """
-        # bind frequently called functions to local scope
-        line_by_pos_append = self.line_by_pos.append
-        self_unknowns_by_pos = self.unknowns_by_pos
-        unknowns_pos = set()
-        unknowns_pos_add = unknowns_pos.add
-        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
-        dic_get = self.idx.dictionary.get
-
-        # note: positions start at zero
-        # this is the absolute position, including the unknown tokens
-        abs_pos = -1
-        # lines start at one
-        line_start = 1
-
-        # this is a relative position, excluding the unknown tokens
-        known_pos = -1
-
-        started = False
-        for lnum, line in enumerate(query_lines(self.location, self.query_string), line_start):
-            line_tokens = []
-            line_tokens_append = line_tokens.append
-            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
-                tid = dic_get(token)
-                if tid is not None:
-                    known_pos += 1
-                    started = True
-                    line_by_pos_append(lnum)
-                    if len(token) == 1 or token.isdigit():
-                        self_shorts_and_digits_pos_add(known_pos)
-                else:
-                    # we have not yet started
-                    if not started:
-                        self_unknowns_by_pos[-1] += 1
-                    else:
-                        self_unknowns_by_pos[known_pos] += 1
-                        unknowns_pos_add(known_pos)
-                line_tokens_append(tid)
-            yield [line_tokens[i:i + tokens_per_line] for i in xrange(0, len(line_tokens), tokens_per_line)]
-
-        # finally create a Span of positions followed by unknowns, used
-        # for intersection with the query span for scoring matches
-        self.unknowns_span = Span(unknowns_pos)
-
-
     def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
         """
         Tokenize this query and populate tokens and query_runs at each break point.
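
For reference, the `token_slices_by_line()` method removed above differed from `tokens_by_line()` only in its last step: it sliced each line's token list into chunks of at most `tokens_per_line` ids (the now-deleted `MAX_TOKENS_PER_LINE`, i.e. 100). A sketch of that slicing idiom, with `range` standing in for the Python 2 `xrange` used in the original and a small chunk size for readability:

```python
# Slice a flat list of token ids into fixed-size chunks, as the removed
# yield expression did; tokens_per_line=4 here instead of the original 100.
def chunked(line_tokens, tokens_per_line=4):
    return [line_tokens[i:i + tokens_per_line]
            for i in range(0, len(line_tokens), tokens_per_line)]

print(chunked([1, 2, 3, 4, 5, 6, 7, 8, 9]))
# [[1, 2, 3, 4], [5, 6, 7, 8], [9]]
```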
@@ -398,77 +345,6 @@ def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
             map(print, self.query_runs)


-    def tokenize_and_build_chunked_runs(self, token_slices_by_line, line_threshold=4):
-        """
-        Tokenize this query and populate tokens and query_runs at each break point.
-        Only keep known token ids, but use unknown token ids to break the query
-        into runs.
-
-        `token_slices_by_line` is the output of the self.token_slices_by_line() method.
-        `line_threshold` is the number of empty or junk lines after which a new run starts.
-        """
-        len_junk = self.idx.len_junk
-
-        # initial query run
-        query_run = QueryRun(query=self, start=0)
-
-        # break into runs based on a threshold of lines that are either empty,
-        # all unknown or all low-id/junk tokens.
-        empty_lines = 0
-
-        # token positions start at zero
-        pos = 0
-
-        # bind frequently called functions to local scope
-        tokens_append = self.tokens.append
-        query_runs_append = self.query_runs.append
-
-        for token_slice in token_slices_by_line:
-            for tokens in token_slice:
-                # have we reached a run break point?
-                if (len(query_run) > 0 and empty_lines >= line_threshold):
-                    # start new query run
-                    query_runs_append(query_run)
-                    query_run = QueryRun(query=self, start=pos)
-                    empty_lines = 0
-
-                if len(query_run) == 0:
-                    query_run.start = pos
-
-                if not tokens:
-                    empty_lines += 1
-                    continue
-
-                line_has_known_tokens = False
-                line_has_good_tokens = False
-
-                for token_id in tokens:
-                    if token_id is not None:
-                        tokens_append(token_id)
-                        line_has_known_tokens = True
-                        if token_id >= len_junk:
-                            line_has_good_tokens = True
-                        query_run.end = pos
-                        pos += 1
-
-                if not line_has_known_tokens:
-                    empty_lines += 1
-                    continue
-
-                if line_has_good_tokens:
-                    empty_lines = 0
-                else:
-                    empty_lines += 1
-
-        # append final run if any
-        if len(query_run) > 0:
-            self.query_runs.append(query_run)
-
-        if TRACE:
-            logger_debug('Query runs for query:', self.location)
-            map(print, self.query_runs)
-
-
 class QueryRun(object):
     """
     A query run is a slice of query tokens identified by start and end positions
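
The run-breaking rule that the removed `tokenize_and_build_chunked_runs()` shared with the surviving `tokenize_and_build_runs()`: a new `QueryRun` starts once `line_threshold` consecutive lines are empty, all-unknown, or contain only junk ids below `len_junk`. A toy sketch of that rule using plain lists instead of `QueryRun` objects; the names and defaults here are illustrative, not the scancode-toolkit API:

```python
# Toy sketch of the run-breaking rule: start a new run once `line_threshold`
# consecutive lines are empty, all-unknown (None), or all-junk (id < len_junk).
def break_runs(lines, len_junk=3, line_threshold=2):
    runs, current, empty_lines = [], [], 0
    for tokens in lines:
        if current and empty_lines >= line_threshold:
            runs.append(current)          # close the current run at a break point
            current, empty_lines = [], 0
        known = [t for t in tokens if t is not None]
        current.extend(known)             # only known token ids are kept
        if any(t >= len_junk for t in known):
            empty_lines = 0               # a "good" token resets the break counter
        else:
            empty_lines += 1              # empty, unknown-only, or junk-only line
    if current:
        runs.append(current)
    return runs

print(break_runs([[5, 1], [None], [], [7, 2]]))
# [[5, 1], [7, 2]]
```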