Commit ff2948e

Break long query line in chunks #712

* ensure that more query runs are created for very long lines

Signed-off-by: Philippe Ombredanne <[email protected]>

1 parent 25502a2 commit ff2948e

File tree

1 file changed: +125 -1 lines changed


src/licensedcode/query.py

Lines changed: 125 additions & 1 deletion
@@ -97,6 +97,9 @@ def logger_debug(*args):
     return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
 
 
+MAX_TOKENS_PER_LINE = 100
+
+
 def build_query(location=None, query_string=None, idx=None):
     """
     Return a Query built from location or query string given an index.
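
The new MAX_TOKENS_PER_LINE constant caps how many token ids are grouped into a single chunk when a line is very long. A minimal standalone sketch of the slicing idiom the new method relies on (the chunks() helper is illustrative, not part of the commit):

# illustrative helper, not from the commit: slice a sequence into
# successive chunks of at most `size` items
def chunks(seq, size):
    return [seq[i:i + size] for i in range(0, len(seq), size)]

line_tokens = list(range(250))
for chunk in chunks(line_tokens, 100):
    print(len(chunk))
# prints 100, 100, 50: one 250-token line becomes three chunks, so run
# break points can now occur inside what was a single very long line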
@@ -183,7 +186,7 @@ def __init__(self, location=None, query_string=None, idx=None,
         if _test_mode:
             return
 
-        self.tokenize_and_build_runs(self.tokens_by_line(tokenizer=tokenizer), line_threshold=line_threshold)
+        self.tokenize_and_build_chunked_runs(self.token_slices_by_line(tokenizer=tokenizer), line_threshold=line_threshold)
 
         # sets of integers initialized after query tokenization
         len_junk = idx.len_junk
@@ -275,6 +278,56 @@ def tokens_by_line(self, tokenizer=query_tokenizer):
         # for intersection with the query span for scoring matches
         self.unknowns_span = Span(unknowns_pos)
 
+    def token_slices_by_line(self, tokenizer=query_tokenizer, tokens_per_line=MAX_TOKENS_PER_LINE):
+        """
+        Yield a list of token chunks for each line in this query, where each
+        chunk is a sequence of at most `tokens_per_line` token ids.
+        Populate the query `line_by_pos`, `unknowns_by_pos` and
+        `shorts_and_digits_pos` as a side effect.
+        """
+        # bind frequently called functions to local scope
+        line_by_pos_append = self.line_by_pos.append
+        self_unknowns_by_pos = self.unknowns_by_pos
+        unknowns_pos = set()
+        unknowns_pos_add = unknowns_pos.add
+        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
+        dic_get = self.idx.dictionary.get
+
+        # note: positions start at zero
+        # this is the absolute position, including the unknown tokens
+        abs_pos = -1
+        # lines start at one
+        line_start = 1
+
+        # this is a relative position, excluding the unknown tokens
+        known_pos = -1
+
+        started = False
+        for lnum, line in enumerate(query_lines(self.location, self.query_string), line_start):
+            line_tokens = []
+            line_tokens_append = line_tokens.append
+            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
+                tid = dic_get(token)
+                if tid is not None:
+                    known_pos += 1
+                    started = True
+                    line_by_pos_append(lnum)
+                    if len(token) == 1 or token.isdigit():
+                        self_shorts_and_digits_pos_add(known_pos)
+                else:
+                    # we have not yet started
+                    if not started:
+                        self_unknowns_by_pos[-1] += 1
+                    else:
+                        self_unknowns_by_pos[known_pos] += 1
+                        unknowns_pos_add(known_pos)
+                line_tokens_append(tid)
+            yield [line_tokens[i:i + tokens_per_line] for i in xrange(0, len(line_tokens), tokens_per_line)]
+
+        # finally create a Span of positions followed by unknowns, used
+        # for intersection with the query span for scoring matches
+        self.unknowns_span = Span(unknowns_pos)
+
+
     def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
         """
         Tokenize this query and populate tokens and query_runs at each break point.
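
For context, here is a hypothetical illustration (toy dictionary and a chunk size of 3, neither taken from the commit) of the shape that token_slices_by_line() yields for one line: a list of chunks of token ids, where None marks a token unknown to the index dictionary:

dictionary = {'gpl': 0, 'license': 1, 'version': 2}
tokens_per_line = 3

words = 'gpl license version 2 gpl license version'.split()
line_tokens = [dictionary.get(t) for t in words]
# unknown tokens such as '2' map to None but still occupy a chunk slot
slices = [line_tokens[i:i + tokens_per_line]
          for i in range(0, len(line_tokens), tokens_per_line)]
print(slices)
# [[0, 1, 2], [None, 0, 1], [2]]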
@@ -345,6 +398,77 @@ def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
             map(print, self.query_runs)
 
 
+    def tokenize_and_build_chunked_runs(self, token_slices_by_line, line_threshold=4):
+        """
+        Tokenize this query and populate tokens and query_runs at each break point.
+        Only keep known token ids but consider unknown token ids to break a query in
+        runs.
+
+        `token_slices_by_line` is the output of the self.token_slices_by_line() method.
+        `line_threshold` is the number of empty or junk lines to break a new run.
+        """
+        len_junk = self.idx.len_junk
+
+        # initial query run
+        query_run = QueryRun(query=self, start=0)
+
+        # break in runs based on threshold of lines that are either empty, all
+        # unknown or all low id/junk tokens.
+        empty_lines = 0
+
+        # token positions start at zero
+        pos = 0
+
+        # bind frequently called functions to local scope
+        tokens_append = self.tokens.append
+        query_runs_append = self.query_runs.append
+
+        for token_slice in token_slices_by_line:
+            for tokens in token_slice:
+                # have we reached a run break point?
+                if (len(query_run) > 0 and empty_lines >= line_threshold):
+                    # start new query run
+                    query_runs_append(query_run)
+                    query_run = QueryRun(query=self, start=pos)
+                    empty_lines = 0
+
+                if len(query_run) == 0:
+                    query_run.start = pos
+
+                if not tokens:
+                    empty_lines += 1
+                    continue
+
+                line_has_known_tokens = False
+                line_has_good_tokens = False
+
+                for token_id in tokens:
+                    if token_id is not None:
+                        tokens_append(token_id)
+                        line_has_known_tokens = True
+                        if token_id >= len_junk:
+                            line_has_good_tokens = True
+                        query_run.end = pos
+                        pos += 1
+
+                if not line_has_known_tokens:
+                    empty_lines += 1
+                    continue
+
+                if line_has_good_tokens:
+                    empty_lines = 0
+                else:
+                    empty_lines += 1
+
+        # append final run if any
+        if len(query_run) > 0:
+            self.query_runs.append(query_run)
+
+        if TRACE:
+            logger_debug('Query runs for query:', self.location)
+            map(print, self.query_runs)
+
+
 class QueryRun(object):
     """
     A query run is a slice of query tokens identified by a start and end positions
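
To make the run-breaking behavior of tokenize_and_build_chunked_runs() concrete, here is a simplified, self-contained sketch. The build_runs() helper is illustrative only: the real method tracks QueryRun start/end positions instead of collecting lists, and it also counts a chunk as empty when all of its known token ids are junk (below len_junk):

def build_runs(chunked_lines, line_threshold=4):
    # illustrative stand-in for the commit's run-breaking logic
    runs = []
    current = []
    empty_lines = 0
    for chunk in chunked_lines:
        known = [tid for tid in chunk if tid is not None]
        if not known:
            empty_lines += 1
            # enough consecutive empty chunks: close the current run
            if current and empty_lines >= line_threshold:
                runs.append(current)
                current = []
                empty_lines = 0
            continue
        current.extend(known)
        empty_lines = 0
    if current:
        runs.append(current)
    return runs

chunked = [[1, 2], [None], [None], [None], [None], [3, 4]]
print(build_runs(chunked, line_threshold=4))
# [[1, 2], [3, 4]]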
