
Commit d47d150

Revert "Break long query line in chunks #712"

This reverts commit ff2948e.

1 parent ff2948e commit d47d150

1 file changed: +1 −125 lines changed

src/licensedcode/query.py

Lines changed: 1 addition & 125 deletions
@@ -97,9 +97,6 @@ def logger_debug(*args):
     return logger.debug(' '.join(isinstance(a, basestring) and a or repr(a) for a in args))
 
 
-MAX_TOKENS_PER_LINE = 100
-
-
 def build_query(location=None, query_string=None, idx=None):
     """
     Return a Query built from location or query string given an index.
@@ -186,7 +183,7 @@ def __init__(self, location=None, query_string=None, idx=None,
         if _test_mode:
             return
 
-        self.tokenize_and_build_chunked_runs(self.token_slices_by_line(tokenizer=tokenizer), line_threshold=line_threshold)
+        self.tokenize_and_build_runs(self.tokens_by_line(tokenizer=tokenizer), line_threshold=line_threshold)
 
         # sets of integers initialized after query tokenization
         len_junk = idx.len_junk
@@ -278,56 +275,6 @@ def tokens_by_line(self, tokenizer=query_tokenizer):
         # for intersection with the query span for scoring matches
         self.unknowns_span = Span(unknowns_pos)
 
-    def token_slices_by_line(self, tokenizer=query_tokenizer, tokens_per_line=MAX_TOKENS_PER_LINE):
-        """
-        Yield a list of token chunks for each line in this query.
-        Populate the query `line_by_pos`, `unknowns_by_pos` and
-        `shorts_and_digits_pos` as a side effect.
-        """
-        # bind frequently called functions to local scope
-        line_by_pos_append = self.line_by_pos.append
-        self_unknowns_by_pos = self.unknowns_by_pos
-        unknowns_pos = set()
-        unknowns_pos_add = unknowns_pos.add
-        self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
-        dic_get = self.idx.dictionary.get
-
-        # note: positions start at zero
-        # this is the absolute position, including the unknown tokens
-        abs_pos = -1
-        # lines start at one
-        line_start = 1
-
-        # this is a relative position, excluding the unknown tokens
-        known_pos = -1
-
-        started = False
-        for lnum, line in enumerate(query_lines(self.location, self.query_string), line_start):
-            line_tokens = []
-            line_tokens_append = line_tokens.append
-            for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
-                tid = dic_get(token)
-                if tid is not None:
-                    known_pos += 1
-                    started = True
-                    line_by_pos_append(lnum)
-                    if len(token) == 1 or token.isdigit():
-                        self_shorts_and_digits_pos_add(known_pos)
-                else:
-                    # we have not yet started
-                    if not started:
-                        self_unknowns_by_pos[-1] += 1
-                    else:
-                        self_unknowns_by_pos[known_pos] += 1
-                        unknowns_pos_add(known_pos)
-                line_tokens_append(tid)
-            yield [line_tokens[i:i + tokens_per_line] for i in xrange(0, len(line_tokens), tokens_per_line)]
-
-        # finally create a Span of positions followed by unknowns, used
-        # for intersection with the query span for scoring matches
-        self.unknowns_span = Span(unknowns_pos)
-
-
     def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
         """
         Tokenize this query and populate tokens and query_runs at each break point.
@@ -398,77 +345,6 @@ def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
             map(print, self.query_runs)
 
 
-    def tokenize_and_build_chunked_runs(self, token_slices_by_line, line_threshold=4):
-        """
-        Tokenize this query and populate tokens and query_runs at each break point.
-        Only keep known token ids but consider unknown token ids to break a query in
-        runs.
-
-        `token_slices_by_line` is the output of the self.token_slices_by_line() method.
-        `line_threshold` is the number of empty or junk lines to break a new run.
-        """
-        len_junk = self.idx.len_junk
-
-        # initial query run
-        query_run = QueryRun(query=self, start=0)
-
-        # break in runs based on a threshold of lines that are either empty, all
-        # unknown or all low id/junk tokens.
-        empty_lines = 0
-
-        # token positions start at zero
-        pos = 0
-
-        # bind frequently called functions to local scope
-        tokens_append = self.tokens.append
-        query_runs_append = self.query_runs.append
-
-        for token_slice in token_slices_by_line:
-            for tokens in token_slice:
-                # have we reached a run break point?
-                if len(query_run) > 0 and empty_lines >= line_threshold:
-                    # start new query run
-                    query_runs_append(query_run)
-                    query_run = QueryRun(query=self, start=pos)
-                    empty_lines = 0
-
-                if len(query_run) == 0:
-                    query_run.start = pos
-
-                if not tokens:
-                    empty_lines += 1
-                    continue
-
-                line_has_known_tokens = False
-                line_has_good_tokens = False
-
-                for token_id in tokens:
-                    if token_id is not None:
-                        tokens_append(token_id)
-                        line_has_known_tokens = True
-                        if token_id >= len_junk:
-                            line_has_good_tokens = True
-                        query_run.end = pos
-                        pos += 1
-
-                if not line_has_known_tokens:
-                    empty_lines += 1
-                    continue
-
-                if line_has_good_tokens:
-                    empty_lines = 0
-                else:
-                    empty_lines += 1
-
-        # append final run if any
-        if len(query_run) > 0:
-            self.query_runs.append(query_run)
-
-        if TRACE:
-            logger_debug('Query runs for query:', self.location)
-            map(print, self.query_runs)
-
-
 class QueryRun(object):
     """
     A query run is a slice of query tokens identified by a start and end positions
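
For context on what this revert removes: token_slices_by_line() split each line's token ids into chunks of at most MAX_TOKENS_PER_LINE, so tokenize_and_build_chunked_runs() could break query runs inside a single very long line. Below is a minimal standalone sketch of that chunking idiom; chunk_line_tokens is a hypothetical name for illustration only, not part of the licensedcode.query API.

# Standalone sketch (Python 3) of the chunking removed by this revert.
# Token ids are ints for known tokens and None for unknown tokens.

MAX_TOKENS_PER_LINE = 100  # the constant deleted in the first hunk


def chunk_line_tokens(line_tokens, tokens_per_line=MAX_TOKENS_PER_LINE):
    """Split one line's token ids into chunks of at most tokens_per_line."""
    return [line_tokens[i:i + tokens_per_line]
            for i in range(0, len(line_tokens), tokens_per_line)]


# A 250-token line becomes chunks of 100, 100 and 50 tokens, and the chunked
# runs builder treated each chunk like its own line when deciding run breaks.
line_tokens = list(range(250))
assert [len(c) for c in chunk_line_tokens(line_tokens)] == [100, 100, 50]

# After the revert, tokens_by_line() yields the whole 250-token line as one
# sequence again, so run breaks depend only on real line boundaries.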
