Skip to content

Commit 24d7707

Browse files
committed
Renamed short but cryptic variables idoc/qdoc/ipos/qpos to long form.
* query_doc and query_position * index_doc and index_position
1 parent 32e8153 commit 24d7707

File tree

5 files changed

+168
-168
lines changed

5 files changed

+168
-168
lines changed

src/licensedcode/detect.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ def detect_license(location=None, perfect=True):
6969
# yielding the default license if provided
7070
for detected_license in match.rule.licenses:
7171
yield (detected_license,
72-
match.qpos.start_line, match.qpos.end_line,
73-
match.qpos.start_char, match.qpos.end_char,
72+
match.query_position.start_line, match.query_position.end_line,
73+
match.query_position.start_char, match.query_position.end_char,
7474
match.rule.identifier,
7575
match.score,)
7676

@@ -160,8 +160,8 @@ def match(self, location, perfect=True):
160160
for rule_id, matched_pos in matches.items():
161161
rule = self.rules_by_id[rule_id]
162162
for match in matched_pos:
163-
ipos, qpos = match
164-
lmatch = LicenseMatch(rule, qpos, ipos, score=100)
163+
index_position, query_position = match
164+
lmatch = LicenseMatch(rule, query_position, index_position, score=100)
165165
license_matches.append(lmatch)
166166
return filter_matches(license_matches)
167167

@@ -185,20 +185,20 @@ class LicenseMatch(object):
185185
- the span of the matched region: start and end positions of the analyzed
186186
text where the rule was matched.
187187
188-
- ipos and qpos: the detailed position Token of the match and matched to texts
188+
- index_position and query_position: the detailed position Token of the match and matched to texts
189189
- score: a float normalized between 0 and 100. Higher means better.
190190
Exact match score is always 100.
191191
"""
192192

193-
def __init__(self, rule, qpos, ipos=None, score=0):
193+
def __init__(self, rule, query_position, index_position=None, score=0):
194194
self.rule = rule
195195

196196
# pos matched, for reference (such as displaying matches)
197-
self.ipos = increment_line_numbers(ipos)
198-
self.qpos = increment_line_numbers(qpos)
197+
self.index_position = increment_line_numbers(index_position)
198+
self.query_position = increment_line_numbers(query_position)
199199

200200
# position span
201-
self.span = Span(qpos.start, qpos.end)
201+
self.span = Span(query_position.start, query_position.end)
202202
self.score = score
203203

204204
def __repr__(self):

src/licensedcode/index.py

Lines changed: 76 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -61,16 +61,16 @@ def build_empty_indexes(ngram_len):
6161
1. The unigrams index is in indexes[1] with this structure:
6262
{1:
6363
{
64-
u1: {idocid1: [posting_list1], idocid2: [posting_list2]},
65-
u2: {idocid1: [posting_list3], idocid3: [posting_list4]}
64+
u1: {index_docid1: [posting_list1], index_docid2: [posting_list2]},
65+
u2: {index_docid1: [posting_list3], index_docid3: [posting_list4]}
6666
}
6767
}
6868
6969
2. The bigrams index is in indexes[2] with this structure:
7070
{2:
7171
{
72-
u3, u4: {idocid1: [posting_list7], idocid2: [posting_list6]},
73-
u5, u6: {idocid1: [posting_list5], idocid3: [posting_list8]}
72+
u3, u4: {index_docid1: [posting_list7], index_docid2: [posting_list6]},
73+
u5, u6: {index_docid1: [posting_list5], index_docid3: [posting_list8]}
7474
}
7575
}
7676
and so on, until ngram_len
@@ -98,13 +98,13 @@ class Index(object):
9898
against these reference documents.
9999
100100
Terms used here:
101-
- idoc: indexed document
102-
- idocid: indexed document ID
103-
- qdoc: query document
104-
- qdocid: query document ID
101+
- index_doc: indexed document
102+
- index_docid: indexed document ID
103+
- query_doc: query document
104+
- query_docid: query document ID
105105
106106
We use several inverted indexes mapping a Token value to a list of
107-
per Token positions for each indexed document ID (idocid): There is one
107+
per Token positions for each indexed document ID (index_docid): There is one
108108
index for every ngram length from one up to ngram_len.
109109
110110
These multiple indexes handle cases where a query document text to
@@ -122,15 +122,15 @@ class Index(object):
122122
lengths.
123123
124124
These cases are supported:
125-
- small idoc or qdoc with fewer tokens than ngram length.
125+
- small index_doc or query_doc with fewer tokens than ngram length.
126126
127127
- small regions of text between two template regions with fewer tokens
128128
than an ngram length.
129129
130-
- small regions of text at the beginning of an idoc just before a
130+
- small regions of text at the beginning of an index_doc just before a
131131
template region and with fewer tokens than an ngram length.
132132
133-
- small regions of text at the end of an idoc and just after a template
133+
- small regions of text at the end of an index_doc and just after a template
134134
region and with fewer tokens than an ngram length.
135135
"""
136136

@@ -141,14 +141,14 @@ def __init__(self, ngram_len=analysis.DEFAULT_NGRAM_LEN):
141141
# the nested indexes structure
142142
self.indexes = build_empty_indexes(ngram_len)
143143

144-
# a mapping of docid to a count of Tokens in an idoc
145-
self.tokens_count_per_idoc = {}
144+
# a mapping of docid to a count of Tokens in an index_doc
145+
self.tokens_count_per_index_doc = {}
146146

147-
def get_tokens_count(self, idocid):
148-
return self.tokens_count_per_idoc[idocid]
147+
def get_tokens_count(self, index_docid):
148+
return self.tokens_count_per_index_doc[index_docid]
149149

150-
def set_tokens_count(self, idocid, val):
151-
self.tokens_count_per_idoc[idocid] = val
150+
def set_tokens_count(self, index_docid, val):
151+
self.tokens_count_per_index_doc[index_docid] = val
152152

153153
def index_one(self, docid, doc, template=False):
154154
"""
@@ -210,30 +210,30 @@ def match(self, query_doc, perfect=True):
210210

211211
all_results = defaultdict(list)
212212

213-
by_ipos_start = lambda x: x[0].start
213+
by_index_position_start = lambda x: x[0].start
214214

215215
# first find contiguous matches
216216
for docid, matches in candidate_matches.items():
217-
for idx, match in enumerate(sorted(matches, key=by_ipos_start)):
218-
ipos, qpos = match
219-
# perfect contiguous matches must start at ipos 0
220-
if ipos.start != 0:
217+
for idx, match in enumerate(sorted(matches, key=by_index_position_start)):
218+
index_position, query_position = match
219+
# perfect contiguous matches must start at index_position 0
220+
if index_position.start != 0:
221221
break
222222
else:
223223
# TODO: "if not perfect " if we are not starting at 0
224224
# collect partial matches
225225
pass
226-
# start of a possible full match at ipos 0
226+
# start of a possible full match at index_position 0
227227
subset = matches[idx + 1:]
228228

229229
if DEBUG:
230230
lsub = len(subset) + 1
231231
print(' Index.match: about to align %(lsub)r '
232232
'candidate matches for %(docid)r:\n'
233-
'ipos: %(ipos)r\nqpos: %(qpos)r\n'
233+
'index_position: %(index_position)r\nquery_position: %(query_position)r\n'
234234
% locals())
235235

236-
matched_positions = self.align_matches(ipos, qpos, subset)
236+
matched_positions = self.align_matches(index_position, query_position, subset)
237237

238238
if DEBUG:
239239
lmp = len(matched_positions)
@@ -256,18 +256,18 @@ def match(self, query_doc, perfect=True):
256256
filtered = self.filter_matches(all_results, perfect)
257257
return filtered
258258

259-
def align_matches(self, cur_ipos, cur_qpos, matches):
259+
def align_matches(self, cur_index_position, cur_query_position, matches):
260260
"""
261261
Given a first match and subsequent potential matches, try to find a
262262
longer match skipping eventual gaps to yield the best alignment.
263263
264264
This is how ngrams are handled with ngram_len of 3:
265265
-----------------------------------------------
266-
With this idoc and this qdoc:
267-
idoc: name is joker, name is joker
266+
With this index_doc and this query_doc:
267+
index_doc: name is joker, name is joker
268268
ngrams: name is joker, is joker name, joker name is, name is joker
269269
0 1 2 3
270-
qdoc: Hi my name is joker, name is joker yes.
270+
query_doc: Hi my name is joker, name is joker yes.
271271
ngrams: hi my name, my name is, name is joker, is joker name, joker name is, name is joker, is joker yes
272272
0 1 2 3 4 5 6
273273
will yield these candidates:
@@ -280,19 +280,19 @@ def align_matches(self, cur_ipos, cur_qpos, matches):
280280
281281
And this is how gaps are handled:
282282
------------------------------
283-
With this idoc and this qdoc::
284-
idoc: my name is {{2 Joe}} the joker
283+
With this index_doc and this query_doc::
284+
index_doc: my name is {{2 Joe}} the joker
285285
i0 i1 i2-g2 i3 i4
286-
qdoc: Yet, my name is Jane Heinz the joker.
286+
query_doc: Yet, my name is Jane Heinz the joker.
287287
q0 q1 q2 q3 q4 q5 q6 q7
288288
will yield these candidates:
289289
i0, q1
290290
i1, q2
291291
i2-g2, q3
292292
i3, q6 : here q6 <= q3 + 1 + g2
293293
i4, q7
294-
With the same idoc and this qdoc:
295-
qdoc: Yet, my name is Jane the joker.
294+
With the same index_doc and this query_doc:
295+
query_doc: Yet, my name is Jane the joker.
296296
q0 q1 q2 q3 q4 q5 q6
297297
will yield these candidates:
298298
i0, q1
@@ -303,71 +303,71 @@ def align_matches(self, cur_ipos, cur_qpos, matches):
303303
"""
304304

305305
# add first match
306-
matched = [(cur_ipos, cur_qpos,)]
306+
matched = [(cur_index_position, cur_query_position,)]
307307
cumulative_gap = 0
308308

309309
if DEBUG_ALIGN:
310310
print()
311311

312312
for match in iter(matches):
313-
prev_ipos, prev_qpos = matched[-1]
314-
cumulative_gap += prev_ipos.gap
315-
cur_ipos, cur_qpos = match
313+
prev_index_position, prev_query_position = matched[-1]
314+
cumulative_gap += prev_index_position.gap
315+
cur_index_position, cur_query_position = match
316316

317317
if DEBUG_ALIGN:
318318
print(''.join(['Index.aligned match: positions \n',
319-
' prev_ipos: %(start)r %(end)r %(value)r\n'
320-
% prev_ipos._asdict(),
321-
' cur_ipos : %(start)r %(end)r %(value)r\n'
322-
% cur_ipos._asdict(),
323-
' prev_qpos: %(start)r %(end)r %(value)r\n'
324-
% prev_qpos._asdict(),
325-
' cur_qpos : %(start)r %(end)r %(value)r'
326-
% cur_qpos._asdict(),
319+
' prev_index_position: %(start)r %(end)r %(value)r\n'
320+
% prev_index_position._asdict(),
321+
' cur_index_position : %(start)r %(end)r %(value)r\n'
322+
% cur_index_position._asdict(),
323+
' prev_query_position: %(start)r %(end)r %(value)r\n'
324+
% prev_query_position._asdict(),
325+
' cur_query_position : %(start)r %(end)r %(value)r'
326+
% cur_query_position._asdict(),
327327
]))
328328

329-
print('Index.aligned match: prev_ipos.start:%d < '
330-
'cur_ipos.start:%d <= prev_ipos.end + 1:%d'
331-
% (prev_ipos.start, cur_ipos.start,
332-
prev_ipos.end + 1,))
329+
print('Index.aligned match: prev_index_position.start:%d < '
330+
'cur_index_position.start:%d <= prev_index_position.end + 1:%d'
331+
% (prev_index_position.start, cur_index_position.start,
332+
prev_index_position.end + 1,))
333333

334-
if prev_ipos.start < cur_ipos.start <= prev_ipos.end + 1:
334+
if prev_index_position.start < cur_index_position.start <= prev_index_position.end + 1:
335335

336336
if DEBUG_ALIGN:
337337
print('Index.aligned match: possible contiguous tokens')
338338

339-
# we are contiguous in ipos: are we contiguous in qpos?
340-
if prev_qpos.start + 1 == cur_qpos.start:
339+
# we are contiguous in index_position: are we contiguous in query_position?
340+
if prev_query_position.start + 1 == cur_query_position.start:
341341

342342
if DEBUG_ALIGN:
343343
print('Index.aligned match: Keeping contiguous '
344-
'tokens: prev_qpos.start + 1 '
345-
'== cur_qpos.start\n')
344+
'tokens: prev_query_position.start + 1 '
345+
'== cur_query_position.start\n')
346346

347-
matched.append((cur_ipos, cur_qpos,))
347+
matched.append((cur_index_position, cur_query_position,))
348348
continue
349349
else:
350350
# we are not contiguous, but could we be when gaps are
351351
# considered?
352352

353353
if DEBUG_ALIGN:
354354
print('Index.aligned match: '
355-
'prev_qpos.start:%d < cur_qpos.start:%d '
356-
'<= prev_qpos.start + 1 + cumulative_gap '
355+
'prev_query_position.start:%d < cur_query_position.start:%d '
356+
'<= prev_query_position.start + 1 + cumulative_gap '
357357
'+ self.ngram_len: %d' %
358-
(prev_qpos.start, cur_qpos.start,
359-
prev_qpos.start + cumulative_gap
358+
(prev_query_position.start, cur_query_position.start,
359+
prev_query_position.start + cumulative_gap
360360
+ self.ngram_len,))
361361

362-
if (prev_qpos.start < cur_qpos.start and
363-
cur_qpos.start <= (prev_qpos.start + cumulative_gap + self.ngram_len)):
362+
if (prev_query_position.start < cur_query_position.start and
363+
cur_query_position.start <= (prev_query_position.start + cumulative_gap + self.ngram_len)):
364364
# we are contiguous gap-wise, keep this match
365365

366366
if DEBUG_ALIGN:
367367
print('Index.aligned match: '
368368
'Keeping gap-wise contiguous tokens\n')
369369

370-
matched.append((cur_ipos, cur_qpos,))
370+
matched.append((cur_index_position, cur_query_position,))
371371
continue
372372
else:
373373
if DEBUG_ALIGN:
@@ -392,11 +392,11 @@ def candidates(self, query_doc):
392392
% len(query_doc))
393393
print(u''.join(query_doc))
394394
print()
395-
qdoc = iter(query_doc)
395+
query_doc = iter(query_doc)
396396

397-
# map idocid -> sorted set of tuples (ipos, qpos)
397+
# map index_docid -> sorted set of tuples (index_position, query_position)
398398
candidate_matches = defaultdict(list)
399-
# iterate over qdoc tokens using query_tknzr
399+
# iterate over query_doc tokens using query_tknzr
400400
for qtoken in self.query_tknzr(query_doc):
401401

402402
if DEBUG_CANDIDATES:
@@ -434,22 +434,22 @@ def filter_matches(self, all_matches, perfect=True):
434434
kept_results = defaultdict(list)
435435
for docid, matches in all_matches.iteritems():
436436
tok_cnt = self.get_tokens_count(docid)
437-
for ipos, qpos in matches:
438-
# perfect matches length must match the idoc token count
437+
for index_position, query_position in matches:
438+
# perfect matches length must match the index_doc token count
439439
# the token count is 1-based, the end is zero-based
440-
if tok_cnt == ipos.end + 1:
441-
kept_results[docid].append((ipos, qpos))
440+
if tok_cnt == index_position.end + 1:
441+
kept_results[docid].append((index_position, query_position))
442442
return kept_results
443443

444444

445445
def merge_aligned_positions(positions):
446446
"""
447-
Given a sequence of tuples of (idoc, qdoc) Token positions, return a single
448-
tuple of new (idoc, qdoc) Token positions representing the merged positions
449-
from every ipos and every qpos.
447+
Given a sequence of tuples of (index_doc, query_doc) Token positions, return a single
448+
tuple of new (index_doc, query_doc) Token positions representing the merged positions
449+
from every index_position and every query_position.
450450
"""
451-
idocs, qdocs = zip(*positions)
452-
return merge_positions(idocs), merge_positions(qdocs)
451+
index_docs, query_docs = zip(*positions)
452+
return merge_positions(index_docs), merge_positions(query_docs)
453453

454454

455455
def merge_positions(positions):

src/scancode/api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,8 @@ def get_licenses(location=None):
9494
'dejacode_url': DEJACODE_LICENSE_URL.format(license.key),
9595
'spdx_license_key': license.spdx_license_key,
9696
'spdx_url': license.spdx_url,
97-
'start_line': match.qpos.start_line,
98-
'end_line': match.qpos.end_line,
97+
'start_line': match.query_position.start_line,
98+
'end_line': match.query_position.end_line,
9999
}
100100

101101

0 commit comments

Comments
 (0)