@@ -61,16 +61,16 @@ def build_empty_indexes(ngram_len):
6161 1. The unigrams index is in indexes[1] with this structure:
6262 {1:
6363 {
64- u1: {idocid1 : [posting_list1], idocid2 : [posting_list2]},
65- u2: {idocid1 : [posting_list3], idocid3 : [posting_list4]}
64+ u1: {index_docid1 : [posting_list1], index_docid2 : [posting_list2]},
65+ u2: {index_docid1 : [posting_list3], index_docid3 : [posting_list4]}
6666 }
6767 }
6868
6969 2. The bigrams index is in indexes[2] with this structure:
7070 {2:
7171 {
72- u3, u4: {idocid1 : [posting_list7], idocid2 : [posting_list6]},
73- u5, u6: {idocid1 : [posting_list5], idocid3 : [posting_list8]}
72+ u3, u4: {index_docid1 : [posting_list7], index_docid2 : [posting_list6]},
73+ u5, u6: {index_docid1 : [posting_list5], index_docid3 : [posting_list8]}
7474 }
7575 }
7676 and so on, until ngram_len
@@ -98,13 +98,13 @@ class Index(object):
9898 against these reference documents.
9999
100100 Terms used here:
101- - idoc : indexed document
102- - idocid : indexed document ID
103- - qdoc : query document
104- - qdocid : query document ID
101+ - index_doc : indexed document
102+ - index_docid : indexed document ID
103+ - query_doc : query document
104+ - query_docid : query document ID
105105
106106 We use several inverted indexes mapping a Token value to a list of
107- per Token positions for each indexed document ID (idocid ): There is one
107+ per Token positions for each indexed document ID (index_docid ): There is one
108108 index for every ngram length from one up to ngram_len.
109109
110110 These multiple indexes handle cases where a query document text to
@@ -122,15 +122,15 @@ class Index(object):
122122 lengths.
123123
124124 These cases are supported:
125- - small idoc or qdoc with fewer tokens than ngram length.
125+ - small index_doc or query_doc with fewer tokens than ngram length.
126126
127127 - small regions of text between two template regions with fewer tokens
128128 than an ngram length.
129129
130- - small regions of text at the beginning of an idoc just before a
130+ - small regions of text at the beginning of an index_doc just before a
131131 template region and with fewer tokens than an ngram length.
132132
133- - small regions of text at the end of an idoc and just after a template
133+ - small regions of text at the end of an index_doc and just after a template
134134 region and with fewer tokens than an ngram length.
135135 """
136136
@@ -141,14 +141,14 @@ def __init__(self, ngram_len=analysis.DEFAULT_NGRAM_LEN):
141141 # the nested indexes structure
142142 self .indexes = build_empty_indexes (ngram_len )
143143
144- # a mapping of docid to a count of Tokens in an idoc
145- self .tokens_count_per_idoc = {}
144+ # a mapping of docid to a count of Tokens in an index_doc
145+ self .tokens_count_per_index_doc = {}
146146
147- def get_tokens_count (self , idocid ):
148- return self .tokens_count_per_idoc [ idocid ]
147+ def get_tokens_count (self , index_docid ):
148+ return self .tokens_count_per_index_doc [ index_docid ]
149149
150- def set_tokens_count (self , idocid , val ):
151- self .tokens_count_per_idoc [ idocid ] = val
150+ def set_tokens_count (self , index_docid , val ):
151+ self .tokens_count_per_index_doc [ index_docid ] = val
152152
153153 def index_one (self , docid , doc , template = False ):
154154 """
@@ -210,30 +210,30 @@ def match(self, query_doc, perfect=True):
210210
211211 all_results = defaultdict (list )
212212
213- by_ipos_start = lambda x : x [0 ].start
213+ by_index_position_start = lambda x : x [0 ].start
214214
215215 # first find contiguous matches
216216 for docid , matches in candidate_matches .items ():
217- for idx , match in enumerate (sorted (matches , key = by_ipos_start )):
218- ipos , qpos = match
219- # perfect contiguous matches must start at ipos 0
220- if ipos .start != 0 :
217+ for idx , match in enumerate (sorted (matches , key = by_index_position_start )):
218+ index_position , query_position = match
219+ # perfect contiguous matches must start at index_position 0
220+ if index_position .start != 0 :
221221 break
222222 else :
223223 # TODO: "if not perfect " if we are not starting at 0
224224 # collect partial matches
225225 pass
226- # start of a possible full match at ipos 0
226+ # start of a possible full match at index_position 0
227227 subset = matches [idx + 1 :]
228228
229229 if DEBUG :
230230 lsub = len (subset ) + 1
231231 print (' Index.match: about to align %(lsub)r '
232232 'candidate matches for %(docid)r:\n '
233- 'ipos : %(ipos )r\n qpos : %(qpos )r\n '
233+ 'index_position : %(index_position )r\n query_position : %(query_position )r\n '
234234 % locals ())
235235
236- matched_positions = self .align_matches (ipos , qpos , subset )
236+ matched_positions = self .align_matches (index_position , query_position , subset )
237237
238238 if DEBUG :
239239 lmp = len (matched_positions )
@@ -256,18 +256,18 @@ def match(self, query_doc, perfect=True):
256256 filtered = self .filter_matches (all_results , perfect )
257257 return filtered
258258
259- def align_matches (self , cur_ipos , cur_qpos , matches ):
259+ def align_matches (self , cur_index_position , cur_query_position , matches ):
260260 """
261261 Given a first match and subsequent potential matches, try to find a
262262 longer match, skipping any gaps, to yield the best alignment.
263263
264264 This is how ngrams are handled with ngram_len of 3:
265265 ---------------------------------------------------
266- With this idoc and this qdoc :
267- idoc : name is joker, name is joker
266+ With this index_doc and this query_doc :
267+ index_doc : name is joker, name is joker
268268 ngrams: name is joker, is joker name, joker name is, name is joker
269269 0 1 2 3
270- qdoc : Hi my name is joker, name is joker yes.
270+ query_doc : Hi my name is joker, name is joker yes.
271271 ngrams: hi my name, my name is, name is joker, is joker name, joker name is, name is joker, is joker yes
272272 0 1 2 3 4 5 6
273273 will yield these candidates:
@@ -280,19 +280,19 @@ def align_matches(self, cur_ipos, cur_qpos, matches):
280280
281281 And this is how gaps are handled:
282282 ---------------------------------
283- With this idoc and this qdoc ::
284- idoc : my name is {{2 Joe}} the joker
283+ With this index_doc and this query_doc ::
284+ index_doc : my name is {{2 Joe}} the joker
285285 i0 i1 i2-g2 i3 i4
286- qdoc : Yet, my name is Jane Heinz the joker.
286+ query_doc : Yet, my name is Jane Heinz the joker.
287287 q0 q1 q2 q3 q4 q5 q6 q7
288288 will yield these candidates:
289289 i0, q1
290290 i1, q2
291291 i2-g2, q3
292292 i3, q6 : here q6 <= q3 + 1 + g2
293293 i4, q7
294- With the same idoc and this qdoc :
295- qdoc : Yet, my name is Jane the joker.
294+ With the same index_doc and this query_doc :
295+ query_doc : Yet, my name is Jane the joker.
296296 q0 q1 q2 q3 q4 q5 q6
297297 will yield these candidates:
298298 i0, q1
@@ -303,71 +303,71 @@ def align_matches(self, cur_ipos, cur_qpos, matches):
303303 """
304304
305305 # add first match
306- matched = [(cur_ipos , cur_qpos ,)]
306+ matched = [(cur_index_position , cur_query_position ,)]
307307 cumulative_gap = 0
308308
309309 if DEBUG_ALIGN :
310310 print ()
311311
312312 for match in iter (matches ):
313- prev_ipos , prev_qpos = matched [- 1 ]
314- cumulative_gap += prev_ipos .gap
315- cur_ipos , cur_qpos = match
313+ prev_index_position , prev_query_position = matched [- 1 ]
314+ cumulative_gap += prev_index_position .gap
315+ cur_index_position , cur_query_position = match
316316
317317 if DEBUG_ALIGN :
318318 print ('' .join (['Index.aligned match: positions \n ' ,
319- ' prev_ipos : %(start)r %(end)r %(value)r\n '
320- % prev_ipos ._asdict (),
321- ' cur_ipos : %(start)r %(end)r %(value)r\n '
322- % cur_ipos ._asdict (),
323- ' prev_qpos : %(start)r %(end)r %(value)r\n '
324- % prev_qpos ._asdict (),
325- ' cur_qpos : %(start)r %(end)r %(value)r'
326- % cur_qpos ._asdict (),
319+ ' prev_index_position : %(start)r %(end)r %(value)r\n '
320+ % prev_index_position ._asdict (),
321+ ' cur_index_position : %(start)r %(end)r %(value)r\n '
322+ % cur_index_position ._asdict (),
323+ ' prev_query_position : %(start)r %(end)r %(value)r\n '
324+ % prev_query_position ._asdict (),
325+ ' cur_query_position : %(start)r %(end)r %(value)r'
326+ % cur_query_position ._asdict (),
327327 ]))
328328
329- print ('Index.aligned match: prev_ipos .start:%d < '
330- 'cur_ipos .start:%d <= prev_ipos .end + 1:%d'
331- % (prev_ipos .start , cur_ipos .start ,
332- prev_ipos .end + 1 ,))
329+ print ('Index.aligned match: prev_index_position .start:%d < '
330+ 'cur_index_position .start:%d <= prev_index_position .end + 1:%d'
331+ % (prev_index_position .start , cur_index_position .start ,
332+ prev_index_position .end + 1 ,))
333333
334- if prev_ipos .start < cur_ipos .start <= prev_ipos .end + 1 :
334+ if prev_index_position .start < cur_index_position .start <= prev_index_position .end + 1 :
335335
336336 if DEBUG_ALIGN :
337337 print ('Index.aligned match: possible contiguous tokens' )
338338
339- # we are contiguous in ipos : are we contiguous in qpos ?
340- if prev_qpos .start + 1 == cur_qpos .start :
339+ # we are contiguous in index_position : are we contiguous in query_position ?
340+ if prev_query_position .start + 1 == cur_query_position .start :
341341
342342 if DEBUG_ALIGN :
343343 print ('Index.aligned match: Keeping contiguous '
344- 'tokens: prev_qpos .start + 1 '
345- '== cur_qpos .start\n ' )
344+ 'tokens: prev_query_position .start + 1 '
345+ '== cur_query_position .start\n ' )
346346
347- matched .append ((cur_ipos , cur_qpos ,))
347+ matched .append ((cur_index_position , cur_query_position ,))
348348 continue
349349 else :
350350 # we are not contiguous, but could we be when gaps are
351351 # considered?
352352
353353 if DEBUG_ALIGN :
354354 print ('Index.aligned match: '
355- 'prev_qpos .start:%d < cur_qpos .start:%d '
356- '<= prev_qpos .start + 1 + cumulative_gap '
355+ 'prev_query_position .start:%d < cur_query_position .start:%d '
356+ '<= prev_query_position .start + 1 + cumulative_gap '
357357 '+ self.ngram_len: %d' %
358- (prev_qpos .start , cur_qpos .start ,
359- prev_qpos .start + cumulative_gap
358+ (prev_query_position .start , cur_query_position .start ,
359+ prev_query_position .start + cumulative_gap
360360 + self .ngram_len ,))
361361
362- if (prev_qpos .start < cur_qpos .start and
363- cur_qpos .start <= (prev_qpos .start + cumulative_gap + self .ngram_len )):
362+ if (prev_query_position .start < cur_query_position .start and
363+ cur_query_position .start <= (prev_query_position .start + cumulative_gap + self .ngram_len )):
364364 # we are contiguous gap-wise, keep this match
365365
366366 if DEBUG_ALIGN :
367367 print ('Index.aligned match: '
368368 'Keeping gap-wise contiguous tokens\n ' )
369369
370- matched .append ((cur_ipos , cur_qpos ,))
370+ matched .append ((cur_index_position , cur_query_position ,))
371371 continue
372372 else :
373373 if DEBUG_ALIGN :
@@ -392,11 +392,11 @@ def candidates(self, query_doc):
392392 % len (query_doc ))
393393 print (u'' .join (query_doc ))
394394 print ()
395- qdoc = iter (query_doc )
395+ query_doc = iter (query_doc )
396396
397- # map idocid -> sorted set of tuples (ipos, qpos )
397+ # map index_docid -> sorted set of tuples (index_position, query_position )
398398 candidate_matches = defaultdict (list )
399- # iterate over qdoc tokens using query_tknzr
399+ # iterate over query_doc tokens using query_tknzr
400400 for qtoken in self .query_tknzr (query_doc ):
401401
402402 if DEBUG_CANDIDATES :
@@ -434,22 +434,22 @@ def filter_matches(self, all_matches, perfect=True):
434434 kept_results = defaultdict (list )
435435 for docid , matches in all_matches .iteritems ():
436436 tok_cnt = self .get_tokens_count (docid )
437- for ipos , qpos in matches :
438- # perfect matches length must match the idoc token count
437+ for index_position , query_position in matches :
438+ # perfect matches length must match the index_doc token count
439439 # the token count is 1-based, the end is zero-based
440- if tok_cnt == ipos .end + 1 :
441- kept_results [docid ].append ((ipos , qpos ))
440+ if tok_cnt == index_position .end + 1 :
441+ kept_results [docid ].append ((index_position , query_position ))
442442 return kept_results
443443
444444
445445def merge_aligned_positions (positions ):
446446 """
447- Given a sequence of tuples of (idoc, qdoc ) Token positions, return a single
448- tuple of new (idoc, qdoc ) Token positions representing the merged positions
449- from every ipos and every qpos .
447+ Given a sequence of tuples of (index_doc, query_doc ) Token positions, return a single
448+ tuple of new (index_doc, query_doc ) Token positions representing the merged positions
449+ from every index_position and every query_position .
450450 """
451- idocs , qdocs = zip (* positions )
452- return merge_positions (idocs ), merge_positions (qdocs )
451+ index_docs , query_docs = zip (* positions )
452+ return merge_positions (index_docs ), merge_positions (query_docs )
453453
454454
455455def merge_positions (positions ):
0 commit comments