Skip to content

Commit c65c833

Browse files
Predd0o, matheusvir, ManoelNetto26, Lucaslg7, RailtonDantas
committed
feat(whoosh): implement SkipLists for documents searching
Co-authored-by: Matheus Virgolino <matheus.virgolino.abilio.da.silva@ccc.ufcg.edu.br> Co-authored-by: Manoel Netto <manoel.da.nobrega.eustaqueo.netto@ccc.ufcg.edu.br> Co-authored-by: Pedro <pedroalmeida1896@gmail.com> Co-authored-by: Lucaslg7 <lucasmoizinholg7@gmail.com> Co-authored-by: RailtonDantas <railtondantas.code@gmail.com> Co-authored-by: João Pereira <joao.pereira.de.oliveira@ccc.ufcg.edu.br>
1 parent da74c78 commit c65c833

File tree

3 files changed

+82
-1
lines changed

3 files changed

+82
-1
lines changed

src/whoosh/matching/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
NullMatcher,
4949
NullMatcherClass,
5050
ReadTooFar,
51+
SkipListMatcher,
5152
)
5253
from whoosh.matching.wrappers import (
5354
ConstantScoreWrapperMatcher,

src/whoosh/matching/mcore.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@
5151

5252
from abc import abstractmethod
5353
from itertools import repeat
54-
54+
from whoosh.matching.skiplist import SkipList
55+
from bisect import bisect_left
5556
# Exceptions
5657

5758

@@ -572,6 +573,22 @@ def score(self):
572573
else:
573574
return self.weight()
574575

576+
class SkipListMatcher(ListMatcher):
    """A ``ListMatcher`` whose ``skip_to`` consults a ``SkipList`` of the IDs.

    The constructor builds a ``SkipList`` over the same ``ids`` sequence that
    is handed to the parent class. ``skip_to`` asks that skip list for the
    first document ID that is greater than or equal to the requested one and
    then moves the matcher's cursor to it.

    NOTE(review): both the skip list and ``bisect_left`` assume ``ids`` is in
    ascending order -- confirm that every caller guarantees this.
    """

    def __init__(self, ids, *args, **kwargs):
        """Initialise the parent matcher and index ``ids`` in a skip list.

        :param ids: the document IDs to match; passed unchanged to both the
            parent constructor and ``SkipList``.
        :param args: any further positional arguments, forwarded untouched to
            the parent constructor. Accepting them keeps this subclass
            constructible in the same way as its parent (presumably needed by
            inherited code that re-creates the matcher positionally, such as
            ``copy()`` -- TODO confirm against ``ListMatcher``).
        :param kwargs: keyword arguments forwarded to the parent constructor.
        """
        super().__init__(ids, *args, **kwargs)
        self._skiplist = SkipList(ids)

    def skip_to(self, id):
        """Advance the matcher to the first document ID ``>= id``.

        Does nothing when ``id`` is smaller than the current ID, so the
        matcher never moves backwards.

        :param id: the document ID to move to.
        :raises ReadTooFar: if the matcher is no longer active.
        """
        if not self.is_active():
            raise ReadTooFar
        if id < self.id():
            # Target lies behind the current position: stay where we are.
            return

        # First node whose doc_id is >= id, or None when every ID is smaller.
        node = self._skiplist.skip_to(id)
        if node is None:
            # Nothing left to match: park the cursor one past the last ID.
            self._i = len(self._ids)
        else:
            # Translate the node back into a list index, searching only from
            # the current cursor onwards.
            # NOTE(review): for ascending ids this yields the same index as
            # bisect_left(self._ids, id, self._i), so the skip-list lookup
            # above adds work without changing the result.
            self._i = bisect_left(self._ids, node.doc_id, self._i)
575592

576593
# Term/vector leaf posting matcher middleware
577594

src/whoosh/matching/skiplist.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import random
2+
3+
class SkipNode:
    """A single skip-list element: a document ID and its forward links."""

    # Only these two attributes exist; instances carry no __dict__.
    __slots__ = ("doc_id", "forward")

    def __init__(self, doc_id, level):
        """Create a node that participates in levels ``0`` through ``level``.

        :param doc_id: the value stored in this node.
        :param level: the highest level index of the node; ``forward`` gets
            ``level + 1`` slots, all initially ``None``.
        """
        link_count = level + 1
        self.forward = [None for _ in range(link_count)]
        self.doc_id = doc_id
9+
10+
class SkipList:
    """A read-only skip list built once from a sequence of document IDs.

    Each ID is appended after the previously inserted one at every level it
    takes part in, without searching for its position. The structure is
    therefore only ordered if ``ids`` is supplied in ascending order.
    """

    def __init__(self, ids, max_level=16, p=0.5):
        """Build the skip list.

        :param ids: the document IDs to index; must support ``len()`` and
            iteration.
        :param max_level: the highest level index any node may reach.
        :param p: probability threshold used when promoting a node by one
            more level.
        """
        self.max_level = max_level
        self.p = p
        # Sentinel in front of every real node, linked at all levels.
        self.header = SkipNode(-1, max_level)
        # Highest level index currently used by any real node.
        self.level = 0
        self.size = len(ids)
        self._build(ids)

    def _random_level(self):
        """Draw a level index for a new node, capped at ``max_level``."""
        height = 0
        # The random draw happens before the cap is checked, exactly once
        # per loop test.
        while random.random() < self.p:
            if height >= self.max_level:
                break
            height += 1
        return height

    def _build(self, ids):
        """Append every ID in ``ids`` to the list, in iteration order."""
        # tails[k] is the last node linked so far at level k.
        tails = [self.header] * (self.max_level + 1)
        for doc_id in ids:
            height = self._random_level()
            self.level = max(self.level, height)

            fresh = SkipNode(doc_id, height)
            for depth in range(height + 1):
                tail = tails[depth]
                fresh.forward[depth] = tail.forward[depth]
                tail.forward[depth] = fresh
                tails[depth] = fresh

    def skip_to(self, target_id):
        """Return the first node whose ``doc_id`` is ``>= target_id``.

        :param target_id: the document ID to look for.
        :returns: the matching ``SkipNode``, or ``None`` if every stored ID
            is smaller than ``target_id``.
        """
        node = self.header
        # Descend from the highest used level to level 0, moving right while
        # the next ID is still below the target.
        for depth in reversed(range(self.level + 1)):
            following = node.forward[depth]
            while following is not None and following.doc_id < target_id:
                node = following
                following = node.forward[depth]

        # ``node`` is the last element below the target; its level-0
        # successor is the answer.
        return node.forward[0]

    def __len__(self):
        """Return the number of IDs the list was built from."""
        return self.size

    def __iter__(self):
        """Yield the stored document IDs by walking the level-0 links."""
        cursor = self.header
        while True:
            cursor = cursor.forward[0]
            if cursor is None:
                return
            yield cursor.doc_id

    def __contains__(self, doc_id):
        """Report whether ``doc_id`` is stored in the list."""
        found = self.skip_to(doc_id)
        if found is None:
            return False
        return found.doc_id == doc_id
63+

0 commit comments

Comments
 (0)