Skip to content

Commit f273246

Browse files
committed
Modify CyDifflib the same way we modified difflib
Signed-off-by: Jono Yang <[email protected]>
1 parent e411b8b commit f273246

File tree

1 file changed

+202
-0
lines changed

1 file changed

+202
-0
lines changed

src/licensedcode/seq.pyx

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
# Based off of https://github.com/rapidfuzz/CyDifflib/blob/ef0d1cb49abbdd551e9a27065032fc5317c731fd/src/cydifflib/_initialize.pyx
2+
3+
from collections import namedtuple as _namedtuple
4+
5+
cimport cython
6+
from libcpp.vector cimport vector
7+
from libcpp.algorithm cimport fill, sort as cpp_sort
8+
9+
10+
# Python-level result triple returned to callers: start in a, start in b, length.
Match = _namedtuple('Match', ('a', 'b', 'size'))
11+
12+
13+
ctypedef struct MatchingBlockQueueElem:
    # One pending unit of work for match_blocks: the window a[alo:ahi] paired
    # with the window b[blo:bhi] that still has to be searched.
    # Field order matters: instances are built positionally.
    Py_ssize_t alo, ahi
    Py_ssize_t blo, bhi
18+
19+
20+
ctypedef struct CMatch:
    # C-level counterpart of the Match namedtuple: start position in a, start
    # position in b and length of the matching block.
    # Field order matters: instances are built positionally.
    Py_ssize_t a, b, size
24+
25+
26+
cdef int CMatch_sorter(const CMatch& lhs, const CMatch& rhs):
    """
    Return non-zero when `lhs` orders strictly before `rhs`.

    Blocks are compared lexicographically on (a, b, size). This is the
    comparator handed to the C++ sort in match_blocks.
    """
    if lhs.a == rhs.a:
        if lhs.b == rhs.b:
            return lhs.size < rhs.size
        return lhs.b < rhs.b
    return lhs.a < rhs.a
32+
33+
34+
cdef CMatch find_longest_match(a, b, Py_ssize_t alo, Py_ssize_t ahi, Py_ssize_t blo, Py_ssize_t bhi, b2j, Py_ssize_t len_good, matchables) except *:
    """
    Find longest matching block of a and b in a[alo:ahi] and b[blo:bhi].

    `b2j` is a mapping of b high token ids -> list of positions in b.
    `len_good` is such that token ids smaller than `len_good` are treated as
    good, non-junk tokens. `matchables` is a set of matchable positions in a.
    Positions absent from this set are ignored.

    Return a CMatch (i, j, k) where:
        "i" is the start in "a"
        "j" is the start in "b"
        "k" is the size of the match

    and such that a[i:i+k] is equal to b[j:j+k], where
        alo <= i <= i+k <= ahi
        blo <= j <= j+k <= bhi

    and for all (i',j',k') matchable token positions meeting those conditions,
        k >= k'
        i <= i'
        and if i == i', j <= j'

    In other words, of all maximal matching blocks, return one that starts
    earliest in a, and of all those maximal matching blocks that start earliest
    in a, return the one that starts earliest in b.

    First the longest matching block (aka contiguous substring) is determined
    where no junk element appears in the block. Then that block is extended as
    far as possible by matching other tokens, including junk, on both sides.
    So junk only ever appears in the result when it is adjacent to a junk-free
    match; a block is never started from junk.

    If no blocks match, return (alo, blo, 0).

    Any Python exception raised while indexing or comparing is propagated to
    the caller (`except *`).
    """
    cdef Py_ssize_t besti, bestj, bestsize
    cdef Py_ssize_t i, j, k
    cdef vector[Py_ssize_t] j2len
    cdef vector[Py_ssize_t] newj2len

    besti, bestj, bestsize = alo, blo, 0

    # An empty window on either side cannot hold a match. Returning early also
    # keeps the iterator arithmetic below inside the buffers.
    if ahi <= alo or bhi <= blo:
        return CMatch(alo, blo, 0)

    # Only slots blo..bhi of the two rows are ever read or written, so
    # bhi + 1 entries are enough; resize() zero-initialises the new entries.
    j2len.resize(bhi + 1)
    newj2len.resize(bhi + 1)

    # find longest junk-free match
    # Rows are shifted by one so that no negative index is needed: during an
    # iteration of the loop, j2len[j] = length of the longest junk-free match
    # ending with a[i-1] and b[j-1].
    nothing = []
    for i in range(alo, ahi):
        # we cannot do LCS on junk or non matchable
        cura = a[i]
        if cura < len_good and i in matchables:
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            for j in b2j.get(cura, nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    # NOTE(review): the early exit assumes each posting list
                    # is sorted in ascending order -- confirm with the index
                    # builder.
                    break
                k = j2len[j] + 1
                newj2len[j + 1] = k
                if k > bestsize:
                    besti = i - k + 1
                    bestj = j - k + 1
                    bestsize = k

        # Rotate the rows on every position of a, including skipped ones, so
        # that a run can never bridge a junk or non-matchable token.
        j2len.swap(newj2len)
        fill(newj2len.begin() + blo, newj2len.begin() + bhi + 1, 0)

    # Both rows are locals released on return: no final clean-up is needed.
    return extend_match(besti, bestj, bestsize, a, b, alo, ahi, blo, bhi, matchables)
107+
108+
109+
cdef CMatch extend_match(Py_ssize_t besti, Py_ssize_t bestj, Py_ssize_t bestsize, a, b, Py_ssize_t alo, Py_ssize_t ahi, Py_ssize_t blo, Py_ssize_t bhi, matchables) except *:
    """
    Extend a match identified by (besti, bestj, bestsize) with any matching
    tokens on each end. Return a new CMatch.

    The match is grown to the left while it stays inside a[alo:] and b[blo:],
    and then to the right while it stays inside a[:ahi] and b[:bhi]. A token
    is absorbed when the two tokens are equal and its position in `a` is in
    `matchables`. An empty match (bestsize == 0) is returned unchanged.

    Any Python exception raised while indexing or comparing is propagated to
    the caller (`except *`), consistent with find_longest_match.
    """
    if bestsize:
        # grow towards the start of both windows
        while (besti > alo and bestj > blo
               and a[besti - 1] == b[bestj - 1]
               and (besti - 1) in matchables):
            besti -= 1
            bestj -= 1
            bestsize += 1

        # grow towards the end of both windows
        while (besti + bestsize < ahi and bestj + bestsize < bhi
               and a[besti + bestsize] == b[bestj + bestsize]
               and (besti + bestsize) in matchables):
            bestsize += 1

    return CMatch(besti, bestj, bestsize)
130+
131+
132+
def match_blocks(a, b, Py_ssize_t a_start, Py_ssize_t a_end, b2j, Py_ssize_t len_good, matchables, *args, **kwargs):
    """
    Return a list of Match triples describing the subsequences of `a` that
    match in `b`, looking at `a` from position `a_start` up to position
    `a_end` and at the whole of `b`.

    `b2j` is a mapping of b "high" token ids -> list of positions in b, e.g. a
    posting list.

    `len_good` is such that token ids smaller than `len_good` are treated as
    important, non-junk tokens.

    `matchables` is a set of matchable positions. Positions absent from this
    set are ignored.

    Extra positional and keyword arguments are accepted and ignored.

    Each triple is of the form (i, j, n), and means that a[i:i+n] == b[j:j+n].
    The triples are monotonically increasing in i and in j. Adjacent triples
    never describe adjacent equal blocks: such blocks are merged into a single
    block.
    """
    cdef Py_ssize_t lo_a, hi_a, lo_b, hi_b
    cdef Py_ssize_t start_a, start_b, length
    cdef Py_ssize_t run_a, run_b, run_len
    cdef MatchingBlockQueueElem span
    cdef CMatch found, block
    cdef vector[MatchingBlockQueueElem] pending
    cdef vector[CMatch] blocks

    # Iterative divide and conquer: `pending` is a stack of window pairs that
    # still need searching. Blocks are collected in discovery order and sorted
    # once every window has been processed.
    pending.push_back(MatchingBlockQueueElem(a_start, a_end, 0, len(b)))
    while not pending.empty():
        span = pending.back()
        pending.pop_back()
        lo_a, hi_a, lo_b, hi_b = span.alo, span.ahi, span.blo, span.bhi

        found = find_longest_match(a, b, lo_a, hi_a, lo_b, hi_b, b2j, len_good, matchables)
        start_a, start_b, length = found.a, found.b, found.size
        if not length:
            # nothing matches inside this pair of windows
            continue

        blocks.push_back(found)
        # a[lo_a:start_a] vs b[lo_b:start_b] is still unknown
        if lo_a < start_a and lo_b < start_b:
            pending.push_back(MatchingBlockQueueElem(lo_a, start_a, lo_b, start_b))
        # a[start_a+length:hi_a] vs b[start_b+length:hi_b] is still unknown
        if start_a + length < hi_a and start_b + length < hi_b:
            pending.push_back(MatchingBlockQueueElem(start_a + length, hi_a, start_b + length, hi_b))

    cpp_sort(blocks.begin(), blocks.end(), &CMatch_sorter)

    # Merge runs of blocks that touch each other in both a and b. The current
    # run starts as an empty dummy (run_len == 0) that is never emitted.
    merged = []
    run_a = run_b = run_len = 0
    for block in blocks:
        if run_a + run_len == block.a and run_b + run_len == block.b:
            # the block continues the current run: only the length grows
            run_len += block.size
            continue
        if run_len:
            merged.append(Match(run_a, run_b, run_len))
        run_a, run_b, run_len = block.a, block.b, block.size
    if run_len:
        merged.append(Match(run_a, run_b, run_len))

    return merged

0 commit comments

Comments
 (0)