|
| 1 | +# Based off of https://github.com/rapidfuzz/CyDifflib/blob/ef0d1cb49abbdd551e9a27065032fc5317c731fd/src/cydifflib/_initialize.pyx |
| 2 | + |
| 3 | +from collections import namedtuple as _namedtuple |
| 4 | + |
| 5 | +cimport cython |
| 6 | +from libcpp.vector cimport vector |
| 7 | +from libcpp.algorithm cimport fill, sort as cpp_sort |
| 8 | + |
| 9 | + |
# Python-level result triple handed back by match_blocks: `a` and `b` are the
# start positions of a matching block in each sequence, `size` is its length.
Match = _namedtuple('Match', 'a b size')
| 11 | + |
| 12 | + |
# Work item used by match_blocks: the pair of half-open ranges a[alo:ahi] and
# b[blo:bhi] that still has to be searched for a longest match.
ctypedef struct MatchingBlockQueueElem:
    Py_ssize_t alo
    Py_ssize_t ahi
    Py_ssize_t blo
    Py_ssize_t bhi
| 18 | + |
| 19 | + |
# C-level match record with the same three fields as the Match namedtuple:
# start position in a, start position in b, and length of the block.
ctypedef struct CMatch:
    Py_ssize_t a
    Py_ssize_t b
    Py_ssize_t size
| 24 | + |
| 25 | + |
cdef int CMatch_sorter(const CMatch& lhs, const CMatch& rhs):
    """
    Return non-zero when `lhs` orders strictly before `rhs`.

    The comparison is lexicographic on the `a`, `b` and `size` fields, in that
    order. It is passed to the C++ sort in match_blocks as its "less than"
    predicate.
    """
    if lhs.a == rhs.a:
        if lhs.b == rhs.b:
            # Same start in both sequences: the shorter block goes first.
            return lhs.size < rhs.size
        return lhs.b < rhs.b
    return lhs.a < rhs.a
| 32 | + |
| 33 | + |
cdef CMatch find_longest_match(a, b, Py_ssize_t alo, Py_ssize_t ahi, Py_ssize_t blo, Py_ssize_t bhi, b2j, Py_ssize_t len_good, matchables) except *:
    """
    Find longest matching block of a and b in a[alo:ahi] and b[blo:bhi].

    `b2j` is a mapping of b high token ids -> list of positions in b.
    `len_good` is such that token ids smaller than `len_good` are treated as
    good, non-junk tokens. `matchables` is a set of matchable positions in a.
    Positions absent from this set are ignored.

    Return a CMatch (i, j, k) where:
        "i" is the start in "a"
        "j" is the start in "b"
        "k" is the size of the match

    and such that a[i:i+k] is equal to b[j:j+k], where
        alo <= i <= i+k <= ahi
        blo <= j <= j+k <= bhi

    and for all (i',j',k') matchable token positions meeting those conditions,
        k >= k'
        i <= i'
        and if i == i', j <= j'

    In other words, of all maximal matching blocks, return one that starts
    earliest in a, and of all those maximal matching blocks that start earliest
    in a, return the one that starts earliest in b.

    First the longest matching block (aka contiguous substring) is determined
    where no junk element appears in the block. Then that block is extended as
    far as possible by matching other tokens including junk on both sides. So
    the resulting block is never anchored on junk, although junk that is
    adjacent to it and identical in both sequences may be included.

    If no blocks match, return (alo, blo, 0).

    Python exceptions raised while indexing `a`, querying `b2j` or testing
    `matchables` propagate to the caller.
    """
    cdef Py_ssize_t besti = alo
    cdef Py_ssize_t bestj = blo
    cdef Py_ssize_t bestsize = 0
    cdef Py_ssize_t i, j, k
    cdef vector[Py_ssize_t] j2len
    cdef vector[Py_ssize_t] newj2len
    # One extra slot because lengths are stored one position to the right.
    cdef size_t bufsize = <size_t>max(len(a), len(b)) + 1

    # resize() value-initialises, so both rows start out as all zeros.
    j2len.resize(bufsize)
    newj2len.resize(bufsize)

    # Find the longest junk-free match.
    # While processing a[i], j2len[j + 1] is the length of the longest
    # junk-free match ending with a[i-1] and b[j]. Storing lengths shifted by
    # one slot means j2len[j] is the run ending at b[j-1], and j2len[0] is a
    # permanent zero standing for "no previous token".
    nothing = []
    for i in range(alo, ahi):
        # we cannot do LCS on junk or non matchable
        cura = a[i]
        if cura < len_good and i in matchables:
            # Look at all instances of a[i] in b.
            for j in b2j.get(cura, nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    # NOTE(review): stopping here assumes each b2j position
                    # list is sorted ascending -- confirm against the builder.
                    break
                k = j2len[j] + 1
                newj2len[j + 1] = k
                if k > bestsize:
                    besti = i - k + 1
                    bestj = j - k + 1
                    bestsize = k

        # Advance to the next row on every i, including skipped ones: a junk
        # or non-matchable a[i] leaves newj2len all zero, which resets the
        # running lengths so that a match can never span that position.
        j2len.swap(newj2len)
        fill(newj2len.begin() + blo, newj2len.begin() + bhi + 1, 0)

    # Both vectors are locals released on return, so no final clearing is
    # needed before handing the seed block over for extension.
    return extend_match(besti, bestj, bestsize, a, b, alo, ahi, blo, bhi, matchables)
| 107 | + |
| 108 | + |
cdef CMatch extend_match(Py_ssize_t besti, Py_ssize_t bestj, Py_ssize_t bestsize, a, b, Py_ssize_t alo, Py_ssize_t ahi, Py_ssize_t blo, Py_ssize_t bhi, matchables) except *:
    """
    Extend a match identified by (besti, bestj, bestsize) with any matching
    tokens on each end. Return a new CMatch.

    The block grows to the left while it stays inside a[alo:] and b[blo:], the
    tokens just before it are equal in `a` and `b`, and the position in `a` is
    in `matchables`. It then grows to the right under the same conditions,
    bounded by `ahi` and `bhi`. Token ids are only compared for equality here,
    so junk tokens may be absorbed into the block.

    An empty seed (`bestsize` == 0) is returned unchanged.

    Python exceptions raised by indexing, comparing tokens or testing
    `matchables` propagate to the caller instead of being swallowed.
    """
    if bestsize:
        # Grow leftwards, one token at a time.
        while (besti > alo and bestj > blo
               and a[besti - 1] == b[bestj - 1]
               and (besti - 1) in matchables):
            besti -= 1
            bestj -= 1
            bestsize += 1

        # Grow rightwards; the start positions stay fixed.
        while (besti + bestsize < ahi and bestj + bestsize < bhi
               and a[besti + bestsize] == b[bestj + bestsize]
               and (besti + bestsize) in matchables):
            bestsize += 1

    return CMatch(besti, bestj, bestsize)
| 130 | + |
| 131 | + |
def match_blocks(a, b, Py_ssize_t a_start, Py_ssize_t a_end, b2j, Py_ssize_t len_good, matchables, *args, **kwargs):
    """
    Return a list of Match triples describing the matching subsequences of
    `a` in `b`, looking at `a` from position `a_start` up to position `a_end`
    and at the whole of `b`.

    `b2j` is a mapping of b "high" token ids -> list of positions in b, e.g. a
    posting list.

    `len_good` is such that token ids smaller than `len_good` are treated as
    important, non-junk tokens.

    `matchables` is a set of matchable positions. Positions absent from this set
    are ignored.

    Extra positional and keyword arguments are accepted and ignored.

    Each triple is of the form (i, j, n), and means that a[i:i+n] == b[j:j+n].
    The triples are monotonically increasing in i and in j. Adjacent triples
    never describe adjacent equal blocks: such blocks are merged into a single
    triple.
    """
    cdef Py_ssize_t alo, ahi, blo, bhi
    cdef Py_ssize_t start_a, start_b, size
    cdef Py_ssize_t run_a, run_b, run_size
    cdef MatchingBlockQueueElem region
    cdef CMatch longest
    cdef CMatch block
    cdef vector[MatchingBlockQueueElem] pending
    cdef vector[CMatch] found

    # Non-recursive divide and conquer: `pending` holds the regions still to
    # be searched and is consumed from the back. Matches are collected in
    # discovery order and sorted once all regions are exhausted.
    pending.push_back(MatchingBlockQueueElem(a_start, a_end, 0, len(b)))
    while not pending.empty():
        region = pending.back()
        alo, ahi, blo, bhi = region.alo, region.ahi, region.blo, region.bhi
        pending.pop_back()

        longest = find_longest_match(a, b, alo, ahi, blo, bhi, b2j, len_good, matchables)
        start_a, start_b, size = longest.a, longest.b, longest.size
        if not size:
            # Nothing matches in this region, so there is nothing to split.
            continue

        # a[alo:start_a] vs b[blo:start_b] is still unknown,
        # a[start_a:start_a+size] equals b[start_b:start_b+size],
        # a[start_a+size:ahi] vs b[start_b+size:bhi] is still unknown.
        found.push_back(longest)
        if alo < start_a and blo < start_b:
            # Unprocessed tokens remain on the left of the match.
            pending.push_back(MatchingBlockQueueElem(alo, start_a, blo, start_b))
        if start_a + size < ahi and start_b + size < bhi:
            # Unprocessed tokens remain on the right of the match.
            pending.push_back(MatchingBlockQueueElem(start_a + size, ahi, start_b + size, bhi))

    cpp_sort(found.begin(), found.end(), &CMatch_sorter)

    # Merge blocks that touch in both sequences. The current run starts as an
    # empty dummy (run_size == 0), which is never emitted.
    merged = []
    run_a = run_b = run_size = 0
    for block in found:
        if run_a + run_size == block.a and run_b + run_size == block.b:
            # Adjacent to the current run: absorb it by lengthening the run.
            run_size += block.size
            continue
        # Not adjacent: flush the current run and start a new one here.
        if run_size:
            merged.append(Match(run_a, run_b, run_size))
        run_a, run_b, run_size = block.a, block.b, block.size
    if run_size:
        merged.append(Match(run_a, run_b, run_size))

    return merged
0 commit comments