Skip to content

Commit 6e14d1d

Browse files
committed
add docs
1 parent d900e41 commit 6e14d1d

File tree

1 file changed

+72
-4
lines changed

1 file changed

+72
-4
lines changed

sequence_matcher.mbt

Lines changed: 72 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
///|
2+
/// A contiguous matching block between two sequences.
3+
///
4+
/// The coordinates are half-open intervals:
5+
/// - In the first sequence: `[first_start, first_start + size)`
6+
/// - In the second sequence: `[second_start, second_start + size)`
27
#valtype
38
priv struct Match {
49
first_start : Int
@@ -7,11 +12,18 @@ priv struct Match {
712
}
813

914
///|
15+
/// Build a `Match` value.
1016
fn Match::new(first_start : Int, second_start : Int, size : Int) -> Match {
1117
Match::{ first_start, second_start, size }
1218
}
1319

1420
///|
21+
/// Operation tag used in an edit script.
22+
///
23+
/// - `Equal`: unchanged range in both sequences.
24+
/// - `Insert`: range only exists in the second sequence.
25+
/// - `Delete`: range only exists in the first sequence.
26+
/// - `Replace`: range changed from first to second.
1527
priv enum OpTag {
1628
Equal
1729
Insert
@@ -20,6 +32,11 @@ priv enum OpTag {
2032
} derive(Eq)
2133

2234
///|
35+
/// One edit operation over two half-open intervals.
36+
///
37+
/// The operation always compares:
38+
/// - first sequence range: `[first_start, first_end)`
39+
/// - second sequence range: `[second_start, second_end)`
2340
priv struct OpCode {
2441
tag : OpTag
2542
first_start : Int
@@ -29,6 +46,7 @@ priv struct OpCode {
2946
}
3047

3148
///|
49+
/// Build an `OpCode` value.
3250
fn OpCode::new(
3351
tag : OpTag,
3452
first_start : Int,
@@ -40,13 +58,21 @@ fn OpCode::new(
4058
}
4159

4260
///|
61+
/// Sequence matcher used to compute LCS-like matching blocks and diff opcodes.
62+
///
63+
/// Internal cache `second_sequence_elements` maps each element in the second
64+
/// sequence to all its indices, which is reused by longest-match search.
4365
priv struct SequenceMatcher[T] {
4466
mut first_sequence : Array[T]
4567
mut second_sequence : Array[T]
4668
mut second_sequence_elements : @hashmap.HashMap[T, Array[Int]]
4769
}
4870

4971
///|
72+
/// Construct a matcher for two sequences.
73+
///
74+
/// `T` must support `Eq + Hash`, because matching uses hash-map based index
75+
/// lookups in `second_sequence_elements`.
5076
fn[T : Eq + Hash] SequenceMatcher::new(
5177
first_sequence : Array[T],
5278
second_sequence : Array[T],
@@ -61,6 +87,10 @@ fn[T : Eq + Hash] SequenceMatcher::new(
6187
}
6288

6389
///|
90+
/// Replace both sequences.
91+
///
92+
/// This updates the first sequence directly, then rebuilds the second-sequence
93+
/// index cache through `set_second_seq`.
6494
fn[T : Eq + Hash] SequenceMatcher::set_seqs(
6595
self : SequenceMatcher[T],
6696
first_sequence : Array[T],
@@ -71,6 +101,10 @@ fn[T : Eq + Hash] SequenceMatcher::set_seqs(
71101
}
72102

73103
///|
104+
/// Replace only the first sequence.
105+
///
106+
/// No cache rebuild is needed because the cache is keyed by the second
107+
/// sequence.
74108
fn[T] SequenceMatcher::set_first_seq(
75109
self : SequenceMatcher[T],
76110
sequence : Array[T],
@@ -79,6 +113,7 @@ fn[T] SequenceMatcher::set_first_seq(
79113
}
80114

81115
///|
116+
/// Replace only the second sequence and rebuild its index cache.
82117
fn[T : Eq + Hash] SequenceMatcher::set_second_seq(
83118
self : SequenceMatcher[T],
84119
sequence : Array[T],
@@ -88,17 +123,23 @@ fn[T : Eq + Hash] SequenceMatcher::set_second_seq(
88123
}
89124

90125
///|
126+
/// Build an index map from second-sequence element to all its positions.
127+
///
128+
/// For long sequences, very frequent elements are filtered out to avoid
129+
/// quadratic blowups in candidate expansion (`popular elements` optimization).
91130
fn[T : Eq + Hash] SequenceMatcher::chain_second_seq(
92131
self : SequenceMatcher[T],
93132
) -> Unit {
94133
let second_sequence = self.second_sequence
95134
let mut second_sequence_elements = @hashmap.HashMap::new()
96135
for i, item in second_sequence.iter2() {
136+
// Collect all indices where each element appears.
97137
let counter = second_sequence_elements.get_or_init(item, () => Array::new())
98138
counter.push(i)
99139
}
100140

101-
// keep popular elements in lookup table
141+
// Keep only non-popular elements in the lookup table.
142+
// Threshold follows difflib-style heuristic: frequency > len/100 + 1.
102143
let len = second_sequence.length()
103144
if len >= 200 {
104145
let test_len = (len.to_double() / 100.0).floor().to_int() + 1
@@ -117,6 +158,14 @@ fn[T : Eq + Hash] SequenceMatcher::chain_second_seq(
117158
}
118159

119160
///|
161+
/// Find the longest contiguous common block within two sub-ranges.
162+
///
163+
/// Search space is restricted to:
164+
/// - first sequence range `[first_start, first_end)`
165+
/// - second sequence range `[second_start, second_end)`
166+
///
167+
/// The dynamic-programming state `j2len` maps each `j` to the length of the
168+
/// common run ending at the previous `i` and that `j`; `new_j2len` is the row being built for the current `i`.
120169
fn[T : Eq + Hash] SequenceMatcher::find_longest_match(
121170
self : SequenceMatcher[T],
122171
first_start : Int,
@@ -131,18 +180,24 @@ fn[T : Eq + Hash] SequenceMatcher::find_longest_match(
131180
let mut best_j = second_start
132181
let mut best_size = 0
133182
let mut j2len = @hashmap.HashMap::new()
183+
184+
// DP over diagonals: if first[i] == second[j],
185+
// new_j2len[j] = j2len[j - 1] + 1.
134186
for i = first_start; i < first_end; i = i + 1 {
135187
let item = first_sequence[i]
136188
let new_j2len = @hashmap.HashMap::new()
137189
match second_sequence_elements.get(item) {
138190
Some(indexes) =>
139191
for j in indexes {
192+
// Skip candidates before the second-range window; stop at the first one past it.
140193
if j < second_start {
141194
continue
142195
}
143196
if j >= second_end {
144197
break
145198
}
199+
200+
// Extend previous diagonal match by one.
146201
let mut size = match j2len.get(j - 1) {
147202
Some(k) if j > 0 => k
148203
_ => 0
@@ -160,7 +215,9 @@ fn[T : Eq + Hash] SequenceMatcher::find_longest_match(
160215
j2len = new_j2len
161216
}
162217

163-
// Extend the match backwards and forwards
218+
// Extend around the best core match to absorb adjacent equal elements.
219+
// Two passes are used so a backward extension discovered in pass 1 can
220+
// enable additional forward extension in pass 2.
164221

165222
for _ in 0..<2 {
166223
while best_i > first_start &&
@@ -181,6 +238,13 @@ fn[T : Eq + Hash] SequenceMatcher::find_longest_match(
181238
}
182239

183240
///|
241+
/// Compute edit operations that transform the first sequence into the second.
242+
///
243+
/// The algorithm:
244+
/// 1. Repeatedly find longest matches inside unmatched windows.
245+
/// 2. Sort and merge adjacent matches.
246+
/// 3. Convert gaps between matches into `Insert/Delete/Replace` operations.
247+
/// 4. Emit `Equal` operations for each merged match.
184248
fn[T : Eq + Hash] SequenceMatcher::get_opcodes(
185249
self : SequenceMatcher[T],
186250
) -> Array[OpCode] {
@@ -189,6 +253,8 @@ fn[T : Eq + Hash] SequenceMatcher::get_opcodes(
189253
let matches = Array::new()
190254
let queue = Array::new()
191255
queue.push((0, first_length, 0, second_length))
256+
257+
// Split problem recursively (implemented with an explicit stack/queue).
192258
while !queue.is_empty() {
193259
let (first_start, first_end, second_start, second_end) = queue.unsafe_pop()
194260
let m = self.find_longest_match(
@@ -213,7 +279,7 @@ fn[T : Eq + Hash] SequenceMatcher::get_opcodes(
213279
}
214280
}
215281

216-
// Sort matches
282+
// Order matches by position in the first sequence so contiguous blocks are neighbours before merging.
217283
matches.sort_by(fn(a, b) {
218284
if a.first_start < b.first_start {
219285
-1
@@ -228,7 +294,7 @@ fn[T : Eq + Hash] SequenceMatcher::get_opcodes(
228294
}
229295
})
230296

231-
// Merge adjacent matches
297+
// Merge consecutive blocks that are contiguous in both sequences.
232298
let mut first_start = 0
233299
let mut second_start = 0
234300
let mut size = 0
@@ -253,6 +319,8 @@ fn[T : Eq + Hash] SequenceMatcher::get_opcodes(
253319
let opcodes = Array::new()
254320
let mut i = 0
255321
let mut j = 0
322+
323+
// Emit gap operations, then emit the equal block itself.
256324
for m in non_adjacent {
257325
let tag = if i < m.first_start && j < m.second_start {
258326
Some(OpTag::Replace)

0 commit comments

Comments
 (0)