Skip to content

Commit 92f7c29

Browse files
committed
[diff/patience] implemented patience diff
1 parent c26d3be commit 92f7c29

File tree

6 files changed

+382
-3
lines changed

6 files changed

+382
-3
lines changed

diff/patience.js

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
/**
2+
* A very simple diff algorithm. Slightly adapted to support splitting at different stages (e.g.
3+
* first diff lines, then diff words)
4+
*
5+
* https://bramcohen.livejournal.com/73318.html
6+
*
7+
* @experimental This API will likely change.
8+
*/
9+
10+
import * as map from '../map.js'
11+
import * as math from '../math.js'
12+
import * as array from '../array.js'
13+
14+
/**
 * Implementation of patience diff. Expects that content is pre-split (e.g. by newline).
 *
 * The entries shared at the start and at the end of both inputs are stripped first, so only the
 * differing middle part is handed to the matcher. The reported `index` values refer to positions
 * in `as`.
 *
 * @param {Array<string>} as
 * @param {Array<string>} bs
 * @return {Array<{ index: number, remove: Array<string>, insert: Array<string>}>} changeset @todo should use delta instead
 */
export const diff = (as, bs) => {
  const { middleAs, middleBs, commonPrefix } = removeCommonPrefixAndSuffix(as, bs)
  // the matcher only sees the middle part, so the stripped prefix length is added back to each index
  return lcs(middleAs, middleBs, commonPrefix)
}
29+
30+
31+
/**
 * Diff two strings after splitting them into tokens.
 *
 * A string separator is matched literally (regexp metacharacters are escaped); a RegExp separator
 * is used as given. Separators are kept as tokens of their own, so the tokens of each side
 * concatenate back to the original string. Because of that, `index` is an exact character offset
 * into `a`, and `remove` / `insert` can be applied as a plain string splice.
 *
 * @param {string} a
 * @param {string} b
 * @param {RegExp|string} _regexp
 * @return {Array<{ index: number, remove: string, insert: string }>}
 */
export const diffSplitBy = (a, b, _regexp) => {
  const regexp = typeof _regexp === 'string'
    ? new RegExp(_regexp.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g')
    : _regexp
  // An empty string separator only marks split points (character-wise split); there is nothing to keep.
  const includeSeparator = typeof _regexp !== 'string' || _regexp.length > 0
  const as = splitByRegexp(a, regexp, includeSeparator)
  const bs = splitByRegexp(b, regexp, includeSeparator)
  const changes = diff(as, bs)
  let prevSplitIndex = 0
  let prevStringIndex = 0
  return changes.map(change => {
    // translate the token index into a character offset by summing up the skipped tokens
    for (; prevSplitIndex < change.index; prevSplitIndex++) {
      prevStringIndex += as[prevSplitIndex].length
    }
    return {
      index: prevStringIndex,
      remove: change.remove.join(''),
      insert: change.insert.join('')
    }
  })
}
56+
57+
/**
 * Sensible default for diffing strings using patience (it's fast though).
 *
 * Works in two stages: the content is first diffed split by newline. Each resulting change is then
 * diffed again on a finer level, where tokens are alphanumeric runs, the sequence ". " and single
 * punctuation / bracket / space characters.
 *
 * @param {string} a
 * @param {string} b
 * @return {Array<{ index: number, remove: string, insert: string }>} changes, `index` is relative to `a`
 */
export const diffAuto = (a, b) =>
  diffSplitBy(a, b, '\n').flatMap(d =>
    diffSplitBy(d.remove, d.insert, /\.\ |[a-zA-Z0-9]+|[\.\ \(\)\[\]\,\;\{\}]/g).map(dd => ({
      insert: dd.insert,
      remove: dd.remove,
      // the inner diff is relative to the removed part, so shift it by the position of the outer change
      index: dd.index + d.index
    }))
  )
74+
75+
/**
 * Strips the entries that both arrays share at their start and at their end.
 *
 * @param {Array<string>} as
 * @param {Array<string>} bs
 * @return {{ middleAs: Array<string>, middleBs: Array<string>, commonPrefix: number, commonSuffix: number }}
 *   the remaining middle parts and the number of entries stripped from each end
 */
const removeCommonPrefixAndSuffix = (as, bs) => {
  const commonLen = math.min(as.length, bs.length)
  // match start
  let commonPrefix = 0
  while (commonPrefix < commonLen && as[commonPrefix] === bs[commonPrefix]) {
    commonPrefix++
  }
  // match end - limited so that the suffix never overlaps the already matched prefix
  const maxSuffix = commonLen - commonPrefix
  let commonSuffix = 0
  while (commonSuffix < maxSuffix && as[as.length - 1 - commonSuffix] === bs[bs.length - 1 - commonSuffix]) {
    commonSuffix++
  }
  return {
    middleAs: as.slice(commonPrefix, as.length - commonSuffix),
    middleBs: bs.slice(commonPrefix, bs.length - commonSuffix),
    commonPrefix,
    commonSuffix
  }
}
93+
94+
/**
 * Splits string by regex and returns all strings as an array. The matched parts are also returned
 * if `includeSeparator` is set.
 *
 * The result never contains empty strings. A regexp without the global flag is handled as if the
 * flag was set.
 *
 * @param {string} str
 * @param {RegExp} regexp
 * @param {boolean} includeSeparator
 * @return {Array<string>}
 */
const splitByRegexp = (str, regexp, includeSeparator) => {
  // String.prototype.matchAll throws a TypeError when the regexp is not global
  const globalRegexp = regexp.global ? regexp : new RegExp(regexp.source, regexp.flags + 'g')
  /**
   * @type {Array<string>}
   */
  const res = []
  let prevIndex = 0
  for (const m of str.matchAll(globalRegexp)) {
    const start = /** @type {number} */ (m.index)
    const matched = m[0]
    if (prevIndex < start) res.push(str.slice(prevIndex, start))
    // a zero-length match still acts as a split point, but must not produce an empty token
    if (includeSeparator && matched.length > 0) res.push(matched)
    prevIndex = start + matched.length
  }
  if (prevIndex < str.length) res.push(str.slice(prevIndex))
  return res
}
117+
118+
/**
 * An item may have multiple occurrences (not when matching unique entries). It also may have a
 * reference to the matching item of the other side (from as to bs).
 */
class Item {
  constructor () {
    /**
     * Positions at which this entry occurs in its source array.
     *
     * @type {Array<number>}
     */
    this.indexes = []
    /**
     * The matching item from the other side
     * @type {Item?}
     */
    this.match = null
    /**
     * For patience sort. Reference to the item that was on top of the previous pile when this item
     * was placed.
     *
     * @type {Item?}
     */
    this.ref = null
  }
}
141+
142+
/**
 * Groups the entries of `xs` by value. Each distinct value maps to an Item that collects all
 * positions at which the value occurs.
 *
 * @param {Array<string>} xs
 * @return {Map<string,Item>}
 */
const partition = xs => {
  /**
   * @type {Map<string,Item>}
   */
  const refs = map.create()
  xs.forEach((x, index) => {
    map.setIfUndefined(refs, x, () => new Item()).indexes.push(index)
  })
  return refs
}
155+
156+
/**
 * Find the longest common subsequence of items using patience sort.
 *
 * Only entries that occur exactly once in `as` and exactly once in `bs` are used as anchors. The
 * ranges between two consecutive anchors are reported as changes, after stripping the entries they
 * share at their start and end.
 *
 * @param {Array<string>} as
 * @param {Array<string>} bs
 * @param {number} indexAdjust added to every reported index
 * @return {Array<{ index: number, remove: Array<string>, insert: Array<string>}>}
 */
const lcs = (as, bs, indexAdjust) => {
  if (as.length === 0 && bs.length === 0) return []
  const aParts = partition(as)
  const bParts = partition(bs)
  /**
   * @type {Array<Array<Item>>} I.e. Array<Pile<Item>>
   */
  const piles = []
  aParts.forEach((aItem, aKey) => {
    // skip if no match or if either item is not unique
    if (aItem.indexes.length > 1) return
    const bItem = bParts.get(aKey)
    if (bItem == null || bItem.indexes.length > 1) return
    aItem.match = bItem
    const bIndex = bItem.indexes[0]
    // The b-indexes on top of the piles increase from left to right, so the leftmost pile whose
    // top is larger than bIndex can be found with a binary search.
    let lo = 0
    let hi = piles.length
    while (lo < hi) {
      const mid = (lo + hi) >>> 1
      const top = array.last(piles[mid])
      if (bIndex < /** @type {Item} */ (top.match).indexes[0]) {
        hi = mid
      } else {
        lo = mid + 1
      }
    }
    // remember the current top of the pile to the left - it precedes this item in the subsequence
    if (lo > 0) aItem.ref = array.last(piles[lo - 1])
    if (lo === piles.length) {
      piles.push([aItem])
    } else {
      piles[lo].push(aItem)
    }
  })
  /**
   * References to all matched items
   *
   * @type {Array<Item>}
   */
  const matches = []
  /**
   * @type {Item?}
   */
  let currPileItem = piles.length > 0 ? piles[piles.length - 1][0] : null
  // follow the references backwards from the last pile to collect the subsequence
  while (currPileItem != null) {
    matches.push(currPileItem)
    currPileItem = currPileItem.ref
  }
  matches.reverse()
  // add pseudo match (assume the string terminal always matches)
  const pseudoA = new Item()
  const pseudoB = new Item()
  pseudoA.match = pseudoB
  pseudoA.indexes.push(as.length)
  pseudoB.indexes.push(bs.length)
  matches.push(pseudoA)
  /**
   * @type {Array<{ index: number, remove: Array<string>, insert: Array<string>}>}
   */
  const changeset = []
  let diffAStart = 0
  let diffBStart = 0
  for (let i = 0; i < matches.length; i++) {
    const m = matches[i]
    const aIndex = m.indexes[0]
    const bIndex = /** @type {Item} */ (m.match).indexes[0]
    // everything between the previous anchor and this one is unmatched content
    const delLength = aIndex - diffAStart
    const insLength = bIndex - diffBStart
    if (delLength !== 0 || insLength !== 0) {
      const stripped = removeCommonPrefixAndSuffix(as.slice(diffAStart, diffAStart + delLength), bs.slice(diffBStart, diffBStart + insLength))
      if (stripped.middleAs.length !== 0 || stripped.middleBs.length !== 0) {
        changeset.push({ index: diffAStart + indexAdjust + stripped.commonPrefix, remove: stripped.middleAs, insert: stripped.middleBs })
      }
    }
    diffAStart = aIndex + 1
    diffBStart = bIndex + 1
  }
  return changeset
}

diff/patience.test.js

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import * as prng from '../prng.js'
2+
import * as t from '../testing.js'
3+
import * as patience from './patience.js'
4+
5+
/**
 * Runs `patience.diffAuto` on the two strings and compares the result with the expected changes.
 *
 * @param {string} a
 * @param {string} b
 * @param {Array<{ insert: string, remove: string, index: number }>} expect
 */
const testDiffAuto = (a, b, expect) => {
  // log the inputs first so they are visible even if the diff itself throws
  t.info(`Diffing "${a}" with "${b}"`)
  const res = patience.diffAuto(a, b)
  t.compare(res, expect)
}
16+
17+
/**
18+
* @param {t.TestCase} _tc
19+
*/
20+
export const testDiffing = _tc => {
21+
testDiffAuto(
22+
'x ',
23+
' ',
24+
[{
25+
insert: '',
26+
remove: 'x ',
27+
index: 0
28+
}]
29+
)
30+
// no change
31+
testDiffAuto(
32+
'testa',
33+
'testa',
34+
[]
35+
)
36+
// single char change
37+
testDiffAuto(
38+
'testa',
39+
'testb',
40+
[{
41+
insert: 'testb',
42+
remove: 'testa',
43+
index: 0
44+
}]
45+
)
46+
// single word change
47+
testDiffAuto(
48+
'The rabbit jumped over the fence.\n',
49+
'The dog jumped over the fence.\n',
50+
[{
51+
insert: 'dog',
52+
remove: 'rabbit',
53+
index: 4
54+
}]
55+
)
56+
// similar sentences.
57+
testDiffAuto(
58+
'the dog. the cat.',
59+
'the cat. the rabbit.',
60+
[{
61+
insert: 'cat',
62+
remove: 'dog',
63+
index: 4
64+
}, {
65+
insert: 'rabbit',
66+
remove: 'cat',
67+
index: 13
68+
}]
69+
)
70+
testDiffAuto(
71+
'cat food',
72+
'my cat food',
73+
[{
74+
insert: 'my ',
75+
remove: '',
76+
index: 0
77+
}]
78+
)
79+
testDiffAuto(
80+
'the cat stuff',
81+
'my cat food',
82+
[{
83+
insert: 'my',
84+
remove: 'the',
85+
index: 0
86+
}, {
87+
insert: 'food',
88+
remove: 'stuff',
89+
index: 8
90+
}]
91+
)
92+
}
93+
94+
/**
 * Generates a random list of words, applies random replacements, inserts and deletes to a copy and
 * verifies that applying the computed diff to the original text yields the changed text.
 *
 * @param {t.TestCase} tc
 */
export const testRepeatRandomWordReplace = tc => {
  const NWords = 600
  const NReplacements = Math.floor(NWords / 20)
  const NInserts = Math.floor(NWords / 20) + 1
  const NDeletes = Math.floor(NWords / 20) + 1
  const MaxWordLen = 6

  t.group(`Diff on changed list of words (#words=${NWords},#replacements=${NReplacements},#inserts=${NInserts},#deletes=${NDeletes})`, () => {
    const words = []
    for (let i = 0; i < NWords; i++) {
      words.push(prng.word(tc.prng, 0, MaxWordLen))
    }
    const newWords = words.slice()
    for (let i = 0; i < NReplacements; i++) {
      const pos = prng.int32(tc.prng, 0, words.length - 1)
      newWords[pos] = prng.word(tc.prng, 0, MaxWordLen)
    }
    for (let i = 0; i < NInserts; i++) {
      const pos = prng.int32(tc.prng, 0, words.length - 1)
      newWords.splice(pos, 0, prng.word(tc.prng, 0, MaxWordLen))
    }
    for (let i = 0; i < NDeletes; i++) {
      const pos = prng.int32(tc.prng, 0, words.length - 1)
      newWords.splice(pos, 1)
    }
    const before = words.join(' ')
    const after = newWords.join(' ')
    /**
     * @type {Array<{ insert: string, remove: string, index: number }>}
     */
    let d = []
    t.measureTime(`time to calculate diff (a.length=${before.length},b.length=${after.length})`, () => {
      d = patience.diffAuto(before, after)
    })
    let updating = before
    // verify by applying - back to front, so that earlier indexes stay valid
    for (let i = d.length - 1; i >= 0; i--) {
      const change = d[i]
      updating = updating.slice(0, change.index) + change.insert + updating.slice(change.index + change.remove.length)
    }
    t.compare(updating, after)
    t.assert(d.length <= NReplacements + 1 + NInserts + NDeletes) // Sanity check: A maximum of one fault
  })
}
144+

package.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@
135135
"import": "./decoding.js",
136136
"require": "./dist/decoding.cjs"
137137
},
138+
"./diff/patience": {
139+
"types": "./diff/patience.d.ts",
140+
"module": "./diff/patience.js",
141+
"import": "./diff/patience.js",
142+
"require": "./dist/patience.cjs"
143+
},
138144
"./diff.js": "./diff.js",
139145
"./dist/diff.cjs": "./dist/diff.cjs",
140146
"./diff": {

0 commit comments

Comments
 (0)