Skip to content

Commit 327740b

Browse files
etiennedeggdalle
andauthored
Rewrite of edit_distance with edge costs. fix #111 (#137)
* fix edit_distance * some fixes * add tests; little bit of cleaning * make code type stable * use something; initiate cost with a float * Apply formatter * Fix docstring --------- Co-authored-by: Guillaume Dalle <[email protected]>
1 parent af28a4f commit 327740b

File tree

2 files changed

+210
-65
lines changed

2 files changed

+210
-65
lines changed

src/editdist.jl

Lines changed: 176 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,16 @@ representing vertex operations:
1212
1313
1414
### Optional Arguments
15-
- `insert_cost::Function=v->1.0`
16-
- `delete_cost::Function=u->1.0`
17-
- `subst_cost::Function=(u,v)->0.5`
15+
- `vertex_insert_cost::Function=v->0.`
16+
- `vertex_delete_cost::Function=u->0.`
17+
- `vertex_subst_cost::Function=(u, v)->0.`
18+
- `edge_insert_cost::Function=e->1.`
19+
- `edge_delete_cost::Function=e->1.`
20+
- `edge_subst_cost::Function=(e1, e2)->0.`
21+
22+
The algorithm will always try to match two edges if it can, so if it is
23+
preferable to delete two edges rather than match these, it should be
24+
reflected in the `edge_subst_cost` function.
1825
1926
By default, the algorithm uses constant operation costs. The
2027
user can provide classical Minkowski costs computed from vertex
@@ -31,7 +38,7 @@ search in case the default heuristic is not satisfactory.
3138
- Given two graphs ``|G₁| < |G₂|``, `edit_distance(G₁, G₂)` is faster to
3239
compute than `edit_distance(G₂, G₁)`. Consider swapping the arguments
3340
if involved costs are equivalent.
34-
- The use of simple Minkowski costs can improve performance considerably.
41+
- The use of a heuristic can improve performance considerably.
3542
- Exploit vertex attributes when designing operation costs.
3643
3744
### References
@@ -49,51 +56,163 @@ julia> g1 = SimpleDiGraph([0 1 0 0 0; 0 0 1 0 0; 1 0 0 1 0; 0 0 0 0 1; 0 0 0 1 0
4956
julia> g2 = SimpleDiGraph([0 1 0; 0 0 1; 1 0 0]);
5057
5158
julia> edit_distance(g1, g2)
52-
(3.5, Tuple[(1, 2), (2, 1), (3, 0), (4, 3), (5, 0)])
59+
(3.0, Tuple[(1, 3), (2, 1), (3, 2), (4, 0), (5, 0)])
5360
```
5461
"""
5562
function edit_distance(
5663
G₁::AbstractGraph,
5764
G₂::AbstractGraph;
58-
insert_cost::Function=v -> 1.0,
59-
delete_cost::Function=u -> 1.0,
60-
subst_cost::Function=(u, v) -> 0.5,
61-
heuristic::Function=DefaultEditHeuristic,
65+
vertex_insert_cost=nothing,
66+
vertex_delete_cost=nothing,
67+
vertex_subst_cost=nothing,
68+
edge_insert_cost=nothing,
69+
edge_delete_cost=nothing,
70+
edge_subst_cost=nothing,
71+
heuristic=nothing,
6272
)
73+
if isnothing(vertex_insert_cost) &&
74+
isnothing(vertex_delete_cost) &&
75+
isnothing(vertex_subst_cost) &&
76+
isnothing(edge_insert_cost) &&
77+
isnothing(edge_delete_cost) &&
78+
isnothing(edge_subst_cost) &&
79+
isnothing(heuristic)
80+
heuristic = default_edit_heuristic
81+
end
82+
vertex_insert_cost = something(vertex_insert_cost, v -> 0.0)
83+
vertex_delete_cost = something(vertex_delete_cost, v -> 0.0)
84+
vertex_subst_cost = something(vertex_subst_cost, (u, v) -> 0.0)
85+
edge_insert_cost = something(edge_insert_cost, e -> 1.0)
86+
edge_delete_cost = something(edge_delete_cost, e -> 1.0)
87+
edge_subst_cost = something(edge_subst_cost, (e1, e2) -> 0.0)
88+
heuristic = something(heuristic, (λ, G₁, G₂) -> 0.0)
89+
return _edit_distance(
90+
G₁::AbstractGraph,
91+
G₂::AbstractGraph,
92+
vertex_insert_cost,
93+
vertex_delete_cost,
94+
vertex_subst_cost,
95+
edge_insert_cost,
96+
edge_delete_cost,
97+
edge_subst_cost,
98+
heuristic,
99+
)
100+
end
101+
102+
function _edit_distance(
103+
G₁::AbstractGraph{T},
104+
G₂::AbstractGraph{U},
105+
vertex_insert_cost::Function,
106+
vertex_delete_cost::Function,
107+
vertex_subst_cost::Function,
108+
edge_insert_cost::Function,
109+
edge_delete_cost::Function,
110+
edge_subst_cost::Function,
111+
heuristic::Function,
112+
) where {T<:Integer,U<:Integer}
113+
isdirected = is_directed(G₁) || is_directed(G₂)
114+
115+
# compute the edge cost of associating u1 with v1 and u2 with v2
116+
# u2 and v2 may be 0
117+
function association_cost(u1, u2, v1, v2)
118+
cost = 0.0
119+
if has_edge(G₁, u1, u2)
120+
if has_edge(G₂, v1, v2)
121+
cost += edge_subst_cost(Edge(u1, u2), Edge(v1, v2))
122+
else
123+
cost += edge_delete_cost(Edge(u1, u2))
124+
end
125+
else
126+
if has_edge(G₂, v1, v2)
127+
cost += edge_insert_cost(Edge(v1, v2))
128+
end
129+
end
130+
if isdirected && u1 != u2
131+
if has_edge(G₁, u2, u1)
132+
if has_edge(G₂, v2, v1)
133+
cost += edge_subst_cost(Edge(u2, u1), Edge(v2, v1))
134+
else
135+
cost += edge_delete_cost(Edge(u2, u1))
136+
end
137+
else
138+
if has_edge(G₂, v2, v1)
139+
cost += edge_insert_cost(Edge(v2, v1))
140+
end
141+
end
142+
end
143+
return cost
144+
end
63145

64146
# A* search heuristic
65147
h(λ) = heuristic(λ, G₁, G₂)
66148

67149
# initialize open set
68150
OPEN = PriorityQueue{Vector{Tuple},Float64}()
69-
for v in 1:nv(G₂)
70-
enqueue!(OPEN, [(1, v)], subst_cost(1, v) + h([(1, v)]))
151+
for v in vertices(G₂)
152+
enqueue!(OPEN, [(T(1), v)], vertex_subst_cost(1, v) + h([(T(1), v)]))
71153
end
72-
enqueue!(OPEN, [(1, 0)], delete_cost(1) + h([(1, 0)]))
154+
enqueue!(OPEN, [(T(1), U(0))], vertex_delete_cost(1) + h([(T(1), U(0))]))
73155

156+
c = 0
74157
while true
75158
# minimum (partial) edit path
76159
λ, cost = peek(OPEN)
160+
c += 1
77161
dequeue!(OPEN)
78162

79163
if is_complete_path(λ, G₁, G₂)
80164
return cost, λ
81165
else
82-
k, _ = λ[end]
83-
vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
166+
u1, _ = λ[end]
167+
u1 += T(1)
168+
vs = setdiff(vertices(G₂), [v for (u, v) in λ])
84169

85-
if k < nv(G₁) # there are still vertices to process in G₁?
86-
for v in vs
87-
λ⁺ = [λ; (k + 1, v)]
88-
enqueue!(OPEN, λ⁺, cost + subst_cost(k + 1, v) + h(λ⁺) - h(λ))
170+
if u1 <= nv(G₁) # there are still vertices to process in G₁?
171+
# we try every possible assignment of v1
172+
for v1 in vs
173+
λ⁺ = [λ; (u1, v1)]
174+
new_cost = cost + vertex_subst_cost(u1, v1) + h(λ⁺) - h(λ)
175+
for (u2, v2) in λ
176+
new_cost += association_cost(u1, u2, v1, v2)
177+
end
178+
new_cost += association_cost(u1, u1, v1, v1) # handle self-loops
179+
180+
enqueue!(OPEN, λ⁺, new_cost)
181+
end
182+
# we try deleting u1
183+
λ⁺ = [λ; (u1, U(0))]
184+
new_cost = cost + vertex_delete_cost(u1) + h(λ⁺) - h(λ)
185+
for u2 in outneighbors(G₁, u1)
186+
# edges deleted later when assigning v2
187+
u2 > u1 && continue
188+
new_cost += edge_delete_cost(Edge(u1, u2))
89189
end
90-
λ⁺ = [λ; (k + 1, 0)]
91-
enqueue!(OPEN, λ⁺, cost + delete_cost(k + 1) + h(λ⁺) - h(λ))
190+
if isdirected
191+
for u2 in inneighbors(G₁, u1)
192+
# edges deleted later when assigning v2, and we should not count a self loop twice
193+
u2 >= u1 && continue
194+
new_cost += edge_delete_cost(Edge(u2, u1))
195+
end
196+
end
197+
enqueue!(OPEN, λ⁺, new_cost)
92198
else
93-
# add remaining vertices of G₂ to the path
94-
λ⁺ = [λ; [(0, v) for v in vs]]
95-
total_insert_cost = sum(insert_cost, vs)
96-
enqueue!(OPEN, λ⁺, cost + total_insert_cost + h(λ⁺) - h(λ))
199+
# add remaining vertices of G₂ to the path by inserting them
200+
λ⁺ = [λ; [(T(0), v) for v in vs]]
201+
new_cost = cost + sum(vertex_insert_cost, vs)
202+
for v1 in vs
203+
for v2 in outneighbors(G₂, v1)
204+
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
205+
new_cost += edge_insert_cost(Edge(v1, v2))
206+
end
207+
if isdirected
208+
for v2 in inneighbors(G₂, v1)
209+
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
210+
v1 == v2 && continue # we should not count a self loop twice
211+
new_cost += edge_insert_cost(Edge(v2, v1))
212+
end
213+
end
214+
end
215+
enqueue!(OPEN, λ⁺, new_cost + h(λ⁺) - h(λ))
97216
end
98217
end
99218
end
@@ -112,11 +231,40 @@ function is_complete_path(λ, G₁, G₂)
112231
return length(us) == nv(G₁) && length(vs) == nv(G₂)
113232
end
114233

115-
function DefaultEditHeuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
116-
vs = Set([v for (u, v) in λ])
117-
delete!(vs, 0)
234+
# edit_distance(G₁::AbstractGraph, G₂::AbstractGraph) =
235+
# edit_distance(G₁, G₂,
236+
# vertex_insert_cost=v -> 0.,
237+
# vertex_delete_cost=u -> 0.,
238+
# vertex_subst_cost=(u, v) -> 0.,
239+
# edge_insert_cost=e -> 1.,
240+
# edge_delete_cost=e -> 1.,
241+
# edge_subst_cost=(e1, e2) -> 0.,
242+
# heuristic=default_edit_heuristic)
118243

119-
return nv(G₂) - length(vs)
244+
"""
245+
compute an upper bound on the number of edges that can still be affected
246+
"""
247+
function default_edit_heuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
248+
us = setdiff(1:nv(G₁), [u for (u, v) in λ])
249+
vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
250+
total_free_edges_g1 = 0
251+
total_free_edges_g2 = 0
252+
if !isempty(us)
253+
total_free_edges_g1 = sum(u -> outdegree(G₁, u), us)
254+
end
255+
if !isempty(vs)
256+
total_free_edges_g2 = sum(v -> outdegree(G₂, v), vs)
257+
end
258+
for (u1, v1) in λ
259+
(u1 == 0 || v1 == 0) && continue
260+
total_free_edges_g1 += count(u2 -> u2 in us, outneighbors(G₁, u1))
261+
total_free_edges_g2 += count(v2 -> v2 in vs, outneighbors(G₂, v1))
262+
end
263+
if !is_directed(G₁) && !is_directed(G₂)
264+
total_free_edges_g1 = total_free_edges_g1 / 2
265+
total_free_edges_g2 = total_free_edges_g2 / 2
266+
end
267+
return abs(total_free_edges_g1 - total_free_edges_g2)
120268
end
121269

122270
#-------------------------

test/edit_distance.jl

Lines changed: 34 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4,51 +4,48 @@
44
gquad = random_regular_graph(4, 2; rng=rng)
55
gpent = random_regular_graph(5, 2; rng=rng)
66

7-
@testset "edit_distance $triangle, $quadrangle, $pentagon" for triangle in
8-
testgraphs(gtri),
9-
quadrangle in testgraphs(gquad),
10-
pentagon in testgraphs(gpent)
7+
g1 = star_graph(4)
8+
g2 = cycle_graph(3)
119

12-
d, λ = @inferred(
13-
edit_distance(triangle, quadrangle, subst_cost=MinkowskiCost(1:3, 1:4))
14-
)
15-
@test d == 1.0
16-
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4)]
17-
18-
d, λ = @inferred(
19-
edit_distance(quadrangle, triangle, subst_cost=MinkowskiCost(1:4, 1:3))
20-
)
21-
@test d == 1.0
22-
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0)]
10+
vertex_insert_cost = v -> 1.0
11+
vertex_delete_cost = v -> 2.0
12+
vertex_subst_cost = (u, v) -> 3.0
13+
edge_insert_cost = e -> 4.0
14+
edge_delete_cost = e -> 5.0
15+
edge_subst_cost = (e1, e2) -> 6.0
2316

24-
d, λ = @inferred(
25-
edit_distance(triangle, pentagon, subst_cost=MinkowskiCost(1:3, 1:5))
26-
)
17+
@testset "undirected edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
18+
d, λ = @inferred(edit_distance(G1, G2))
2719
@test d == 2.0
28-
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (0, 4), (0, 5)]
29-
3020
d, λ = @inferred(
31-
edit_distance(pentagon, triangle, subst_cost=MinkowskiCost(1:5, 1:3))
21+
edit_distance(
22+
G1,
23+
G2,
24+
vertex_insert_cost=vertex_insert_cost,
25+
vertex_delete_cost=vertex_delete_cost,
26+
vertex_subst_cost=vertex_subst_cost,
27+
edge_insert_cost=edge_insert_cost,
28+
edge_delete_cost=edge_delete_cost,
29+
edge_subst_cost=edge_subst_cost,
30+
)
3231
)
33-
@test d == 2.0
34-
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 0), (5, 0)]
32+
# 1 vertex deletion, 3 vertex substitutions, 1 edge insertion, 1 edge deletion, 2 edge substitutions
33+
@test d == 32.0
3534
end
3635

37-
@testset "Minkowski cost / bounded Minkowski" begin
38-
cost = @inferred(MinkowskiCost(1:3, 1:3))
39-
bcost = @inferred(BoundedMinkowskiCost(1:3, 1:3))
40-
for i in 1:3
41-
@test cost(i, i) == 0.0
42-
@test bcost(i, i) == 2 / 3
43-
end
36+
g1 = DiGraph(4)
37+
edges = [(1, 2), (1, 4), (2, 3), (3, 1), (3, 4), (4, 1), (1, 1), (4, 4)]
38+
for e in edges
39+
add_edge!(g1, e)
40+
end
41+
g2 = DiGraph(4)
42+
edges = [(2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (2, 2), (3, 3)]
43+
for e in edges
44+
add_edge!(g2, e)
4445
end
4546

46-
g1c = complete_graph(4)
47-
g2c = complete_graph(4)
48-
rem_edge!(g2c, 1, 2)
49-
@testset "edit_distance $g1, $g2" for g1 in testgraphs(g1c), g2 in testgraphs(g2c)
50-
d, λ = @inferred(edit_distance(g1, g2))
51-
@test d == 2.0
52-
@test λ == Tuple[(1, 1), (2, 2), (3, 3), (4, 4)]
47+
@testset "directed edit_distance" for G1 in testgraphs(g1), G2 in testgraphs(g2)
48+
d, λ = @inferred(edit_distance(G1, G2))
49+
@test d == 4.0
5350
end
5451
end

0 commit comments

Comments
 (0)