Skip to content

Commit bce4708

Browse files
committed
Better levenshtein distances
1 parent 57ae3db commit bce4708

File tree

6 files changed

+45
-34
lines changed

6 files changed

+45
-34
lines changed

Gemfile

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,4 +59,5 @@ gem "aws-sdk"
5959
gem "bcrypt"
6060

6161
# Levenshtein Distance Analyzer
62-
gem 'edits'
62+
gem 'edits'
63+
gem "table_print"

Gemfile.lock

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -886,6 +886,7 @@ GEM
886886
activesupport (>= 4.0)
887887
sprockets (>= 3.0.0)
888888
sqlite3 (1.3.13)
889+
table_print (1.5.6)
889890
thor (0.20.3)
890891
thread_safe (0.3.6)
891892
tilt (2.0.9)
@@ -944,6 +945,7 @@ DEPENDENCIES
944945
spring (~> 2.0)
945946
spring-watcher-listen (~> 2.0)
946947
sqlite3 (~> 1.3.6)
948+
table_print
947949
turbolinks (~> 5)
948950
tzinfo-data
949951
uglifier (>= 1.3.0)

Makefile

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -45,22 +45,13 @@ production_deploy:
4545
heroku restart --app confy-wecodeio
4646
heroku maintenance:off --app confy-wecodeio
4747

48-
capture_production_db:
48+
use_production_db:
4949
heroku pg:backups capture --app confy-wecodeio
50-
$(MAKE) download_production_db
51-
52-
download_production_db:
5350
curl -o tmp/latest.dump `heroku pg:backups public-url --app confy-wecodeio`
54-
55-
restore_production_db:
5651
docker-compose run web bundle exec rake db:drop db:create DISABLE_DATABASE_ENVIRONMENT_CHECK=1
5752
docker cp tmp/latest.dump confy_db_1:/latest.dump
5853
! docker exec confy_db_1 pg_restore --verbose --clean --no-acl --no-owner -h localhost -d confy_development -U confy /latest.dump
5954
docker-compose run web bundle exec rake db:migrate
6055

61-
get_current_production_db:
62-
$(MAKE) capture_production_db
63-
$(MAKE) restore_production_db
64-
6556
analyze_levenshtein:
6657
docker-compose run web rails runner utilities/data_analysis/distance.rb

app/controllers/admin/speakers_controller.rb

Lines changed: 17 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -10,25 +10,27 @@ def levenshtein
1010
field = params[:field] || "name"
1111

1212
by_size = items.group_by { |s| s.name.length }
13-
@pairs = []
13+
pairs = []
1414
(0..(by_size.keys.size - 1)).each do |k|
15-
interesting_values = []
16-
((-[k, THRESHOLD - 1].min)..0).each do |p|
17-
interesting_values << by_size[by_size.keys.sort[k + p]]
18-
end
19-
(0..interesting_values.length - 1).each do |o|
20-
(0..(interesting_values[o].length - 1)).each do |i|
21-
d = 0
22-
((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j|
23-
p1 = interesting_values[o][i]
24-
p2 = interesting_values[interesting_values.length - 1][j]
25-
d = Edits::Levenshtein.distance_with_max(p1[field], p2[field], THRESHOLD + 1)
26-
@pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD
15+
interesting_values = []
16+
((-[k, THRESHOLD - 1].min)..0).each do |p|
17+
interesting_values << by_size[by_size.keys.sort[k + p]]
18+
end
19+
(0..interesting_values.length - 1).each do |o|
20+
(0..(interesting_values[o].length - 1)).each do |i|
21+
d = 0
22+
((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j|
23+
p1 = interesting_values[o][i]
24+
p2 = interesting_values[interesting_values.length - 1][j]
25+
d = Edits::Levenshtein.distance_with_max(p1.levenshtein_name, p2.levenshtein_name, THRESHOLD + 1)
26+
pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD
27+
end
2728
end
2829
end
29-
end
3030
end
31-
@pairs.sort_by!(&:distance)
31+
pairs.sort_by!(&:distance)
32+
33+
@pairs = pairs
3234
end
3335

3436
def merge

app/models/speaker.rb

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,4 +14,9 @@ def merge_with(disposable)
1414
talks << disposable.talks
1515
disposable.delete
1616
end
17+
18+
# Normalized form of the speaker's name, used as input for Levenshtein
# distance comparisons.
#
# The name is lowercased, digits and every character that is neither a word
# character nor whitespace are removed, and the remaining words are sorted,
# so "John Smith" and "Smith, John" both normalize to "john smith".
#
# @return [String] the cleaned, lowercase words of +name+, sorted and joined
#   by single spaces
def levenshtein_name
  # Whitespace has to survive the cleanup: removing every \W character would
  # glue the words together and turn the split/sort below into a no-op.
  # Lowercasing happens before the sort so word order does not depend on case.
  name.downcase.gsub(/[^\w\s]|\d/, "").split.sort.join(" ")
end
21+
1722
end

utilities/data_analysis/distance.rb

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3,22 +3,32 @@
33
clazz = Speaker
44
field = "name"
55

6+
THRESHOLD = 5
67

78
items = clazz.all
89
total = items.size
910
steps = total / 20.0
1011

12+
by_size = items.group_by { |s| s.name.length }
1113
pairs = []
12-
(0..(total - 1)).each do |i|
13-
((i+1)..(total - 1)).each do |j|
14-
p1 = items[i]
15-
p2 = items[j]
16-
d = Edits::Levenshtein.distance(p1[field], p2[field])
17-
pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < 3
14+
(0..(by_size.keys.size - 1)).each do |k|
15+
interesting_values = []
16+
((-[k, THRESHOLD - 1].min)..0).each do |p|
17+
interesting_values << by_size[by_size.keys.sort[k + p]]
18+
end
19+
(0..interesting_values.length - 1).each do |o|
20+
(0..(interesting_values[o].length - 1)).each do |i|
21+
d = 0
22+
((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j|
23+
p1 = interesting_values[o][i]
24+
p2 = interesting_values[interesting_values.length - 1][j]
25+
d = Edits::Levenshtein.distance_with_max(p1.levenshtein_name, p2.levenshtein_name, THRESHOLD + 1)
26+
pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD
27+
end
28+
end
1829
end
19-
# printf("\rProgress: [%-20s]", "=" * (i/steps))
20-
# puts
2130
end
31+
pairs.sort_by!(&:distance)
2232

2333

2434
puts "Informe para #{clazz} por #{field}"

0 commit comments

Comments
 (0)