Skip to content

Commit 637957c

Browse files
committed
add logging to csv for all dups, with full path including volume
1 parent 4398be4 commit 637957c

File tree

1 file changed

+52
-23
lines changed

1 file changed

+52
-23
lines changed

lib/tasks/detect_duplicates.rake

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
# frozen_string_literal: true
22

3+
require "csv"
4+
35
namespace :duplicates do
46
desc "Detect and list duplicates according to Rule 3"
57
task detect: :environment do
68
puts "Starting Rule 3 duplicate detection..."
79

810
# Find all assets outside main areas with non-empty checksums
11+
output_path = "log/isilon-duplicate-paths.csv"
912
processed = 0
1013
batch_size = 1000
1114
progress_interval = batch_size * 5
@@ -20,37 +23,63 @@ namespace :duplicates do
2023
.having("COUNT(*) > 1")
2124
.pluck(:file_checksum)
2225

23-
duplicate_checksums.each_slice(batch_size) do |checksum_batch|
24-
checksum_batch.each do |checksum|
25-
asset_ids = IsilonAsset.where(file_checksum: checksum).pluck(:id)
26-
next if asset_ids.empty?
27-
28-
group = DuplicateGroup.find_or_create_by!(checksum: checksum)
29-
group.duplicate_group_memberships.delete_all
30-
31-
now = Time.current
32-
rows = asset_ids.map do |asset_id|
33-
{
34-
duplicate_group_id: group.id,
35-
isilon_asset_id: asset_id,
36-
created_at: now,
37-
updated_at: now
38-
}
26+
build_full_path = lambda do |asset|
# Builds the volume-qualified path string that is written to the CSV for +asset+.
# Returns nil when asset.parent_folder is missing, or when that folder has no volume.
# Otherwise the result is "/" + volume.name + asset.isilon_path, where a leading "/"
# is added to the path if it lacks one (a nil isilon_path becomes "" via to_s, so the
# result is then just "/<volume name>/"), and any run of two or more consecutive
# slashes anywhere in the combined string is collapsed to a single "/".
# Because this is a lambda (not a plain proc), the `return nil` guards exit only the
# lambda itself and not the surrounding rake task block.
27+
parent = asset.parent_folder
28+
return nil unless parent
29+
30+
volume = parent.volume
31+
return nil unless volume
32+
33+
path = asset.isilon_path.to_s
34+
path = "/#{path}" unless path.start_with?("/")
35+
"/#{volume.name}#{path}".gsub(%r{//+}, "/")
36+
end
37+
38+
written = 0
39+
CSV.open(output_path, "w", write_headers: true, headers: [ "FullPath" ]) do |csv|
40+
duplicate_checksums.each_slice(batch_size) do |checksum_batch|
41+
checksum_batch.each do |checksum|
42+
asset_ids = IsilonAsset.where(file_checksum: checksum).pluck(:id)
43+
next if asset_ids.empty?
44+
45+
group = DuplicateGroup.find_or_create_by!(checksum: checksum)
46+
group.duplicate_group_memberships.delete_all
47+
48+
now = Time.current
49+
rows = asset_ids.map do |asset_id|
50+
{
51+
duplicate_group_id: group.id,
52+
isilon_asset_id: asset_id,
53+
created_at: now,
54+
updated_at: now
55+
}
56+
end
57+
DuplicateGroupMembership.insert_all(rows) if rows.any?
58+
IsilonAsset.where(id: asset_ids).update_all(has_duplicates: true)
59+
60+
IsilonAsset.where(id: asset_ids)
61+
.includes(parent_folder: :volume)
62+
.find_each do |asset|
63+
full_path = build_full_path.call(asset)
64+
next unless full_path
65+
66+
csv << [ full_path ]
67+
written += 1
68+
end
3969
end
40-
DuplicateGroupMembership.insert_all(rows) if rows.any?
41-
IsilonAsset.where(id: asset_ids).update_all(has_duplicates: true)
42-
end
4370

44-
processed += checksum_batch.size
45-
GC.start
71+
processed += checksum_batch.size
72+
GC.start
4673

47-
if processed % progress_interval == 0
48-
puts "Processed #{processed} checksum groups..."
74+
if processed % progress_interval == 0
75+
puts "Processed #{processed} checksum groups..."
76+
end
4977
end
5078
end
5179

5280
puts "\n✓ Complete!"
5381
puts "Processed: #{processed} checksum groups"
82+
puts "Duplicate paths exported to #{output_path} (#{written} rows)"
5483
end
5584

5685
desc "Show duplicate statistics"

0 commit comments

Comments
 (0)