11# frozen_string_literal: true
22
3+ require "csv"
4+
35namespace :duplicates do
46 desc "Detect and list duplicates according to Rule 3"
57 task detect : :environment do
68 puts "Starting Rule 3 duplicate detection..."
79
810 # Find all assets outside main areas with non-empty checksums
11+ output_path = "log/isilon-duplicate-paths.csv"
912 processed = 0
1013 batch_size = 1000
1114 progress_interval = batch_size * 5
@@ -20,37 +23,63 @@ namespace :duplicates do
2023 . having ( "COUNT(*) > 1" )
2124 . pluck ( :file_checksum )
2225
23- duplicate_checksums . each_slice ( batch_size ) do |checksum_batch |
24- checksum_batch . each do |checksum |
25- asset_ids = IsilonAsset . where ( file_checksum : checksum ) . pluck ( :id )
26- next if asset_ids . empty?
27-
28- group = DuplicateGroup . find_or_create_by! ( checksum : checksum )
29- group . duplicate_group_memberships . delete_all
30-
31- now = Time . current
32- rows = asset_ids . map do |asset_id |
33- {
34- duplicate_group_id : group . id ,
35- isilon_asset_id : asset_id ,
36- created_at : now ,
37- updated_at : now
38- }
# Builds the absolute export path for an asset as "/<volume name><isilon_path>".
#
# Returns nil when the asset has no parent folder, or when that folder has no
# volume, so the caller can skip assets that cannot be placed on a volume.
build_full_path = lambda do |asset|
  volume = asset.parent_folder&.volume
  return nil unless volume

  # Always join with an explicit "/" and let squeeze collapse any run of
  # slashes; this covers isilon_path values with or without a leading "/"
  # and keeps stray whitespace out of the exported path.
  "/#{volume.name}/#{asset.isilon_path}".squeeze("/")
end
37+
38+ written = 0
39+ CSV . open ( output_path , "w" , write_headers : true , headers : [ "FullPath" ] ) do |csv |
40+ duplicate_checksums . each_slice ( batch_size ) do |checksum_batch |
41+ checksum_batch . each do |checksum |
42+ asset_ids = IsilonAsset . where ( file_checksum : checksum ) . pluck ( :id )
43+ next if asset_ids . empty?
44+
45+ group = DuplicateGroup . find_or_create_by! ( checksum : checksum )
46+ group . duplicate_group_memberships . delete_all
47+
48+ now = Time . current
49+ rows = asset_ids . map do |asset_id |
50+ {
51+ duplicate_group_id : group . id ,
52+ isilon_asset_id : asset_id ,
53+ created_at : now ,
54+ updated_at : now
55+ }
56+ end
57+ DuplicateGroupMembership . insert_all ( rows ) if rows . any?
58+ IsilonAsset . where ( id : asset_ids ) . update_all ( has_duplicates : true )
59+
60+ IsilonAsset . where ( id : asset_ids )
61+ . includes ( parent_folder : :volume )
62+ . find_each do |asset |
63+ full_path = build_full_path . call ( asset )
64+ next unless full_path
65+
66+ csv << [ full_path ]
67+ written += 1
68+ end
3969 end
40- DuplicateGroupMembership . insert_all ( rows ) if rows . any?
41- IsilonAsset . where ( id : asset_ids ) . update_all ( has_duplicates : true )
42- end
4370
44- processed += checksum_batch . size
45- GC . start
71+ processed += checksum_batch . size
72+ GC . start
4673
47- if processed % progress_interval == 0
48- puts "Processed #{ processed } checksum groups..."
74+ if processed % progress_interval == 0
75+ puts "Processed #{ processed } checksum groups..."
76+ end
4977 end
5078 end
5179
5280 puts "\n ✓ Complete!"
5381 puts "Processed: #{ processed } checksum groups"
82+ puts "Duplicate paths exported to #{ output_path } (#{ written } rows)"
5483 end
5584
5685 desc "Show duplicate statistics"
0 commit comments