11# frozen_string_literal: true
22
33require "csv"
4- require "logger"
5-
64namespace :duplicates do
75 desc "Detect and list duplicates according to Rule 3"
86 task detect : :environment do
9- logger = Logger . new ( "log/isilon-duplicates-detect.log" )
10- log = lambda do |message |
11- puts message
12- logger . info ( message )
13- end
14-
15- log . call ( "Starting Rule 3 duplicate detection..." )
7+ puts "Starting Rule 3 duplicate detection..."
168
179 # Find all assets outside main areas with non-empty checksums
1810 output_path = "log/isilon-duplicate-paths.csv"
@@ -23,8 +15,8 @@ namespace :duplicates do
2315 slow_seconds = ENV . fetch ( "DUPLICATES_SLOW_SECONDS" , "10" ) . to_f
2416 large_group_size = ENV . fetch ( "DUPLICATES_LARGE_GROUP_SIZE" , "20000" ) . to_i
2517
26- log . call ( "Scanning assets with matching checksums..." )
27- log . call ( "Processing in batches of #{ batch_size } ..." )
18+ puts "Scanning assets with matching checksums..."
19+ puts "Processing in batches of #{ batch_size } ..."
2820
2921 main_volume_names = %w[ Deposit Media-Repository ]
3022
@@ -90,25 +82,25 @@ namespace :duplicates do
9082 elapsed = Process . clock_gettime ( Process ::CLOCK_MONOTONIC ) - started_at
9183 global_index = processed + index + 1
9284 if ( global_index % log_every == 0 ) || elapsed >= slow_seconds || asset_ids . length >= large_group_size
93- log . call ( "Processed checksum #{ global_index } /#{ duplicate_checksums . length } (assets=#{ asset_ids . length } ) in #{ format ( '%.2f' , elapsed ) } s" )
85+ puts "Processed checksum #{ global_index } /#{ duplicate_checksums . length } (assets=#{ asset_ids . length } ) in #{ format ( '%.2f' , elapsed ) } s"
9486 end
9587 end
9688
9789 processed += checksum_batch . size
9890 GC . start
9991
10092 if processed % progress_interval == 0
101- log . call ( "Processed #{ processed } checksum groups..." )
93+ puts "Processed #{ processed } checksum groups..."
10294 end
10395
10496 batch_elapsed = Process . clock_gettime ( Process ::CLOCK_MONOTONIC ) - batch_started_at
105- log . call ( "Batch complete (#{ checksum_batch . size } checksums) in #{ format ( '%.2f' , batch_elapsed ) } s" )
97+ puts "Batch complete (#{ checksum_batch . size } checksums) in #{ format ( '%.2f' , batch_elapsed ) } s"
10698 end
10799 end
108100
109- log . call ( "\n ✓ Complete!" )
110- log . call ( "Processed: #{ processed } checksum groups" )
111- log . call ( "Duplicate paths exported to #{ output_path } (#{ written } rows)" )
101+ puts "\n ✓ Complete!"
102+ puts "Processed: #{ processed } checksum groups"
103+ puts "Duplicate paths exported to #{ output_path } (#{ written } rows)"
112104 end
113105
114106 desc "Show duplicate statistics"
0 commit comments