Skip to content

Commit dbc173b

Browse files
committed
update csv export to child only, no parents
1 parent 6406f9a commit dbc173b

File tree

2 files changed

+46
-5
lines changed

2 files changed

+46
-5
lines changed

lib/tasks/detect_duplicates.rake

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ namespace :duplicates do
2626
log.call("Scanning assets with matching checksums...")
2727
log.call("Processing in batches of #{batch_size}...")
2828

29+
main_volume_names = %w[Deposit Media-Repository]
30+
2931
duplicate_checksums = IsilonAsset.where("NULLIF(TRIM(file_checksum), '') IS NOT NULL")
3032
.group(:file_checksum)
3133
.having("COUNT(*) > 1")
@@ -44,7 +46,8 @@ namespace :duplicates do
4446
end
4547

4648
written = 0
47-
CSV.open(output_path, "w", write_headers: true, headers: [ "FullPath" ]) do |csv|
49+
headers = [ "File", "Path", "Checksum", "File Size" ]
50+
CSV.open(output_path, "w", write_headers: true, headers: headers) do |csv|
4851
duplicate_checksums.each_slice(batch_size) do |checksum_batch|
4952
batch_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
5053
checksum_batch.each_with_index do |checksum, index|
@@ -67,13 +70,20 @@ namespace :duplicates do
6770
DuplicateGroupMembership.insert_all(rows) if rows.any?
6871
IsilonAsset.where(id: asset_ids).update_all(has_duplicates: true)
6972

70-
IsilonAsset.where(id: asset_ids)
71-
.includes(parent_folder: :volume)
72-
.find_each do |asset|
73+
main_scope = IsilonAsset.joins(parent_folder: :volume)
74+
.where(file_checksum: checksum, volumes: { name: main_volume_names })
75+
outside_scope = IsilonAsset.joins(parent_folder: :volume)
76+
.where(file_checksum: checksum)
77+
.where.not(volumes: { name: main_volume_names })
78+
79+
next unless main_scope.exists?
80+
next unless outside_scope.exists?
81+
82+
outside_scope.includes(parent_folder: :volume).find_each do |asset|
7383
full_path = build_full_path.call(asset)
7484
next unless full_path
7585

76-
csv << [ full_path ]
86+
csv << [ asset.isilon_name, full_path, checksum, asset.file_size ]
7787
written += 1
7888
end
7989

spec/tasks/duplicates_rake_spec.rb

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@
1818
let!(:media_volume) { create(:volume, name: "Media-Repository") }
1919
let!(:deposit_folder) { create(:isilon_folder, volume: deposit_volume, full_path: "/Deposit/project") }
2020
let!(:media_folder) { create(:isilon_folder, volume: media_volume, full_path: "/Media-Repository/project") }
21+
let(:export_path) { Rails.root.join("log/isilon-duplicate-paths.csv") }
22+
let(:detect_log_path) { Rails.root.join("log/isilon-duplicates-detect.log") }
23+
24+
after do
25+
File.delete(export_path) if File.exist?(export_path)
26+
File.delete(detect_log_path) if File.exist?(detect_log_path)
27+
end
2128

2229
it "groups assets with matching checksums" do
2330
original = create(:isilon_asset, parent_folder: deposit_folder, file_checksum: "abc", file_size: "100")
@@ -37,6 +44,30 @@
3744
expect(IsilonAsset.where(has_duplicates: true).count).to eq(4)
3845
expect(IsilonAsset.where(has_duplicates: false).count).to eq(2)
3946
end
47+
48+
it "exports child rows with checksum and file size for checksums shared across main and outside volumes" do
49+
other_volume = create(:volume, name: "Other")
50+
other_folder = create(:isilon_folder, volume: other_volume, full_path: "/Other/project")
51+
52+
create(:isilon_asset, parent_folder: deposit_folder, isilon_path: "/project/main.txt", isilon_name: "main.txt", file_checksum: "abc", file_size: "100")
53+
create(:isilon_asset, parent_folder: media_folder, isilon_path: "/project/main2.txt", isilon_name: "main2.txt", file_checksum: "abc", file_size: "100")
54+
outside_asset = create(:isilon_asset, parent_folder: other_folder, isilon_path: "/project/out.txt", isilon_name: "out.txt", file_checksum: "abc", file_size: "100")
55+
create(:isilon_asset, parent_folder: other_folder, isilon_path: "/project/solo.txt", isilon_name: "solo.txt", file_checksum: "xyz", file_size: "100")
56+
57+
Rake::Task["duplicates:detect"].invoke
58+
59+
exported = CSV.read(export_path, headers: true)
60+
child_row = exported.find { |row| row["File"] == "out.txt" }
61+
solo_row = exported.find { |row| row["File"] == "solo.txt" }
62+
63+
expect(child_row).to be_present
64+
expect(child_row["Path"]).to eq("/Other/project/out.txt")
65+
expect(child_row["Checksum"]).to eq("abc")
66+
expect(child_row["File Size"]).to eq("100")
67+
expect(exported.find { |row| row["File"] == "main.txt" }).to be_nil
68+
expect(exported.find { |row| row["File"] == "main2.txt" }).to be_nil
69+
expect(solo_row).to be_nil
70+
end
4071
end
4172

4273
describe "duplicates:clear" do

0 commit comments

Comments
 (0)