Skip to content

Commit 93ed9b1

Browse files
IMT-167 migration status tiff comparison checking (#191)
* fixes for folders checked and account for first path segment being removed * update tests and fix task accordingly * rubocop
1 parent 7615b84 commit 93ed9b1

File tree

2 files changed

+134
-81
lines changed

2 files changed

+134
-81
lines changed

app/services/sync_service/tiffs.rb

Lines changed: 116 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,12 @@ def initialize(volume_name: nil)
1717
if @volume_name
1818
@parent_volume = Volume.find_by(name: @volume_name)
1919
raise ArgumentError, "Volume '#{@volume_name}' not found" unless @parent_volume
20-
stdout_and_log("Starting post-processing for volume: #{@volume_name}")
2120
else
2221
@parent_volume = nil # Will process all volumes
23-
stdout_and_log("Starting post-processing for all volumes")
2422
end
2523
end
2624

2725
def process
28-
stdout_and_log("Starting Rule 4 post-processing for TIFF comparison...")
29-
3026
begin
3127
# Use ActiveRecord to find parent directories with matching TIFF counts
3228
parent_dirs_with_counts = find_parent_dirs_with_matching_tiff_counts_ar
@@ -36,7 +32,13 @@ def process
3632
# Update unprocessed TIFFs in bulk
3733
total_updated = 0
3834
parent_dirs_with_counts.each do |parent_info|
39-
updated = mark_unprocessed_tiffs_as_dont_migrate(parent_info[:parent_dir], parent_info[:processed_count])
35+
updated = mark_unprocessed_tiffs_as_dont_migrate(
36+
parent_info[:parent_dir],
37+
parent_info[:child_folder],
38+
parent_info[:child_key],
39+
parent_info[:parent_key],
40+
parent_info[:processed_count]
41+
)
4042
total_updated += updated
4143
end
4244

@@ -63,43 +65,69 @@ def process
6365

6466
private
6567

68+
ROOT_CHILD_KEY = "__root__".freeze
69+
6670
def find_parent_dirs_with_matching_tiff_counts_ar
67-
# Base query for all TIFF assets in deposit folders (excluding scrc accessions)
6871
base_query = build_base_tiff_query
69-
70-
# Get all matching assets and group them in Ruby
7172
assets = base_query.pluck(:isilon_path)
7273

73-
# Group assets by parent directory and classify as processed/unprocessed
74-
parent_dir_stats = {}
74+
stats = Hash.new do |hash, parent_key|
75+
hash[parent_key] = {
76+
original_parent: nil,
77+
processed: Hash.new { |child_hash, key| child_hash[key] = { count: 0, name: nil } },
78+
unprocessed: Hash.new { |child_hash, key| child_hash[key] = { count: 0, name: nil } }
79+
}
80+
end
7581

7682
assets.each do |path|
77-
parent_dir = extract_parent_directory(path)
78-
next unless parent_dir
83+
parent_dir, child_folder, category = extract_parent_child_and_category(path)
84+
next unless parent_dir && category
85+
86+
parent_key = parent_dir.downcase
87+
record = stats[parent_key]
88+
record[:original_parent] ||= parent_dir
89+
90+
child_key = (child_folder || ROOT_CHILD_KEY).downcase
91+
bucket = category == :processed ? :processed : :unprocessed
92+
entry = record[bucket][child_key]
93+
entry[:count] += 1
94+
entry[:name] ||= child_folder
95+
end
7996

80-
subdirectory_type = classify_subdirectory_type(path)
81-
next unless subdirectory_type
97+
matches = []
8298

83-
parent_dir_stats[parent_dir] ||= { processed: 0, unprocessed: 0 }
84-
parent_dir_stats[parent_dir][subdirectory_type.to_sym] += 1
85-
end
99+
stats.each do |parent_key, record|
100+
processed_children = record[:processed]
101+
next if processed_children.empty?
86102

87-
# Filter to only include directories where processed count equals unprocessed count
88-
# and both counts are > 0
89-
matching_dirs = parent_dir_stats.select do |parent_dir, counts|
90-
counts[:processed] > 0 &&
91-
counts[:unprocessed] > 0 &&
92-
counts[:processed] == counts[:unprocessed]
93-
end
103+
processed_children.each do |child_key, processed_entry|
104+
unprocessed_entry = record[:unprocessed][child_key]
105+
next unless unprocessed_entry[:count].positive?
94106

95-
# Convert to the expected format
96-
matching_dirs.map do |parent_dir, counts|
97-
{
98-
parent_dir: parent_dir,
99-
processed_count: counts[:processed],
100-
unprocessed_count: counts[:unprocessed]
101-
}
107+
original_parent = record[:original_parent] || parent_key
108+
child_display = processed_entry[:name] || unprocessed_entry[:name]
109+
110+
next unless processed_entry[:count] == unprocessed_entry[:count]
111+
112+
log_tiff_directory(
113+
original_parent,
114+
child_display,
115+
processed_entry[:count],
116+
unprocessed_entry[:count]
117+
)
118+
119+
matches << {
120+
parent_dir: original_parent,
121+
parent_key: parent_key,
122+
child_folder: child_display,
123+
child_key: child_key,
124+
processed_count: processed_entry[:count],
125+
unprocessed_count: unprocessed_entry[:count]
126+
}
127+
end
102128
end
129+
130+
matches
103131
end
104132

105133
def build_base_tiff_query
@@ -112,9 +140,8 @@ def build_base_tiff_query
112140
"%tiff%", "%.tiff", "%.tif"
113141
)
114142

115-
# Filter to deposit folders, excluding scrc accessions
116-
query = query.where("LOWER(isilon_path) LIKE ?", "%/deposit/%")
117-
query = query.where("LOWER(isilon_path) NOT LIKE ?", "%/deposit/scrc accessions%")
143+
# Exclude SCRC Accessions regardless of volume prefix stripping
144+
query = query.where("LOWER(isilon_path) NOT LIKE ?", "%/scrc accessions/%")
118145

119146
# Filter to processed, unprocessed, or raw subdirectories
120147
query = query.where(
@@ -128,55 +155,85 @@ def build_base_tiff_query
128155
query
129156
end
130157

131-
def extract_parent_directory(path)
132-
# Extract the parent directory before /processed/, /unprocessed/, or /raw/
133-
case path.downcase
134-
when /^(.+)\/processed\//
135-
$1
136-
when /^(.+)\/unprocessed\//
137-
$1
138-
when /^(.+)\/raw\//
139-
$1
140-
else
141-
nil
142-
end
143-
end
158+
def extract_parent_child_and_category(path)
159+
segments = path.split("/")
160+
key_index = segments.index { |segment| %w[processed unprocessed raw].include?(segment&.downcase) }
161+
return nil unless key_index
144162

145-
def classify_subdirectory_type(path)
146-
case path.downcase
147-
when /\/processed\//
148-
"processed"
149-
when /\/unprocessed\//, /\/raw\//
150-
"unprocessed"
163+
parent_segments = segments[0...key_index]
164+
parent_dir = parent_segments.join("/")
165+
parent_dir = "/#{parent_dir}".gsub(%r{//+}, "/")
166+
parent_dir = "/" if parent_dir.blank?
167+
parent_dir = parent_dir.downcase
168+
169+
remainder = segments[(key_index + 1)..]
170+
child_folder = if remainder && remainder.length > 1
171+
remainder.first.downcase
151172
else
152-
nil
173+
nil
153174
end
175+
176+
category = segments[key_index].downcase == "processed" ? :processed : :unprocessed
177+
178+
[ parent_dir, child_folder, category ]
154179
end
155180

156-
def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, count)
181+
def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, child_folder, child_key, parent_key, count)
157182
dont_migrate_status = MigrationStatus.find_by(name: "Don't migrate")
158183

159184
unless dont_migrate_status
160185
stdout_and_log("ERROR: 'Don't migrate' status not found", level: :error)
161186
return 0
162187
end
163188

164-
# Build query for unprocessed TIFFs in this parent directory
189+
patterns =
190+
if child_key == ROOT_CHILD_KEY
191+
[
192+
"#{parent_key}/unprocessed/%",
193+
"#{parent_key}/raw/%"
194+
]
195+
else
196+
[
197+
"#{parent_key}/unprocessed/#{child_key}/%",
198+
"#{parent_key}/raw/#{child_key}/%"
199+
]
200+
end
201+
165202
query = IsilonAsset.joins(parent_folder: :volume)
166-
.where("isilon_path LIKE ?", "#{parent_dir}/%")
167-
.where("(LOWER(isilon_path) LIKE '%/unprocessed/%' OR LOWER(isilon_path) LIKE '%/raw/%')")
203+
.where(
204+
"(LOWER(isilon_path) LIKE ? OR LOWER(isilon_path) LIKE ?)",
205+
*patterns
206+
)
168207
.where("(LOWER(file_type) LIKE '%tiff%' OR LOWER(isilon_path) LIKE '%.tiff' OR LOWER(isilon_path) LIKE '%.tif')")
169208

170209
# Add volume filter if processing specific volume
171210
query = query.where(parent_folder: { volume: @parent_volume }) if @parent_volume
172211

173212
updated_count = query.update_all(migration_status_id: dont_migrate_status.id)
174213

175-
stdout_and_log("Rule 4: Marked #{updated_count} unprocessed TIFFs as 'Don't migrate' in #{parent_dir} (#{count} processed = #{count} unprocessed)")
214+
stdout_and_log(
215+
"Rule 4: Marked #{updated_count} unprocessed TIFFs as 'Don't migrate' in #{parent_dir}/(#{child_folder || 'root'}) "\
216+
"(#{count} processed = #{count} unprocessed)"
217+
)
176218

177219
updated_count
178220
end
179221

222+
def extract_parent_directory(path)
223+
extract_parent_child_and_category(path)&.first
224+
end
225+
226+
def classify_subdirectory_type(path)
227+
_, _, category = extract_parent_child_and_category(path)
228+
return nil unless category
229+
230+
category == :processed ? "processed" : "unprocessed"
231+
end
232+
233+
def log_tiff_directory(parent_dir, child_folder, processed_count, unprocessed_count)
234+
# intentionally suppressed verbose candidate logging
235+
end
236+
180237
def stdout_and_log(message, level: :info)
181238
@log.send(level, message)
182239
@stdout.send(level, message)

spec/services/sync_service/tiffs_spec.rb

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -45,28 +45,32 @@
4545
# Set up test data with folder structure
4646
let!(:project1_folder) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1") }
4747
let!(:project1_processed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/processed", parent_folder: project1_folder) }
48+
let!(:project1_processed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/processed/batch1", parent_folder: project1_processed) }
4849
let!(:project1_unprocessed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/unprocessed", parent_folder: project1_folder) }
50+
let!(:project1_unprocessed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/unprocessed/batch1", parent_folder: project1_unprocessed) }
4951

5052
let!(:project2_folder) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2") }
5153
let!(:project2_processed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/processed", parent_folder: project2_folder) }
54+
let!(:project2_processed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/processed/batch1", parent_folder: project2_processed) }
5255
let!(:project2_raw) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/raw", parent_folder: project2_folder) }
56+
let!(:project2_raw_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/raw/batch1", parent_folder: project2_raw) }
5357

5458
# Create TIFF assets - Project 1: Equal counts (2 processed, 2 unprocessed)
5559
let!(:project1_assets) do
5660
[
57-
FactoryBot.create(:isilon_asset, parent_folder: project1_processed, isilon_path: "/deposit/project1/processed/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
58-
FactoryBot.create(:isilon_asset, parent_folder: project1_processed, isilon_path: "/deposit/project1/processed/image002.tiff", file_type: "TIFF", migration_status: default_migration_status),
59-
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed, isilon_path: "/deposit/project1/unprocessed/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
60-
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed, isilon_path: "/deposit/project1/unprocessed/image002.tiff", file_type: "TIFF", migration_status: default_migration_status)
61+
FactoryBot.create(:isilon_asset, parent_folder: project1_processed_batch, isilon_path: "/deposit/project1/processed/batch1/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
62+
FactoryBot.create(:isilon_asset, parent_folder: project1_processed_batch, isilon_path: "/deposit/project1/processed/batch1/image002.tiff", file_type: "TIFF", migration_status: default_migration_status),
63+
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed_batch, isilon_path: "/deposit/project1/unprocessed/batch1/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
64+
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed_batch, isilon_path: "/deposit/project1/unprocessed/batch1/image002.tiff", file_type: "TIFF", migration_status: default_migration_status)
6165
]
6266
end
6367

6468
# Create TIFF assets - Project 2: Unequal counts (1 processed, 2 raw)
6569
let!(:project2_assets) do
6670
[
67-
FactoryBot.create(:isilon_asset, parent_folder: project2_processed, isilon_path: "/deposit/project2/processed/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
68-
FactoryBot.create(:isilon_asset, parent_folder: project2_raw, isilon_path: "/deposit/project2/raw/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
69-
FactoryBot.create(:isilon_asset, parent_folder: project2_raw, isilon_path: "/deposit/project2/raw/scan002.tiff", file_type: "TIFF", migration_status: default_migration_status)
71+
FactoryBot.create(:isilon_asset, parent_folder: project2_processed_batch, isilon_path: "/deposit/project2/processed/batch1/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
72+
FactoryBot.create(:isilon_asset, parent_folder: project2_raw_batch, isilon_path: "/deposit/project2/raw/batch1/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
73+
FactoryBot.create(:isilon_asset, parent_folder: project2_raw_batch, isilon_path: "/deposit/project2/raw/batch1/scan002.tiff", file_type: "TIFF", migration_status: default_migration_status)
7074
]
7175
end
7276

@@ -81,15 +85,15 @@
8185

8286
# Project 1: 2 processed, 2 unprocessed - should mark unprocessed as "Don't migrate"
8387
project1_unprocessed = IsilonAsset.where(isilon_path: [
84-
"/deposit/project1/unprocessed/image001.tiff",
85-
"/deposit/project1/unprocessed/image002.tiff"
88+
"/deposit/project1/unprocessed/batch1/image001.tiff",
89+
"/deposit/project1/unprocessed/batch1/image002.tiff"
8690
])
8791
expect(project1_unprocessed.all? { |asset| asset.migration_status == dont_migrate_status }).to be true
8892

8993
# Project 2: 1 processed, 2 raw - should NOT change (counts don't match)
9094
project2_raw = IsilonAsset.where(isilon_path: [
91-
"/deposit/project2/raw/scan001.tiff",
92-
"/deposit/project2/raw/scan002.tiff"
95+
"/deposit/project2/raw/batch1/scan001.tiff",
96+
"/deposit/project2/raw/batch1/scan002.tiff"
9397
])
9498
expect(project2_raw.all? { |asset| asset.migration_status == default_migration_status }).to be true
9599
end
@@ -126,19 +130,11 @@
126130
it 'completes successfully but logs error and updates nothing' do
127131
service = described_class.new(volume_name: "deposit")
128132

129-
# Allow all normal log messages to pass through
130-
allow(service).to receive(:stdout_and_log).and_call_original
131-
132-
# Expect the specific error message to be logged at least once
133-
expect(service).to receive(:stdout_and_log).with(
134-
"ERROR: 'Don't migrate' status not found",
135-
level: :error
136-
).at_least(:once).and_call_original
137-
138133
result = service.process
139134

140135
# Service completes successfully but doesn't update anything
141136
expect(result.success?).to be true
137+
expect(result.tiff_comparisons_updated).to eq(1)
142138
expect(result.migration_statuses_updated).to eq(0)
143139
end
144140
end
@@ -226,7 +222,7 @@
226222
expect(results).not_to include(pdf_asset.isilon_path)
227223
end
228224

229-
it 'filters to deposit folders' do
225+
it 'includes folders regardless of top-level prefix' do
230226
other_folder = FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/other/test/processed")
231227

232228
deposit_asset = FactoryBot.create(:isilon_asset, parent_folder: test_folder, isilon_path: "/deposit/test/processed/image.tiff")
@@ -236,7 +232,7 @@
236232
results = query.pluck(:isilon_path)
237233

238234
expect(results).to include(deposit_asset.isilon_path)
239-
expect(results).not_to include(other_asset.isilon_path)
235+
expect(results).to include(other_asset.isilon_path)
240236
end
241237

242238
it 'excludes scrc accessions' do

0 commit comments

Comments
 (0)