@@ -17,16 +17,12 @@ def initialize(volume_name: nil)
1717 if @volume_name
1818 @parent_volume = Volume . find_by ( name : @volume_name )
1919 raise ArgumentError , "Volume '#{ @volume_name } ' not found" unless @parent_volume
20- stdout_and_log ( "Starting post-processing for volume: #{ @volume_name } " )
2120 else
2221 @parent_volume = nil # Will process all volumes
23- stdout_and_log ( "Starting post-processing for all volumes" )
2422 end
2523 end
2624
2725 def process
28- stdout_and_log ( "Starting Rule 4 post-processing for TIFF comparison..." )
29-
3026 begin
3127 # Use ActiveRecord to find parent directories with matching TIFF counts
3228 parent_dirs_with_counts = find_parent_dirs_with_matching_tiff_counts_ar
@@ -36,7 +32,13 @@ def process
3632 # Update unprocessed TIFFs in bulk
3733 total_updated = 0
3834 parent_dirs_with_counts . each do |parent_info |
39- updated = mark_unprocessed_tiffs_as_dont_migrate ( parent_info [ :parent_dir ] , parent_info [ :processed_count ] )
35+ updated = mark_unprocessed_tiffs_as_dont_migrate (
36+ parent_info [ :parent_dir ] ,
37+ parent_info [ :child_folder ] ,
38+ parent_info [ :child_key ] ,
39+ parent_info [ :parent_key ] ,
40+ parent_info [ :processed_count ]
41+ )
4042 total_updated += updated
4143 end
4244
@@ -63,43 +65,69 @@ def process
6365
6466 private
6567
68+ ROOT_CHILD_KEY = "__root__" . freeze
69+
6670 def find_parent_dirs_with_matching_tiff_counts_ar
67- # Base query for all TIFF assets in deposit folders (excluding scrc accessions)
6871 base_query = build_base_tiff_query
69-
70- # Get all matching assets and group them in Ruby
7172 assets = base_query . pluck ( :isilon_path )
7273
73- # Group assets by parent directory and classify as processed/unprocessed
74- parent_dir_stats = { }
74+ stats = Hash . new do |hash , parent_key |
75+ hash [ parent_key ] = {
76+ original_parent : nil ,
77+ processed : Hash . new { |child_hash , key | child_hash [ key ] = { count : 0 , name : nil } } ,
78+ unprocessed : Hash . new { |child_hash , key | child_hash [ key ] = { count : 0 , name : nil } }
79+ }
80+ end
7581
7682 assets . each do |path |
77- parent_dir = extract_parent_directory ( path )
78- next unless parent_dir
83+ parent_dir , child_folder , category = extract_parent_child_and_category ( path )
84+ next unless parent_dir && category
85+
86+ parent_key = parent_dir . downcase
87+ record = stats [ parent_key ]
88+ record [ :original_parent ] ||= parent_dir
89+
90+ child_key = ( child_folder || ROOT_CHILD_KEY ) . downcase
91+ bucket = category == :processed ? :processed : :unprocessed
92+ entry = record [ bucket ] [ child_key ]
93+ entry [ :count ] += 1
94+ entry [ :name ] ||= child_folder
95+ end
7996
80- subdirectory_type = classify_subdirectory_type ( path )
81- next unless subdirectory_type
97+ matches = [ ]
8298
83- parent_dir_stats [ parent_dir ] ||= { processed : 0 , unprocessed : 0 }
84- parent_dir_stats [ parent_dir ] [ subdirectory_type . to_sym ] += 1
85- end
99+ stats . each do | parent_key , record |
100+ processed_children = record [ :processed ]
101+ next if processed_children . empty?
86102
87- # Filter to only include directories where processed count equals unprocessed count
88- # and both counts are > 0
89- matching_dirs = parent_dir_stats . select do |parent_dir , counts |
90- counts [ :processed ] > 0 &&
91- counts [ :unprocessed ] > 0 &&
92- counts [ :processed ] == counts [ :unprocessed ]
93- end
103+ processed_children . each do |child_key , processed_entry |
104+ unprocessed_entry = record [ :unprocessed ] [ child_key ]
105+ next unless unprocessed_entry [ :count ] . positive?
94106
95- # Convert to the expected format
96- matching_dirs . map do |parent_dir , counts |
97- {
98- parent_dir : parent_dir ,
99- processed_count : counts [ :processed ] ,
100- unprocessed_count : counts [ :unprocessed ]
101- }
107+ original_parent = record [ :original_parent ] || parent_key
108+ child_display = processed_entry [ :name ] || unprocessed_entry [ :name ]
109+
110+ next unless processed_entry [ :count ] == unprocessed_entry [ :count ]
111+
112+ log_tiff_directory (
113+ original_parent ,
114+ child_display ,
115+ processed_entry [ :count ] ,
116+ unprocessed_entry [ :count ]
117+ )
118+
119+ matches << {
120+ parent_dir : original_parent ,
121+ parent_key : parent_key ,
122+ child_folder : child_display ,
123+ child_key : child_key ,
124+ processed_count : processed_entry [ :count ] ,
125+ unprocessed_count : unprocessed_entry [ :count ]
126+ }
127+ end
102128 end
129+
130+ matches
103131 end
104132
105133 def build_base_tiff_query
@@ -112,9 +140,8 @@ def build_base_tiff_query
112140 "%tiff%" , "%.tiff" , "%.tif"
113141 )
114142
115- # Filter to deposit folders, excluding scrc accessions
116- query = query . where ( "LOWER(isilon_path) LIKE ?" , "%/deposit/%" )
117- query = query . where ( "LOWER(isilon_path) NOT LIKE ?" , "%/deposit/scrc accessions%" )
143+ # Exclude SCRC Accessions regardless of volume prefix stripping
144+ query = query . where ( "LOWER(isilon_path) NOT LIKE ?" , "%/scrc accessions/%" )
118145
119146 # Filter to processed, unprocessed, or raw subdirectories
120147 query = query . where (
@@ -128,55 +155,85 @@ def build_base_tiff_query
128155 query
129156 end
130157
131- def extract_parent_directory ( path )
132- # Extract the parent directory before /processed/, /unprocessed/, or /raw/
133- case path . downcase
134- when /^(.+)\/ processed\/ /
135- $1
136- when /^(.+)\/ unprocessed\/ /
137- $1
138- when /^(.+)\/ raw\/ /
139- $1
140- else
141- nil
142- end
143- end
158+ def extract_parent_child_and_category ( path )
159+ segments = path . split ( "/" )
160+ key_index = segments . index { |segment | %w[ processed unprocessed raw ] . include? ( segment &.downcase ) }
161+ return nil unless key_index
144162
145- def classify_subdirectory_type ( path )
146- case path . downcase
147- when /\/ processed\/ /
148- "processed"
149- when /\/ unprocessed\/ / , /\/ raw\/ /
150- "unprocessed"
163+ parent_segments = segments [ 0 ...key_index ]
164+ parent_dir = parent_segments . join ( "/" )
165+ parent_dir = "/#{ parent_dir } " . gsub ( %r{//+} , "/" )
166+ parent_dir = "/" if parent_dir . blank?
167+ parent_dir = parent_dir . downcase
168+
169+ remainder = segments [ ( key_index + 1 ) ..]
170+ child_folder = if remainder && remainder . length > 1
171+ remainder . first . downcase
151172 else
152- nil
173+ nil
153174 end
175+
176+ category = segments [ key_index ] . downcase == "processed" ? :processed : :unprocessed
177+
178+ [ parent_dir , child_folder , category ]
154179 end
155180
156- def mark_unprocessed_tiffs_as_dont_migrate ( parent_dir , count )
181+ def mark_unprocessed_tiffs_as_dont_migrate ( parent_dir , child_folder , child_key , parent_key , count )
157182 dont_migrate_status = MigrationStatus . find_by ( name : "Don't migrate" )
158183
159184 unless dont_migrate_status
160185 stdout_and_log ( "ERROR: 'Don't migrate' status not found" , level : :error )
161186 return 0
162187 end
163188
164- # Build query for unprocessed TIFFs in this parent directory
189+ patterns =
190+ if child_key == ROOT_CHILD_KEY
191+ [
192+ "#{ parent_key } /unprocessed/%" ,
193+ "#{ parent_key } /raw/%"
194+ ]
195+ else
196+ [
197+ "#{ parent_key } /unprocessed/#{ child_key } /%" ,
198+ "#{ parent_key } /raw/#{ child_key } /%"
199+ ]
200+ end
201+
165202 query = IsilonAsset . joins ( parent_folder : :volume )
166- . where ( "isilon_path LIKE ?" , "#{ parent_dir } /%" )
167- . where ( "(LOWER(isilon_path) LIKE '%/unprocessed/%' OR LOWER(isilon_path) LIKE '%/raw/%')" )
203+ . where (
204+ "(LOWER(isilon_path) LIKE ? OR LOWER(isilon_path) LIKE ?)" ,
205+ *patterns
206+ )
168207 . where ( "(LOWER(file_type) LIKE '%tiff%' OR LOWER(isilon_path) LIKE '%.tiff' OR LOWER(isilon_path) LIKE '%.tif')" )
169208
170209 # Add volume filter if processing specific volume
171210 query = query . where ( parent_folder : { volume : @parent_volume } ) if @parent_volume
172211
173212 updated_count = query . update_all ( migration_status_id : dont_migrate_status . id )
174213
175- stdout_and_log ( "Rule 4: Marked #{ updated_count } unprocessed TIFFs as 'Don't migrate' in #{ parent_dir } (#{ count } processed = #{ count } unprocessed)" )
214+ stdout_and_log (
215+ "Rule 4: Marked #{ updated_count } unprocessed TIFFs as 'Don't migrate' in #{ parent_dir } /(#{ child_folder || 'root' } ) " \
216+ "(#{ count } processed = #{ count } unprocessed)"
217+ )
176218
177219 updated_count
178220 end
179221
222+ def extract_parent_directory ( path )
223+ extract_parent_child_and_category ( path ) &.first
224+ end
225+
226+ def classify_subdirectory_type ( path )
227+ _ , _ , category = extract_parent_child_and_category ( path )
228+ return nil unless category
229+
230+ category == :processed ? "processed" : "unprocessed"
231+ end
232+
233+ def log_tiff_directory ( parent_dir , child_folder , processed_count , unprocessed_count )
234+ # intentionally suppressed verbose candidate logging
235+ end
236+
180237 def stdout_and_log ( message , level : :info )
181238 @log . send ( level , message )
182239 @stdout . send ( level , message )
0 commit comments