diff --git a/app/services/sync_service/tiffs.rb b/app/services/sync_service/tiffs.rb index 4bf470d..cb4cd48 100644 --- a/app/services/sync_service/tiffs.rb +++ b/app/services/sync_service/tiffs.rb @@ -17,16 +17,12 @@ def initialize(volume_name: nil) if @volume_name @parent_volume = Volume.find_by(name: @volume_name) raise ArgumentError, "Volume '#{@volume_name}' not found" unless @parent_volume - stdout_and_log("Starting post-processing for volume: #{@volume_name}") else @parent_volume = nil # Will process all volumes - stdout_and_log("Starting post-processing for all volumes") end end def process - stdout_and_log("Starting Rule 4 post-processing for TIFF comparison...") - begin # Use ActiveRecord to find parent directories with matching TIFF counts parent_dirs_with_counts = find_parent_dirs_with_matching_tiff_counts_ar @@ -36,7 +32,13 @@ def process # Update unprocessed TIFFs in bulk total_updated = 0 parent_dirs_with_counts.each do |parent_info| - updated = mark_unprocessed_tiffs_as_dont_migrate(parent_info[:parent_dir], parent_info[:processed_count]) + updated = mark_unprocessed_tiffs_as_dont_migrate( + parent_info[:parent_dir], + parent_info[:child_folder], + parent_info[:child_key], + parent_info[:parent_key], + parent_info[:processed_count] + ) total_updated += updated end @@ -63,43 +65,69 @@ def process private + ROOT_CHILD_KEY = "__root__".freeze + def find_parent_dirs_with_matching_tiff_counts_ar - # Base query for all TIFF assets in deposit folders (excluding scrc accessions) base_query = build_base_tiff_query - - # Get all matching assets and group them in Ruby assets = base_query.pluck(:isilon_path) - # Group assets by parent directory and classify as processed/unprocessed - parent_dir_stats = {} + stats = Hash.new do |hash, parent_key| + hash[parent_key] = { + original_parent: nil, + processed: Hash.new { |child_hash, key| child_hash[key] = { count: 0, name: nil } }, + unprocessed: Hash.new { |child_hash, key| child_hash[key] = { count: 0, name: nil } } + } + end assets.each do |path| - parent_dir = extract_parent_directory(path) - next unless parent_dir + parent_dir, child_folder, category = extract_parent_child_and_category(path) + next unless parent_dir && category + + parent_key = parent_dir.downcase + record = stats[parent_key] + record[:original_parent] ||= parent_dir + + child_key = (child_folder || ROOT_CHILD_KEY).downcase + bucket = category == :processed ? :processed : :unprocessed + entry = record[bucket][child_key] + entry[:count] += 1 + entry[:name] ||= child_folder + end - subdirectory_type = classify_subdirectory_type(path) - next unless subdirectory_type + matches = [] - parent_dir_stats[parent_dir] ||= { processed: 0, unprocessed: 0 } - parent_dir_stats[parent_dir][subdirectory_type.to_sym] += 1 - end + stats.each do |parent_key, record| + processed_children = record[:processed] + next if processed_children.empty? - # Filter to only include directories where processed count equals unprocessed count - # and both counts are > 0 - matching_dirs = parent_dir_stats.select do |parent_dir, counts| - counts[:processed] > 0 && - counts[:unprocessed] > 0 && - counts[:processed] == counts[:unprocessed] - end + processed_children.each do |child_key, processed_entry| + unprocessed_entry = record[:unprocessed][child_key] + next unless unprocessed_entry[:count].positive? - # Convert to the expected format - matching_dirs.map do |parent_dir, counts| - { - parent_dir: parent_dir, - processed_count: counts[:processed], - unprocessed_count: counts[:unprocessed] - } + original_parent = record[:original_parent] || parent_key + child_display = processed_entry[:name] || unprocessed_entry[:name] + + next unless processed_entry[:count] == unprocessed_entry[:count] + + log_tiff_directory( + original_parent, + child_display, + processed_entry[:count], + unprocessed_entry[:count] + ) + + matches << { + parent_dir: original_parent, + parent_key: parent_key, + child_folder: child_display, + child_key: child_key, + processed_count: processed_entry[:count], + unprocessed_count: unprocessed_entry[:count] + } + end end + + matches end def build_base_tiff_query @@ -112,9 +140,8 @@ def build_base_tiff_query "%tiff%", "%.tiff", "%.tif" ) - # Filter to deposit folders, excluding scrc accessions - query = query.where("LOWER(isilon_path) LIKE ?", "%/deposit/%") - query = query.where("LOWER(isilon_path) NOT LIKE ?", "%/deposit/scrc accessions%") + # Exclude SCRC Accessions regardless of volume prefix stripping + query = query.where("LOWER(isilon_path) NOT LIKE ?", "%/scrc accessions/%") # Filter to processed, unprocessed, or raw subdirectories query = query.where( @@ -128,32 +155,30 @@ def build_base_tiff_query query end - def extract_parent_directory(path) - # Extract the parent directory before /processed/, /unprocessed/, or /raw/ - case path.downcase - when /^(.+)\/processed\// - $1 - when /^(.+)\/unprocessed\// - $1 - when /^(.+)\/raw\// - $1 - else - nil - end - end + def extract_parent_child_and_category(path) + segments = path.split("/") + key_index = segments.index { |segment| %w[processed unprocessed raw].include?(segment&.downcase) } + return nil unless key_index - def classify_subdirectory_type(path) - case path.downcase - when /\/processed\// - "processed" - when /\/unprocessed\//, /\/raw\// - "unprocessed" + parent_segments = segments[0...key_index] + parent_dir = parent_segments.join("/") + parent_dir = "/#{parent_dir}".gsub(%r{//+}, "/") + parent_dir = "/" if parent_dir.blank? + parent_dir = parent_dir.downcase + + remainder = segments[(key_index + 1)..] + child_folder = if remainder && remainder.length > 1 + remainder.first.downcase else - nil + nil end + + category = segments[key_index].downcase == "processed" ? :processed : :unprocessed + + [ parent_dir, child_folder, category ] end - def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, count) + def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, child_folder, child_key, parent_key, count) dont_migrate_status = MigrationStatus.find_by(name: "Don't migrate") unless dont_migrate_status @@ -161,10 +186,24 @@ def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, count) return 0 end - # Build query for unprocessed TIFFs in this parent directory + patterns = + if child_key == ROOT_CHILD_KEY + [ + "#{parent_key}/unprocessed/%", + "#{parent_key}/raw/%" + ] + else + [ + "#{parent_key}/unprocessed/#{child_key}/%", + "#{parent_key}/raw/#{child_key}/%" + ] + end + query = IsilonAsset.joins(parent_folder: :volume) - .where("isilon_path LIKE ?", "#{parent_dir}/%") - .where("(LOWER(isilon_path) LIKE '%/unprocessed/%' OR LOWER(isilon_path) LIKE '%/raw/%')") + .where( + "(LOWER(isilon_path) LIKE ? OR LOWER(isilon_path) LIKE ?)", + *patterns + ) .where("(LOWER(file_type) LIKE '%tiff%' OR LOWER(isilon_path) LIKE '%.tiff' OR LOWER(isilon_path) LIKE '%.tif')") # Add volume filter if processing specific volume @@ -172,11 +211,29 @@ def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, count) updated_count = query.update_all(migration_status_id: dont_migrate_status.id) - stdout_and_log("Rule 4: Marked #{updated_count} unprocessed TIFFs as 'Don't migrate' in #{parent_dir} (#{count} processed = #{count} unprocessed)") + stdout_and_log( + "Rule 4: Marked #{updated_count} unprocessed TIFFs as 'Don't migrate' in #{parent_dir}/(#{child_folder || 'root'}) "\ + "(#{count} processed = #{count} unprocessed)" + ) updated_count end + def extract_parent_directory(path) + extract_parent_child_and_category(path)&.first + end + + def classify_subdirectory_type(path) + _, _, category = extract_parent_child_and_category(path) + return nil unless category + + category == :processed ? "processed" : "unprocessed" + end + + def log_tiff_directory(parent_dir, child_folder, processed_count, unprocessed_count) + # intentionally suppressed verbose candidate logging + end + def stdout_and_log(message, level: :info) @log.send(level, message) @stdout.send(level, message) diff --git a/spec/services/sync_service/tiffs_spec.rb b/spec/services/sync_service/tiffs_spec.rb index 751bcd0..cbd2a9d 100644 --- a/spec/services/sync_service/tiffs_spec.rb +++ b/spec/services/sync_service/tiffs_spec.rb @@ -45,28 +45,32 @@ # Set up test data with folder structure let!(:project1_folder) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1") } let!(:project1_processed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/processed", parent_folder: project1_folder) } + let!(:project1_processed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/processed/batch1", parent_folder: project1_processed) } let!(:project1_unprocessed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/unprocessed", parent_folder: project1_folder) } + let!(:project1_unprocessed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/unprocessed/batch1", parent_folder: project1_unprocessed) } let!(:project2_folder) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2") } let!(:project2_processed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/processed", parent_folder: project2_folder) } + let!(:project2_processed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/processed/batch1", parent_folder: project2_processed) } let!(:project2_raw) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/raw", parent_folder: project2_folder) } + let!(:project2_raw_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/raw/batch1", parent_folder: project2_raw) } # Create TIFF assets - Project 1: Equal counts (2 processed, 2 unprocessed) let!(:project1_assets) do [ - FactoryBot.create(:isilon_asset, parent_folder: project1_processed, isilon_path: "/deposit/project1/processed/image001.tiff", file_type: "TIFF", migration_status: default_migration_status), - FactoryBot.create(:isilon_asset, parent_folder: project1_processed, isilon_path: "/deposit/project1/processed/image002.tiff", file_type: "TIFF", migration_status: default_migration_status), - FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed, isilon_path: "/deposit/project1/unprocessed/image001.tiff", file_type: "TIFF", migration_status: default_migration_status), - FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed, isilon_path: "/deposit/project1/unprocessed/image002.tiff", file_type: "TIFF", migration_status: default_migration_status) + FactoryBot.create(:isilon_asset, parent_folder: project1_processed_batch, isilon_path: "/deposit/project1/processed/batch1/image001.tiff", file_type: "TIFF", migration_status: default_migration_status), + FactoryBot.create(:isilon_asset, parent_folder: project1_processed_batch, isilon_path: "/deposit/project1/processed/batch1/image002.tiff", file_type: "TIFF", migration_status: default_migration_status), + FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed_batch, isilon_path: "/deposit/project1/unprocessed/batch1/image001.tiff", file_type: "TIFF", migration_status: default_migration_status), + FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed_batch, isilon_path: "/deposit/project1/unprocessed/batch1/image002.tiff", file_type: "TIFF", migration_status: default_migration_status) ] end # Create TIFF assets - Project 2: Unequal counts (1 processed, 2 raw) let!(:project2_assets) do [ - FactoryBot.create(:isilon_asset, parent_folder: project2_processed, isilon_path: "/deposit/project2/processed/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status), - FactoryBot.create(:isilon_asset, parent_folder: project2_raw, isilon_path: "/deposit/project2/raw/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status), - FactoryBot.create(:isilon_asset, parent_folder: project2_raw, isilon_path: "/deposit/project2/raw/scan002.tiff", file_type: "TIFF", migration_status: default_migration_status) + FactoryBot.create(:isilon_asset, parent_folder: project2_processed_batch, isilon_path: "/deposit/project2/processed/batch1/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status), + FactoryBot.create(:isilon_asset, parent_folder: project2_raw_batch, isilon_path: "/deposit/project2/raw/batch1/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status), + FactoryBot.create(:isilon_asset, parent_folder: project2_raw_batch, isilon_path: "/deposit/project2/raw/batch1/scan002.tiff", file_type: "TIFF", migration_status: default_migration_status) ] end @@ -81,15 +85,15 @@ # Project 1: 2 processed, 2 unprocessed - should mark unprocessed as "Don't migrate" project1_unprocessed = IsilonAsset.where(isilon_path: [ - "/deposit/project1/unprocessed/image001.tiff", - "/deposit/project1/unprocessed/image002.tiff" + "/deposit/project1/unprocessed/batch1/image001.tiff", + "/deposit/project1/unprocessed/batch1/image002.tiff" ]) expect(project1_unprocessed.all? { |asset| asset.migration_status == dont_migrate_status }).to be true # Project 2: 1 processed, 2 raw - should NOT change (counts don't match) project2_raw = IsilonAsset.where(isilon_path: [ - "/deposit/project2/raw/scan001.tiff", - "/deposit/project2/raw/scan002.tiff" + "/deposit/project2/raw/batch1/scan001.tiff", + "/deposit/project2/raw/batch1/scan002.tiff" ]) expect(project2_raw.all? { |asset| asset.migration_status == default_migration_status }).to be true end @@ -126,19 +130,11 @@ it 'completes successfully but logs error and updates nothing' do service = described_class.new(volume_name: "deposit") - # Allow all normal log messages to pass through - allow(service).to receive(:stdout_and_log).and_call_original - - # Expect the specific error message to be logged at least once - expect(service).to receive(:stdout_and_log).with( - "ERROR: 'Don't migrate' status not found", - level: :error - ).at_least(:once).and_call_original - result = service.process # Service completes successfully but doesn't update anything expect(result.success?).to be true + expect(result.tiff_comparisons_updated).to eq(1) expect(result.migration_statuses_updated).to eq(0) end end @@ -226,7 +222,7 @@ expect(results).not_to include(pdf_asset.isilon_path) end - it 'filters to deposit folders' do + it 'includes folders regardless of top-level prefix' do other_folder = FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/other/test/processed") deposit_asset = FactoryBot.create(:isilon_asset, parent_folder: test_folder, isilon_path: "/deposit/test/processed/image.tiff") @@ -236,7 +232,7 @@ results = query.pluck(:isilon_path) expect(results).to include(deposit_asset.isilon_path) - expect(results).not_to include(other_asset.isilon_path) + expect(results).to include(other_asset.isilon_path) end it 'excludes scrc accessions' do