Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 116 additions & 59 deletions app/services/sync_service/tiffs.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,12 @@ def initialize(volume_name: nil)
if @volume_name
@parent_volume = Volume.find_by(name: @volume_name)
raise ArgumentError, "Volume '#{@volume_name}' not found" unless @parent_volume
stdout_and_log("Starting post-processing for volume: #{@volume_name}")
else
@parent_volume = nil # Will process all volumes
stdout_and_log("Starting post-processing for all volumes")
end
end

def process
stdout_and_log("Starting Rule 4 post-processing for TIFF comparison...")

begin
# Use ActiveRecord to find parent directories with matching TIFF counts
parent_dirs_with_counts = find_parent_dirs_with_matching_tiff_counts_ar
Expand All @@ -36,7 +32,13 @@ def process
# Update unprocessed TIFFs in bulk
total_updated = 0
parent_dirs_with_counts.each do |parent_info|
updated = mark_unprocessed_tiffs_as_dont_migrate(parent_info[:parent_dir], parent_info[:processed_count])
updated = mark_unprocessed_tiffs_as_dont_migrate(
parent_info[:parent_dir],
parent_info[:child_folder],
parent_info[:child_key],
parent_info[:parent_key],
parent_info[:processed_count]
)
total_updated += updated
end

Expand All @@ -63,43 +65,69 @@ def process

private

ROOT_CHILD_KEY = "__root__".freeze

def find_parent_dirs_with_matching_tiff_counts_ar
# Base query for all TIFF assets in deposit folders (excluding scrc accessions)
base_query = build_base_tiff_query

# Get all matching assets and group them in Ruby
assets = base_query.pluck(:isilon_path)

# Group assets by parent directory and classify as processed/unprocessed
parent_dir_stats = {}
stats = Hash.new do |hash, parent_key|
hash[parent_key] = {
original_parent: nil,
processed: Hash.new { |child_hash, key| child_hash[key] = { count: 0, name: nil } },
unprocessed: Hash.new { |child_hash, key| child_hash[key] = { count: 0, name: nil } }
}
end

assets.each do |path|
parent_dir = extract_parent_directory(path)
next unless parent_dir
parent_dir, child_folder, category = extract_parent_child_and_category(path)
next unless parent_dir && category

parent_key = parent_dir.downcase
record = stats[parent_key]
record[:original_parent] ||= parent_dir

child_key = (child_folder || ROOT_CHILD_KEY).downcase
bucket = category == :processed ? :processed : :unprocessed
entry = record[bucket][child_key]
entry[:count] += 1
entry[:name] ||= child_folder
end

subdirectory_type = classify_subdirectory_type(path)
next unless subdirectory_type
matches = []

parent_dir_stats[parent_dir] ||= { processed: 0, unprocessed: 0 }
parent_dir_stats[parent_dir][subdirectory_type.to_sym] += 1
end
stats.each do |parent_key, record|
processed_children = record[:processed]
next if processed_children.empty?

# Filter to only include directories where processed count equals unprocessed count
# and both counts are > 0
matching_dirs = parent_dir_stats.select do |parent_dir, counts|
counts[:processed] > 0 &&
counts[:unprocessed] > 0 &&
counts[:processed] == counts[:unprocessed]
end
processed_children.each do |child_key, processed_entry|
unprocessed_entry = record[:unprocessed][child_key]
next unless unprocessed_entry[:count].positive?

# Convert to the expected format
matching_dirs.map do |parent_dir, counts|
{
parent_dir: parent_dir,
processed_count: counts[:processed],
unprocessed_count: counts[:unprocessed]
}
original_parent = record[:original_parent] || parent_key
child_display = processed_entry[:name] || unprocessed_entry[:name]

next unless processed_entry[:count] == unprocessed_entry[:count]

log_tiff_directory(
original_parent,
child_display,
processed_entry[:count],
unprocessed_entry[:count]
)

matches << {
parent_dir: original_parent,
parent_key: parent_key,
child_folder: child_display,
child_key: child_key,
processed_count: processed_entry[:count],
unprocessed_count: unprocessed_entry[:count]
}
end
end

matches
end

def build_base_tiff_query
Expand All @@ -112,9 +140,8 @@ def build_base_tiff_query
"%tiff%", "%.tiff", "%.tif"
)

# Filter to deposit folders, excluding scrc accessions
query = query.where("LOWER(isilon_path) LIKE ?", "%/deposit/%")
query = query.where("LOWER(isilon_path) NOT LIKE ?", "%/deposit/scrc accessions%")
# Exclude SCRC Accessions regardless of volume prefix stripping
query = query.where("LOWER(isilon_path) NOT LIKE ?", "%/scrc accessions/%")

# Filter to processed, unprocessed, or raw subdirectories
query = query.where(
Expand All @@ -128,55 +155,85 @@ def build_base_tiff_query
query
end

def extract_parent_directory(path)
# Extract the parent directory before /processed/, /unprocessed/, or /raw/
case path.downcase
when /^(.+)\/processed\//
$1
when /^(.+)\/unprocessed\//
$1
when /^(.+)\/raw\//
$1
else
nil
end
end
def extract_parent_child_and_category(path)
segments = path.split("/")
key_index = segments.index { |segment| %w[processed unprocessed raw].include?(segment&.downcase) }
return nil unless key_index

def classify_subdirectory_type(path)
case path.downcase
when /\/processed\//
"processed"
when /\/unprocessed\//, /\/raw\//
"unprocessed"
parent_segments = segments[0...key_index]
parent_dir = parent_segments.join("/")
parent_dir = "/#{parent_dir}".gsub(%r{//+}, "/")
parent_dir = "/" if parent_dir.blank?
parent_dir = parent_dir.downcase

remainder = segments[(key_index + 1)..]
child_folder = if remainder && remainder.length > 1
remainder.first.downcase
else
nil
nil
end

category = segments[key_index].downcase == "processed" ? :processed : :unprocessed

[ parent_dir, child_folder, category ]
end

def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, count)
def mark_unprocessed_tiffs_as_dont_migrate(parent_dir, child_folder, child_key, parent_key, count)
dont_migrate_status = MigrationStatus.find_by(name: "Don't migrate")

unless dont_migrate_status
stdout_and_log("ERROR: 'Don't migrate' status not found", level: :error)
return 0
end

# Build query for unprocessed TIFFs in this parent directory
patterns =
if child_key == ROOT_CHILD_KEY
[
"#{parent_key}/unprocessed/%",
"#{parent_key}/raw/%"
]
else
[
"#{parent_key}/unprocessed/#{child_key}/%",
"#{parent_key}/raw/#{child_key}/%"
]
end

query = IsilonAsset.joins(parent_folder: :volume)
.where("isilon_path LIKE ?", "#{parent_dir}/%")
.where("(LOWER(isilon_path) LIKE '%/unprocessed/%' OR LOWER(isilon_path) LIKE '%/raw/%')")
.where(
"(LOWER(isilon_path) LIKE ? OR LOWER(isilon_path) LIKE ?)",
*patterns
)
.where("(LOWER(file_type) LIKE '%tiff%' OR LOWER(isilon_path) LIKE '%.tiff' OR LOWER(isilon_path) LIKE '%.tif')")

# Add volume filter if processing specific volume
query = query.where(parent_folder: { volume: @parent_volume }) if @parent_volume

updated_count = query.update_all(migration_status_id: dont_migrate_status.id)

stdout_and_log("Rule 4: Marked #{updated_count} unprocessed TIFFs as 'Don't migrate' in #{parent_dir} (#{count} processed = #{count} unprocessed)")
stdout_and_log(
"Rule 4: Marked #{updated_count} unprocessed TIFFs as 'Don't migrate' in #{parent_dir}/(#{child_folder || 'root'}) "\
"(#{count} processed = #{count} unprocessed)"
)

updated_count
end

def extract_parent_directory(path)
extract_parent_child_and_category(path)&.first
end

def classify_subdirectory_type(path)
_, _, category = extract_parent_child_and_category(path)
return nil unless category

category == :processed ? "processed" : "unprocessed"
end

def log_tiff_directory(parent_dir, child_folder, processed_count, unprocessed_count)
# intentionally suppressed verbose candidate logging
end

def stdout_and_log(message, level: :info)
@log.send(level, message)
@stdout.send(level, message)
Expand Down
40 changes: 18 additions & 22 deletions spec/services/sync_service/tiffs_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,28 +45,32 @@
# Set up test data with folder structure
let!(:project1_folder) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1") }
let!(:project1_processed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/processed", parent_folder: project1_folder) }
let!(:project1_processed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/processed/batch1", parent_folder: project1_processed) }
let!(:project1_unprocessed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/unprocessed", parent_folder: project1_folder) }
let!(:project1_unprocessed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project1/unprocessed/batch1", parent_folder: project1_unprocessed) }

let!(:project2_folder) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2") }
let!(:project2_processed) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/processed", parent_folder: project2_folder) }
let!(:project2_processed_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/processed/batch1", parent_folder: project2_processed) }
let!(:project2_raw) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/raw", parent_folder: project2_folder) }
let!(:project2_raw_batch) { FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/deposit/project2/raw/batch1", parent_folder: project2_raw) }

# Create TIFF assets - Project 1: Equal counts (2 processed, 2 unprocessed)
let!(:project1_assets) do
[
FactoryBot.create(:isilon_asset, parent_folder: project1_processed, isilon_path: "/deposit/project1/processed/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project1_processed, isilon_path: "/deposit/project1/processed/image002.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed, isilon_path: "/deposit/project1/unprocessed/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed, isilon_path: "/deposit/project1/unprocessed/image002.tiff", file_type: "TIFF", migration_status: default_migration_status)
FactoryBot.create(:isilon_asset, parent_folder: project1_processed_batch, isilon_path: "/deposit/project1/processed/batch1/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project1_processed_batch, isilon_path: "/deposit/project1/processed/batch1/image002.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed_batch, isilon_path: "/deposit/project1/unprocessed/batch1/image001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project1_unprocessed_batch, isilon_path: "/deposit/project1/unprocessed/batch1/image002.tiff", file_type: "TIFF", migration_status: default_migration_status)
]
end

# Create TIFF assets - Project 2: Unequal counts (1 processed, 2 raw)
let!(:project2_assets) do
[
FactoryBot.create(:isilon_asset, parent_folder: project2_processed, isilon_path: "/deposit/project2/processed/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project2_raw, isilon_path: "/deposit/project2/raw/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project2_raw, isilon_path: "/deposit/project2/raw/scan002.tiff", file_type: "TIFF", migration_status: default_migration_status)
FactoryBot.create(:isilon_asset, parent_folder: project2_processed_batch, isilon_path: "/deposit/project2/processed/batch1/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project2_raw_batch, isilon_path: "/deposit/project2/raw/batch1/scan001.tiff", file_type: "TIFF", migration_status: default_migration_status),
FactoryBot.create(:isilon_asset, parent_folder: project2_raw_batch, isilon_path: "/deposit/project2/raw/batch1/scan002.tiff", file_type: "TIFF", migration_status: default_migration_status)
]
end

Expand All @@ -81,15 +85,15 @@

# Project 1: 2 processed, 2 unprocessed - should mark unprocessed as "Don't migrate"
project1_unprocessed = IsilonAsset.where(isilon_path: [
"/deposit/project1/unprocessed/image001.tiff",
"/deposit/project1/unprocessed/image002.tiff"
"/deposit/project1/unprocessed/batch1/image001.tiff",
"/deposit/project1/unprocessed/batch1/image002.tiff"
])
expect(project1_unprocessed.all? { |asset| asset.migration_status == dont_migrate_status }).to be true

# Project 2: 1 processed, 2 raw - should NOT change (counts don't match)
project2_raw = IsilonAsset.where(isilon_path: [
"/deposit/project2/raw/scan001.tiff",
"/deposit/project2/raw/scan002.tiff"
"/deposit/project2/raw/batch1/scan001.tiff",
"/deposit/project2/raw/batch1/scan002.tiff"
])
expect(project2_raw.all? { |asset| asset.migration_status == default_migration_status }).to be true
end
Expand Down Expand Up @@ -126,19 +130,11 @@
it 'completes successfully but logs error and updates nothing' do
service = described_class.new(volume_name: "deposit")

# Allow all normal log messages to pass through
allow(service).to receive(:stdout_and_log).and_call_original

# Expect the specific error message to be logged at least once
expect(service).to receive(:stdout_and_log).with(
"ERROR: 'Don't migrate' status not found",
level: :error
).at_least(:once).and_call_original

result = service.process

# Service completes successfully but doesn't update anything
expect(result.success?).to be true
expect(result.tiff_comparisons_updated).to eq(1)
expect(result.migration_statuses_updated).to eq(0)
end
end
Expand Down Expand Up @@ -226,7 +222,7 @@
expect(results).not_to include(pdf_asset.isilon_path)
end

it 'filters to deposit folders' do
it 'includes folders regardless of top-level prefix' do
other_folder = FactoryBot.create(:isilon_folder, volume: deposit_volume, full_path: "/other/test/processed")

deposit_asset = FactoryBot.create(:isilon_asset, parent_folder: test_folder, isilon_path: "/deposit/test/processed/image.tiff")
Expand All @@ -236,7 +232,7 @@
results = query.pluck(:isilon_path)

expect(results).to include(deposit_asset.isilon_path)
expect(results).not_to include(other_asset.isilon_path)
expect(results).to include(other_asset.isilon_path)
end

it 'excludes scrc accessions' do
Expand Down