Skip to content

Commit 5741314

Browse files
committed
bin/fixity_check_replicated_moabs.rb -- tighten up arg handling and querying for single- and multi-part druid sampling
1 parent 6bf7686 commit 5741314

File tree

1 file changed

+14
-19
lines changed

1 file changed

+14
-19
lines changed

bin/fixity_check_replicated_moabs.rb

Lines changed: 14 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -21,11 +21,6 @@
2121

2222
include ActionView::Helpers::NumberHelper
2323

24-
# somewhat duplicative of DruidVersionZip.ZIP_SPLIT_SIZE = '10g', but that's
25-
# zip format, and this is just bytes as an int, which is what the query wants
26-
ZIP_SEGMENT_THRESHOLD_GB = 10
27-
ZIP_SEGMENT_THRESHOLD = ZIP_SEGMENT_THRESHOLD_GB.gigabytes
28-
2924
options = {
3025
druid_list: '',
3126
druid_list_file: nil,
@@ -46,9 +41,9 @@
4641
'file with a list of provided druids, e.g. from integration tests, manual tests, your own queries, etc')
4742
option_parser.on('--fixity_check_base_location FIXITY_CHECK_BASE_LOCATION',
4843
'target directory for downloading cloud archived Moabs, where they will be inflated and fixity checked. ensure sufficient free space.')
49-
option_parser.on('--single_part_druid_sample_count SINGLE_PART_DRUID_SAMPLE_COUNT',
44+
option_parser.on('--single_part_druid_sample_count SINGLE_PART_DRUID_SAMPLE_COUNT', Integer,
5045
'number of < 10 GB Moabs to query for and retrieve (default: 0)')
51-
option_parser.on('--multipart_druid_sample_count MULTIPART_DRUID_SAMPLE_COUNT',
46+
option_parser.on('--multipart_druid_sample_count MULTIPART_DRUID_SAMPLE_COUNT', Integer,
5247
'number of > 10 GB Moabs to query for and retrieve (default: 0)')
5348
option_parser.on('--endpoints_to_audit ENDPOINTS_TO_AUDIT',
5449
'list of cloud endpoints to audit (comma-separated, no spaces, names from config)')
@@ -89,44 +84,44 @@
8984
if options[:single_part_druid_sample_count].positive?
9085
po_list =
9186
PreservedObject.joins(
92-
zipped_moab_versions: [:zip_parts, :zip_endpoint]
87+
zipped_moab_versions: [:zip_parts]
9388
).group(
94-
'preserved_objects.druid', 'zip_endpoint.endpoint_name'
89+
'preserved_objects.id'
9590
).having(
96-
'SUM(zip_parts.size) < :max_size',
97-
{ max_size: ZIP_SEGMENT_THRESHOLD } # we segment zips into 10 GB chunks
91+
# look for druids with nothing but single part zips, one part per replicated version
92+
'COUNT(DISTINCT(zip_parts.id)) = COUNT(DISTINCT(zipped_moab_versions.id))'
9893
).order(
9994
'RANDOM()'
10095
).limit(
10196
options[:single_part_druid_sample_count]
10297
).pluck(
103-
:druid, 'COUNT(zipped_moab_versions.id)', 'zip_endpoint.endpoint_name', 'COUNT(zip_parts.id)', Arel.sql('ARRAY_AGG((version, suffix))'), 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)'
98+
:druid, :current_version, 'COUNT(DISTINCT(zipped_moab_versions.id))', 'COUNT(DISTINCT(zip_parts.id))', 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)'
10499
)
105100

106101
total_size = number_to_human_size(po_list.map { |row| row.last }.sum)
107-
logger.info("sub #{ZIP_SEGMENT_THRESHOLD} GB preserved_objects results (#{total_size} total): #{po_list}")
102+
logger.info("query results: preserved_objects with only single-part zips: (#{total_size} total): #{po_list}")
108103
druids += po_list.map { |row| row.first }.uniq
109104
end
110105

111106
if options[:multipart_druid_sample_count].positive?
112107
multipart_zip_po_list =
113108
PreservedObject.joins(
114-
zipped_moab_versions: [:zip_parts, :zip_endpoint]
109+
zipped_moab_versions: [:zip_parts]
115110
).group(
116-
'preserved_objects.druid', :version, 'zip_endpoint.endpoint_name'
111+
'preserved_objects.id'
117112
).having(
118-
'SUM(zip_parts.size) > :min_size',
119-
{ min_size: ZIP_SEGMENT_THRESHOLD } # we segment zips into 10 GB chunks
113+
# look for druids with at least one multi-part zip, i.e. more parts than replicated versions
114+
'COUNT(DISTINCT(zip_parts.id)) > COUNT(DISTINCT(zipped_moab_versions.id))'
120115
).order(
121116
'RANDOM()'
122117
).limit(
123118
options[:multipart_druid_sample_count]
124119
).pluck(
125-
:druid, :version, 'COUNT(zipped_moab_versions.id)', 'zip_endpoint.endpoint_name', 'COUNT(zip_parts.id)', 'ARRAY_AGG(suffix)', 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)'
120+
:druid, :current_version, 'COUNT(DISTINCT(zipped_moab_versions.id))', 'COUNT(DISTINCT(zip_parts.id))', 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)'
126121
)
127122

128123
total_size = number_to_human_size(multipart_zip_po_list.map { |row| row.last }.sum)
129-
logger.info("over #{ZIP_SEGMENT_THRESHOLD} GB preserved_objects results (#{total_size} total): #{multipart_zip_po_list}")
124+
logger.info("query results: preserved_objects with at least one multi-part zip: (#{total_size} total): #{multipart_zip_po_list}")
130125
druids += multipart_zip_po_list.map { |row| row.first }.uniq
131126
end
132127

0 commit comments

Comments
 (0)