|
21 | 21 |
|
22 | 22 | include ActionView::Helpers::NumberHelper |
23 | 23 |
|
24 | | -# somewhat duplicative of DruidVersionZip.ZIP_SPLIT_SIZE = '10g', but that's |
25 | | -# zip format, and this is just bytes as an int, which is what the query wants |
26 | | -ZIP_SEGMENT_THRESHOLD_GB = 10 |
27 | | -ZIP_SEGMENT_THRESHOLD = ZIP_SEGMENT_THRESHOLD_GB.gigabytes |
28 | | - |
29 | 24 | options = { |
30 | 25 | druid_list: '', |
31 | 26 | druid_list_file: nil, |
|
46 | 41 | 'file with a list of provided druids, e.g. from integration tests, manual tests, your own queries, etc') |
47 | 42 | option_parser.on('--fixity_check_base_location FIXITY_CHECK_BASE_LOCATION', |
48 | 43 | 'target directory for downloading cloud archived Moabs, where they will be inflated and fixity checked. ensure sufficient free space.') |
49 | | - option_parser.on('--single_part_druid_sample_count SINGLE_PART_DRUID_SAMPLE_COUNT', |
| 44 | + option_parser.on('--single_part_druid_sample_count SINGLE_PART_DRUID_SAMPLE_COUNT', Integer, |
50 | 45 | 'number of < 10 GB Moabs to query for and retrieve (default: 0)') |
51 | | - option_parser.on('--multipart_druid_sample_count MULTIPART_DRUID_SAMPLE_COUNT', |
| 46 | + option_parser.on('--multipart_druid_sample_count MULTIPART_DRUID_SAMPLE_COUNT', Integer, |
52 | 47 | 'number of > 10 GB Moabs to query for and retrieve (default: 0)') |
53 | 48 | option_parser.on('--endpoints_to_audit ENDPOINTS_TO_AUDIT', |
54 | 49 | 'list of cloud endpoints to audit (comma-separated, no spaces, names from config)') |
|
89 | 84 | if options[:single_part_druid_sample_count].positive? |
90 | 85 | po_list = |
91 | 86 | PreservedObject.joins( |
92 | | - zipped_moab_versions: [:zip_parts, :zip_endpoint] |
| 87 | + zipped_moab_versions: [:zip_parts] |
93 | 88 | ).group( |
94 | | - 'preserved_objects.druid', 'zip_endpoint.endpoint_name' |
| 89 | + 'preserved_objects.id' |
95 | 90 | ).having( |
96 | | - 'SUM(zip_parts.size) < :max_size', |
97 | | - { max_size: ZIP_SEGMENT_THRESHOLD } # we segment zips into 10 GB chunks |
| 91 | + # look for druids with nothing but single part zips, one part per replicated version |
| 92 | + 'COUNT(DISTINCT(zip_parts.id)) = COUNT(DISTINCT(zipped_moab_versions.id))' |
98 | 93 | ).order( |
99 | 94 | 'RANDOM()' |
100 | 95 | ).limit( |
101 | 96 | options[:single_part_druid_sample_count] |
102 | 97 | ).pluck( |
103 | | - :druid, 'COUNT(zipped_moab_versions.id)', 'zip_endpoint.endpoint_name', 'COUNT(zip_parts.id)', Arel.sql('ARRAY_AGG((version, suffix))'), 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)' |
| 98 | + :druid, :current_version, 'COUNT(DISTINCT(zipped_moab_versions.id))', 'COUNT(DISTINCT(zip_parts.id))', 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)' |
104 | 99 | ) |
105 | 100 |
|
106 | 101 | total_size = number_to_human_size(po_list.map { |row| row.last }.sum) |
107 | | - logger.info("sub #{ZIP_SEGMENT_THRESHOLD} GB preserved_objects results (#{total_size} total): #{po_list}") |
| 102 | + logger.info("query results: preserved_objects with only single-part zips: (#{total_size} total): #{po_list}") |
108 | 103 | druids += po_list.map { |row| row.first }.uniq |
109 | 104 | end |
110 | 105 |
|
111 | 106 | if options[:multipart_druid_sample_count].positive? |
112 | 107 | multipart_zip_po_list = |
113 | 108 | PreservedObject.joins( |
114 | | - zipped_moab_versions: [:zip_parts, :zip_endpoint] |
| 109 | + zipped_moab_versions: [:zip_parts] |
115 | 110 | ).group( |
116 | | - 'preserved_objects.druid', :version, 'zip_endpoint.endpoint_name' |
| 111 | + 'preserved_objects.id' |
117 | 112 | ).having( |
118 | | - 'SUM(zip_parts.size) > :min_size', |
119 | | - { min_size: ZIP_SEGMENT_THRESHOLD } # we segment zips into 10 GB chunks |
| 113 | + # look for druids with at least one multi-part zip, i.e. more parts than replicated versions |
| 114 | + 'COUNT(DISTINCT(zip_parts.id)) > COUNT(DISTINCT(zipped_moab_versions.id))' |
120 | 115 | ).order( |
121 | 116 | 'RANDOM()' |
122 | 117 | ).limit( |
123 | 118 | options[:multipart_druid_sample_count] |
124 | 119 | ).pluck( |
125 | | - :druid, :version, 'COUNT(zipped_moab_versions.id)', 'zip_endpoint.endpoint_name', 'COUNT(zip_parts.id)', 'ARRAY_AGG(suffix)', 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)' |
| 120 | + :druid, :current_version, 'COUNT(DISTINCT(zipped_moab_versions.id))', 'COUNT(DISTINCT(zip_parts.id))', 'PG_SIZE_PRETTY(SUM(zip_parts.size))', 'SUM(zip_parts.size)' |
126 | 121 | ) |
127 | 122 |
|
128 | 123 | total_size = number_to_human_size(multipart_zip_po_list.map { |row| row.last }.sum) |
129 | | - logger.info("over #{ZIP_SEGMENT_THRESHOLD} GB preserved_objects results (#{total_size} total): #{multipart_zip_po_list}") |
| 124 | + logger.info("query results: preserved_objects with at least one multi-part zip: (#{total_size} total): #{multipart_zip_po_list}") |
130 | 125 | druids += multipart_zip_po_list.map { |row| row.first }.uniq |
131 | 126 | end |
132 | 127 |
|
|
0 commit comments