@@ -9,10 +9,6 @@ module Replication
99 # See comment on part_paths method re: individual part suffixes.
1010 # Just a regular model, not an ActiveRecord-backed model
1111 class DruidVersionZip
12- attr_reader :druid , :version , :storage_location
13-
14- delegate :base64digest , :hexdigest , to : :md5
15-
1612 # the size used with "zip -s" to break up the zip into parts
1713 ZIP_SPLIT_SIZE = '10g'
1814
@@ -26,34 +22,14 @@ def initialize(druid, version, storage_location = nil)
2622 @storage_location = storage_location
2723 end
2824
29- # @return [String] Filename/key without extension, in common to all this object's zip parts
30- def base_key
31- @base_key ||= s3_key . sub ( /.zip\z / , '' )
32- end
33-
34- # Checks to see whether a zip file already exists for this druid-version. If it does, just touch the
35- # file to refresh atime and mtime, so the zip cache cleaning cron job doesn't see it as stale. If it doesn't,
36- # create it.
37- # @raise [StandardError] if there's a zip file for this druid-version, but it looks too small to be complete.
38- def find_or_create_zip!
39- if exist?
40- raise "zip already exists, but size (#{ total_part_size } ) is smaller than the moab version size (#{ moab_version_size } )!" unless zip_size_ok?
41- FileUtils . touch ( file_path )
42- else
43- create_zip!
44- end
45- end
46-
47- def exist?
48- File . exist? ( file_path )
49- end
50-
# Reports whether the local zip parts and their md5 sidecar files agree.
#
# Three conditions are checked in order, stopping at the first failure:
# 1. at least one zip part file exists,
# 2. the set of part keys equals the set of keys derived from the md5 sidecars,
# 3. every part's md5 sidecar matches its zip part file.
# @return [Boolean] true only when all three conditions hold
def complete?
  !part_paths.empty? &&
    part_keys.to_set == part_keys_from_md5_sidecars.to_set &&
    druid_version_zip_parts.all?(&:md5_match?)
end
@@ -67,6 +43,7 @@ def create_zip!
6743 ensure_zip_directory!
6844 combined , status = Open3 . capture2e ( zip_command , chdir : work_dir . to_s )
6945 raise "zipmaker failure #{ combined } " unless status . success?
46+
7047 unless zip_size_ok?
7148 raise "zip size (#{ total_part_size } ) is smaller than the moab version size (#{ moab_version_size } )! zipmaker failure #{ combined } "
7249 end
@@ -79,21 +56,6 @@ def create_zip!
7956 raise
8057 end
8158
82- # Ensure the directory the zip will live in exists
83- # @return [Pathname] the existing or created directory
84- def ensure_zip_directory!
85- Pathname . new ( file_path ) . tap { |pn | pn . dirname . mkpath }
86- end
87-
88- # @param [Integer] count Number of parts
89- # @return [Array<String>] Ordered pathnames expected to correspond to a zip broken into a given number of parts
90- def expected_part_keys ( count = 1 )
91- raise ArgumentError , "count (#{ count } ) must be >= 1" if count < 1
92- [ s3_key ] . concat (
93- ( 1 ..( count - 1 ) ) . map { |n | base_key + format ( '.z%02d' , n ) }
94- )
95- end
96-
9759 # @param [String] suffix, e.g. '.zip', '.z01', '.z125', etc., including the dot
9860 # @return [String] s3_key for the zip part specified by suffix
9961 # @see [S3 key name performance implications] https://docs.aws.amazon.com/AmazonS3/latest/dev/request-rate-perf-considerations.html
@@ -102,60 +64,37 @@ def s3_key(suffix = '.zip')
10264 druid . tree . join ( '/' ) + ".#{ v_version } #{ suffix } "
10365 end
10466
105- # @return [String] Path to the local temporary transfer root (.zip) part
106- def file_path
107- File . join ( zip_storage , s3_key )
108- end
109-
110- # WITHOUT (re)digesting the file, convert a hexdigest MD5 value to base64-endcoded equivalent.
111- # Motivation: we store hexdigest and S3 requires base64 in headers.
112- # @param [String] hex
113- # @return [String] base64 equivalent
114- def hex_to_base64 ( hex )
115- [ [ hex ] . pack ( 'H*' ) ] . pack ( 'm0' )
116- end
117-
118- # @raise [StandardError] if storage_location is not available (should have been provided in constructor)
119- def moab_version_path
120- raise "cannot determine moab_version_path for #{ druid . id } v#{ version } , storage_location not provided" unless storage_location
121- @moab_version_path ||= "#{ druid . path } /#{ v_version } "
# Adds up the on-disk size of every file returned by moab_version_files.
# @return [Integer] total size in bytes (0 when there are no files)
def moab_version_size
  moab_version_files.inject(0) { |total, path| total + File.size(path) }
end
12370
124- # @return [Array<String>] relative paths, i.e. s3_part_keys for existing parts
125- def part_keys
126- part_paths . map { | part | part . relative_path_from ( zip_storage ) . to_s }
# Removes every zip part file and md5 sidecar file for this druid-version from local zip storage.
# The files to remove are whatever parts_and_checksums_paths currently finds.
def cleanup_zip_parts!
  doomed_paths = parts_and_checksums_paths
  FileUtils.rm_f(doomed_paths)
end
12875
129- # NOTE: if there is only ONE part, it will end .zip; if there are multiple parts,
130- # the last one will end .zip, so two parts is: .z01, zip. (this agrees with zip utility)
131- # @return [Array<Pathname>] Existing pathnames for zip parts based on glob (.zip, .z01, .z02, etc.)
132- def part_paths
133- Pathname . glob ( file_path . sub ( /.zip\z / , '.z*' ) ) . reject do |path |
134- path . to_s =~ /.md5\z /
# Builds one part object per existing part key (see part_keys).
# @return [Array<DruidVersionZipPart>] all parts for this DruidVersionZip
def druid_version_zip_parts
  part_keys.map { |key| Replication::DruidVersionZipPart.new(self, key) }
end
13782
138- # @return [Array<Pathname>] all extant zip part and checksum files for this dvz (e.g. bc123df4567.zip, bc123df4567.z01, bc123df4567.zip.md5,
139- # bc123df4567.z01.md5, etc)
140- def parts_and_checksums_paths
141- Pathname . glob ( File . join ( zip_storage , s3_key ( '.*' ) ) )
142- end
83+ private
14384
144- # @return [String] "v" with zero-padded 4-digit version, e.g., v0001
145- def v_version
146- format ( 'v%04d' , version )
147- end
85+ attr_reader :druid , :version , :storage_location
14886
149- # @return [Pathname] The proper directory in which to execute zip_command
150- # @raise [StandardError] if storage_location is not available (should have been provided in constructor)
151- def work_dir
152- Pathname . new ( moab_version_path ) . parent . parent
# Sanity check on the zip: the combined size of the zip parts must exceed the size of the Moab version.
# This relies on the zip being no smaller than its contents, which holds because compression is not
# enabled (see zip_command). Compression was left off in the hope that it makes extraction from zips
# more reliable in the distant future. For further explanation, see
# https://github.com/sul-dlss/preservation_catalog/wiki/Zip-Creation
# @return [Boolean] true when total_part_size is strictly greater than moab_version_size
def zip_size_ok?
  zipped_bytes = total_part_size
  zipped_bytes > moab_version_size
end
15493
155- # @return [String] shell command to unzip
156- # def unzip_command
157- # "unzip #{file_path} -d #{place_to_unzip}"
158- # end
# Root directory for locally stored zip files, taken from Settings.zip_storage and memoized.
# @return [Pathname]
def zip_storage
  return @zip_storage if @zip_storage

  @zip_storage = Pathname.new(Settings.zip_storage)
end
15998
16099 # Presumes execution just "above" the druid dir in the druid tree, i.e. if the Moab is:
161100 # /storage_trunk_01/bj/102/hs/9687/bj102hs9687/v0003/...
@@ -167,39 +106,54 @@ def zip_command
167106 "zip -r0X -s #{ ZIP_SPLIT_SIZE } #{ file_path } #{ druid . id } /#{ v_version } "
168107 end
169108
170- def zip_version
171- @zip_version ||= fetch_zip_version
# Two levels up from the Moab version path: its parent's parent directory.
# @return [Pathname] The proper directory in which to execute zip_command
# @raise [StandardError] if storage_location is not available (should have been provided in constructor)
def work_dir
  version_dir = Pathname.new(moab_version_path)
  druid_dir = version_dir.parent
  druid_dir.parent
end
173114
174- # @return [Pathname]
175- def zip_storage
176- @zip_storage ||= Pathname . new ( Settings . zip_storage )
# Renders the version as a directory-style label.
# @return [String] "v" followed by the version zero-padded to 4 digits, e.g., v0001
def v_version
  format('v%<number>04d', number: version)
end
178119
179- # This assumes that the zip file will be at least as large as the Moab version being zipped. Why? Because
180- # we don't enable compression (see zip_command). Why no compression? We thought it might make extraction
181- # from zips more reliable in the distant future. For further explanation, see https://github.com/sul-dlss/preservation_catalog/wiki/Zip-Creation
182- def zip_size_ok?
183- total_part_size > moab_version_size
# Globs local zip storage for every file whose name is this object's s3 key with any suffix.
# @return [Array<Pathname>] all extant zip part and checksum files for this dvz
#   (e.g. bc123df4567.zip, bc123df4567.z01, bc123df4567.zip.md5, bc123df4567.z01.md5, etc)
def parts_and_checksums_paths
  pattern = File.join(zip_storage, s3_key('.*'))
  Pathname.glob(pattern)
end
185125
186- def moab_version_size
187- moab_version_files . sum { |f | File . size ( f ) }
# Finds the zip part files that currently exist on disk for this druid-version.
#
# NOTE: if there is only ONE part, it will end .zip; if there are multiple parts,
# the last one will end .zip, so two parts is: .z01, zip. (this agrees with zip utility)
#
# The glob pattern is file_path with its trailing ".zip" swapped for ".z*"; md5 sidecar
# files (names ending in ".md5") also match that pattern and are filtered out.
# @return [Array<Pathname>] Existing pathnames for zip parts based on glob (.zip, .z01, .z02, etc.)
def part_paths
  # The dot is escaped so that only a literal ".zip" extension is rewritten.
  Pathname.glob(file_path.sub(/\.zip\z/, '.z*')).reject do |path|
    path.to_s.end_with?('.md5')
  end
end
189134
190- # Deletes all zip part files and their md5 sidecar files from local zip storage
191- def cleanup_zip_parts!
192- FileUtils . rm_f ( parts_and_checksums_paths )
# Converts each existing part path into a path relative to zip_storage.
# @return [Array<String>] relative paths, i.e. s3_part_keys for existing parts
def part_keys
  part_paths.map do |part_path|
    relative = part_path.relative_path_from(zip_storage)
    relative.to_s
  end
end
194139
195- # @return [Array<DruidVersionZipPart>] all parts for this DruidVersionZip
196- def druid_version_zip_parts
197- part_keys . map do | part_key |
198- Replication :: DruidVersionZipPart . new ( self , part_key )
199- end
# Path of this version's directory inside the Moab: the druid's path joined with the "vNNNN" label.
# The storage_location check runs on every call; only the resulting path string is memoized.
# @return [String]
# @raise [StandardError] if storage_location is not available (should have been provided in constructor)
def moab_version_path
  unless storage_location
    raise "cannot determine moab_version_path for #{druid.id} v#{version}, storage_location not provided"
  end

  @moab_version_path ||= "#{druid.path}/#{v_version}"
end
201146
202- private
# Joins the local zip storage root with this object's default (.zip) s3 key.
# @return [String] Path to the local temporary transfer root (.zip) part
def file_path
  storage_root = zip_storage
  File.join(storage_root, s3_key)
end
151+
# Ensure the directory the zip will live in exists, creating it (and any missing
# ancestors) when necessary. The zip file itself is not created.
# @return [Pathname] the path of the zip file (file_path), NOT the directory that was ensured
def ensure_zip_directory!
  zip_path = Pathname.new(file_path)
  zip_path.dirname.mkpath
  zip_path
end
203157
204158 # Throws an error if any of the files in the moab are not yet readable. For example due to
205159 # Ceph MDS instance for a pres cat worker VM thinking that a file is a stray as a result of our
@@ -217,24 +171,8 @@ def total_part_size
217171
218172 def moab_version_files
219173 raise "Moab version does not exist: #{ moab_version_path } " unless File . exist? ( moab_version_path )
220- Dir
221- . glob ( "#{ moab_version_path } /**/*" )
222- . select { |path | File . file? ( path ) }
223- end
224-
225- # @return [String] e.g. 'Zip 3.0 (July 5th 2008)' or 'Zip 3.0.1'
226- def fetch_zip_version
227- match = nil
228- IO . popen ( 'zip -v' ) do |io |
229- re = zip_version_regexp
230- io . find { |line | match = line . match ( re ) }
231- end
232- return match [ 1 ] if match && match [ 1 ] . present?
233- raise 'No version info matched from `zip -v` ouptut'
234- end
235174
236- def zip_version_regexp
237- /This is (Zip \d +(\. \d )+\s *(\( .*\d {4}\) )?)/
175+ Dir . glob ( "#{ moab_version_path } /**/*" ) . select { |path | File . file? ( path ) }
238176 end
239177
240178 # @return [Array<String>] relative paths, i.e. s3_part_keys for existing parts based on the md5 sidecar files
0 commit comments