Skip to content

Commit 175f1ee

Browse files
committed
Define the DVZ public interface and shrink it to what is used
1 parent f606caa commit 175f1ee

File tree

3 files changed

+95
-260
lines changed

3 files changed

+95
-260
lines changed

app/models/replication/druid_version_zip.rb

Lines changed: 60 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,6 @@ module Replication
99
# See comment on part_paths method re: individual part suffixes.
1010
# Just a regular model, not an ActiveRecord-backed model
1111
class DruidVersionZip
12-
attr_reader :druid, :version, :storage_location
13-
14-
delegate :base64digest, :hexdigest, to: :md5
15-
1612
# the size used with "zip -s" to break up the zip into parts
1713
ZIP_SPLIT_SIZE = '10g'
1814

@@ -26,34 +22,14 @@ def initialize(druid, version, storage_location = nil)
2622
@storage_location = storage_location
2723
end
2824

29-
# @return [String] Filename/key without extension, in common to all this object's zip parts
30-
def base_key
31-
@base_key ||= s3_key.sub(/.zip\z/, '')
32-
end
33-
34-
# Checks to see whether a zip file already exists for this druid-version. If it does, just touch the
35-
# file to refresh atime and mtime, so the zip cache cleaning cron job doesn't see it as stale. If it doesn't,
36-
# create it.
37-
# @raise [StandardError] if there's a zip file for this druid-version, but it looks too small to be complete.
38-
def find_or_create_zip!
39-
if exist?
40-
raise "zip already exists, but size (#{total_part_size}) is smaller than the moab version size (#{moab_version_size})!" unless zip_size_ok?
41-
FileUtils.touch(file_path)
42-
else
43-
create_zip!
44-
end
45-
end
46-
47-
def exist?
48-
File.exist?(file_path)
49-
end
50-
5125
# @return [Boolean] true if there is a match between the zip part files and their md5 sidecar files
5226
def complete?
5327
# There is at least one part file
5428
return false if part_paths.empty?
29+
5530
# The set of md5 sidecar files matches the set of part files
5631
return false unless part_keys.to_set == part_keys_from_md5_sidecars.to_set
32+
5733
# Check each md5 sidecar file against the zip part file
5834
druid_version_zip_parts.all?(&:md5_match?)
5935
end
@@ -67,6 +43,7 @@ def create_zip!
6743
ensure_zip_directory!
6844
combined, status = Open3.capture2e(zip_command, chdir: work_dir.to_s)
6945
raise "zipmaker failure #{combined}" unless status.success?
46+
7047
unless zip_size_ok?
7148
raise "zip size (#{total_part_size}) is smaller than the moab version size (#{moab_version_size})! zipmaker failure #{combined}"
7249
end
@@ -79,21 +56,6 @@ def create_zip!
7956
raise
8057
end
8158

82-
# Ensure the directory the zip will live in exists
83-
# @return [Pathname] the existing or created directory
84-
def ensure_zip_directory!
85-
Pathname.new(file_path).tap { |pn| pn.dirname.mkpath }
86-
end
87-
88-
# @param [Integer] count Number of parts
89-
# @return [Array<String>] Ordered pathnames expected to correspond to a zip broken into a given number of parts
90-
def expected_part_keys(count = 1)
91-
raise ArgumentError, "count (#{count}) must be >= 1" if count < 1
92-
[s3_key].concat(
93-
(1..(count - 1)).map { |n| base_key + format('.z%02d', n) }
94-
)
95-
end
96-
9759
# @param [String] suffix, e.g. '.zip', '.z01', '.z125', etc., including the dot
9860
# @return [String] s3_key for the zip part specified by suffix
9961
# @see [S3 key name performance implications] https://docs.aws.amazon.com/AmazonS3/latest/dev/request-rate-perf-considerations.html
@@ -102,60 +64,37 @@ def s3_key(suffix = '.zip')
10264
druid.tree.join('/') + ".#{v_version}#{suffix}"
10365
end
10466

105-
# @return [String] Path to the local temporary transfer root (.zip) part
106-
def file_path
107-
File.join(zip_storage, s3_key)
108-
end
109-
110-
# WITHOUT (re)digesting the file, convert a hexdigest MD5 value to its base64-encoded equivalent.
111-
# Motivation: we store hexdigest and S3 requires base64 in headers.
112-
# @param [String] hex
113-
# @return [String] base64 equivalent
114-
def hex_to_base64(hex)
115-
[[hex].pack('H*')].pack('m0')
116-
end
117-
118-
# @raise [StandardError] if storage_location is not available (should have been provided in constructor)
119-
def moab_version_path
120-
raise "cannot determine moab_version_path for #{druid.id} v#{version}, storage_location not provided" unless storage_location
121-
@moab_version_path ||= "#{druid.path}/#{v_version}"
67+
def moab_version_size
68+
moab_version_files.sum { |f| File.size(f) }
12269
end
12370

124-
# @return [Array<String>] relative paths, i.e. s3_part_keys for existing parts
125-
def part_keys
126-
part_paths.map { |part| part.relative_path_from(zip_storage).to_s }
71+
# Deletes all zip part files and their md5 sidecar files from local zip storage
72+
def cleanup_zip_parts!
73+
FileUtils.rm_f(parts_and_checksums_paths)
12774
end
12875

129-
# NOTE: if there is only ONE part, it will end .zip; if there are multiple parts,
130-
# the last one will end .zip, so two parts are: .z01, .zip (this agrees with the zip utility)
131-
# @return [Array<Pathname>] Existing pathnames for zip parts based on glob (.zip, .z01, .z02, etc.)
132-
def part_paths
133-
Pathname.glob(file_path.sub(/.zip\z/, '.z*')).reject do |path|
134-
path.to_s =~ /.md5\z/
76+
# @return [Array<DruidVersionZipPart>] all parts for this DruidVersionZip
77+
def druid_version_zip_parts
78+
part_keys.map do |part_key|
79+
Replication::DruidVersionZipPart.new(self, part_key)
13580
end
13681
end
13782

138-
# @return [Array<Pathname>] all extant zip part and checksum files for this dvz (e.g. bc123df4567.zip, bc123df4567.z01, bc123df4567.zip.md5,
139-
# bc123df4567.z01.md5, etc)
140-
def parts_and_checksums_paths
141-
Pathname.glob(File.join(zip_storage, s3_key('.*')))
142-
end
83+
private
14384

144-
# @return [String] "v" with zero-padded 4-digit version, e.g., v0001
145-
def v_version
146-
format('v%04d', version)
147-
end
85+
attr_reader :druid, :version, :storage_location
14886

149-
# @return [Pathname] The proper directory in which to execute zip_command
150-
# @raise [StandardError] if storage_location is not available (should have been provided in constructor)
151-
def work_dir
152-
Pathname.new(moab_version_path).parent.parent
87+
# This assumes that the zip file will be strictly larger than the Moab version being zipped. Why? Because
88+
# we don't enable compression (see zip_command). Why no compression? We thought it might make extraction
89+
# from zips more reliable in the distant future. For further explanation, see https://github.com/sul-dlss/preservation_catalog/wiki/Zip-Creation
90+
def zip_size_ok?
91+
total_part_size > moab_version_size
15392
end
15493

155-
# @return [String] shell command to unzip
156-
# def unzip_command
157-
# "unzip #{file_path} -d #{place_to_unzip}"
158-
# end
94+
# @return [Pathname]
95+
def zip_storage
96+
@zip_storage ||= Pathname.new(Settings.zip_storage)
97+
end
15998

16099
# Presumes execution just "above" the druid dir in the druid tree, i.e. if the Moab is:
161100
# /storage_trunk_01/bj/102/hs/9687/bj102hs9687/v0003/...
@@ -167,39 +106,54 @@ def zip_command
167106
"zip -r0X -s #{ZIP_SPLIT_SIZE} #{file_path} #{druid.id}/#{v_version}"
168107
end
169108

170-
def zip_version
171-
@zip_version ||= fetch_zip_version
109+
# @return [Pathname] The proper directory in which to execute zip_command
110+
# @raise [StandardError] if storage_location is not available (should have been provided in constructor)
111+
def work_dir
112+
Pathname.new(moab_version_path).parent.parent
172113
end
173114

174-
# @return [Pathname]
175-
def zip_storage
176-
@zip_storage ||= Pathname.new(Settings.zip_storage)
115+
# @return [String] "v" with zero-padded 4-digit version, e.g., v0001
116+
def v_version
117+
format('v%04d', version)
177118
end
178119

179-
# This assumes that the zip file will be strictly larger than the Moab version being zipped. Why? Because
180-
# we don't enable compression (see zip_command). Why no compression? We thought it might make extraction
181-
# from zips more reliable in the distant future. For further explanation, see https://github.com/sul-dlss/preservation_catalog/wiki/Zip-Creation
182-
def zip_size_ok?
183-
total_part_size > moab_version_size
120+
# @return [Array<Pathname>] all extant zip part and checksum files for this dvz (e.g. bc123df4567.zip, bc123df4567.z01, bc123df4567.zip.md5,
121+
# bc123df4567.z01.md5, etc)
122+
def parts_and_checksums_paths
123+
Pathname.glob(File.join(zip_storage, s3_key('.*')))
184124
end
185125

186-
def moab_version_size
187-
moab_version_files.sum { |f| File.size(f) }
126+
# NOTE: if there is only ONE part, it will end .zip; if there are multiple parts,
127+
# the last one will end .zip, so two parts are: .z01, .zip (this agrees with the zip utility)
128+
# @return [Array<Pathname>] Existing pathnames for zip parts based on glob (.zip, .z01, .z02, etc.)
129+
def part_paths
130+
Pathname.glob(file_path.sub(/.zip\z/, '.z*')).reject do |path|
131+
path.to_s =~ /.md5\z/
132+
end
188133
end
189134

190-
# Deletes all zip part files and their md5 sidecar files from local zip storage
191-
def cleanup_zip_parts!
192-
FileUtils.rm_f(parts_and_checksums_paths)
135+
# @return [Array<String>] relative paths, i.e. s3_part_keys for existing parts
136+
def part_keys
137+
part_paths.map { |part| part.relative_path_from(zip_storage).to_s }
193138
end
194139

195-
# @return [Array<DruidVersionZipPart>] all parts for this DruidVersionZip
196-
def druid_version_zip_parts
197-
part_keys.map do |part_key|
198-
Replication::DruidVersionZipPart.new(self, part_key)
199-
end
140+
# @raise [StandardError] if storage_location is not available (should have been provided in constructor)
141+
def moab_version_path
142+
raise "cannot determine moab_version_path for #{druid.id} v#{version}, storage_location not provided" unless storage_location
143+
144+
@moab_version_path ||= "#{druid.path}/#{v_version}"
200145
end
201146

202-
private
147+
# @return [String] Path to the local temporary transfer root (.zip) part
148+
def file_path
149+
File.join(zip_storage, s3_key)
150+
end
151+
152+
# Ensure the directory the zip will live in exists
153+
# @return [Pathname] the existing or created directory
154+
def ensure_zip_directory!
155+
Pathname.new(file_path).tap { |pn| pn.dirname.mkpath }
156+
end
203157

204158
# Throws an error if any of the files in the moab are not yet readable. For example due to
205159
# Ceph MDS instance for a pres cat worker VM thinking that a file is a stray as a result of our
@@ -217,24 +171,8 @@ def total_part_size
217171

218172
def moab_version_files
219173
raise "Moab version does not exist: #{moab_version_path}" unless File.exist?(moab_version_path)
220-
Dir
221-
.glob("#{moab_version_path}/**/*")
222-
.select { |path| File.file?(path) }
223-
end
224-
225-
# @return [String] e.g. 'Zip 3.0 (July 5th 2008)' or 'Zip 3.0.1'
226-
def fetch_zip_version
227-
match = nil
228-
IO.popen('zip -v') do |io|
229-
re = zip_version_regexp
230-
io.find { |line| match = line.match(re) }
231-
end
232-
return match[1] if match && match[1].present?
233-
raise 'No version info matched from `zip -v` ouptut'
234-
end
235174

236-
def zip_version_regexp
237-
/This is (Zip \d+(\.\d)+\s*(\(.*\d{4}\))?)/
175+
Dir.glob("#{moab_version_path}/**/*").select { |path| File.file?(path) }
238176
end
239177

240178
# @return [Array<String>] relative paths, i.e. s3_part_keys for existing parts based on the md5 sidecar files

0 commit comments

Comments
 (0)