Skip to content

Commit 9e5213d

Browse files
committed
WIP: S3 version deletion service
1 parent 30c472e commit 9e5213d

File tree

5 files changed

+226
-149
lines changed

5 files changed

+226
-149
lines changed

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ gem 'roo-xls' # Add excel support to roo
115115
gem 'rubyzip' # Zip the large CSV files before emailing
116116
gem 'rufus-scheduler' # Cron
117117
gem 'rack-cors' # API
118+
gem 'amazing_print' # debug printing
118119

119120
group :development, :test do
120121
gem 'rails-controller-testing'

Gemfile.lock

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ GEM
8383
uri (>= 0.13.1)
8484
addressable (2.8.7)
8585
public_suffix (>= 2.0.2, < 7.0)
86+
amazing_print (1.8.0)
8687
annotaterb (4.14.0)
8788
arbre (2.2.0)
8889
activesupport (>= 7.0)
@@ -679,6 +680,7 @@ PLATFORMS
679680

680681
DEPENDENCIES
681682
activeadmin (= 4.0.0.beta15)
683+
amazing_print
682684
annotaterb
683685
aws-actionmailer-ses (~> 1)
684686
aws-sdk-rails (~> 5)

app/services/s3_service.rb

Lines changed: 0 additions & 149 deletions
This file was deleted.
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
require 'csv'
2+
require 'aws-sdk-s3'
3+
require 'amazing_print'
4+
5+
# rubocop:disable Metrics/MethodLength,Metrics/BlockLength
6+
# Analyses the object versions recorded for the "nabu-catalog-<env>" bucket
# using the most recent S3 Inventory report stored in "nabu-meta-<env>".
#
# Every inventory row is grouped by object key and each key is classified by
# #process_file. This class does not delete anything from S3: it tallies
# counts and sizes, prints them, and writes the keys of stale delete markers
# to a local text file.
class S3VersionDeletionService
  # Column names applied, in order, to each row of the headerless inventory CSV.
  INVENTORY_COLUMNS = %i[
    bucket_name filename version_id is_latest delete_marker size last_modified etag
    storage_class multiple_upload multipart_upload_flag replication_status checksum_algo
  ].freeze

  # Matches the timestamped report directories, e.g.
  # ".../CatalogBucketInventory0/2024-03-01T01-00Z/".
  INVENTORY_DIR_PATTERN = %r{CatalogBucketInventory0/(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})Z}

  # @param env [String] environment suffix used to build the bucket names
  # @param verbose [Boolean] stored in @verbose; no method consults it yet
  def initialize(env, verbose: false)
    @meta_bucket = "nabu-meta-#{env}"
    @prefix = "inventories/catalog/nabu-catalog-#{env}/CatalogBucketInventory0/"
    @bucket = "nabu-catalog-#{env}"
    @verbose = verbose

    # Strange bug in dev docker: drop the static credentials from the process
    # environment before the client is built.
    %w[AWS_SECRET_ACCESS_KEY AWS_ACCESS_KEY_ID AWS_SESSION_TOKEN].each { |name| ENV.delete(name) }

    @s3 = Aws::S3::Client.new(region: 'ap-southeast-2')

    @count = Hash.new(0)
    @size = Hash.new(0)
    @remove_delete_markers = []
  end

  # Loads the latest inventory, classifies every file, prints the tallies
  # (sizes in whole GiB) and writes the stale delete-marker keys to
  # "remove_delete_markers.txt" in the current directory.
  #
  # @return [Array<Hash>] the delete-marker versions that were written out
  def run
    get_s3_files.each_pair { |filename, versions| process_file(filename, versions) }

    ap @count
    # Convert a copy so the byte totals in @size stay intact.
    ap(@size.transform_values { |bytes| bytes / 1024 / 1024 / 1024 })

    # Block form guarantees the file is flushed and closed, even on error.
    File.open('remove_delete_markers.txt', 'w') do |file|
      @remove_delete_markers.each do |version|
        file.puts version[:filename]
        # NOTE(review): the version id is printed to stdout, not to the file;
        # confirm whether it was meant to be written next to the filename.
        puts version[:version_id] unless version[:version_id].to_s.empty?
      end
    end
  end

  # Classifies one object key and updates the @count / @size tallies.
  #
  # @param filename [String] object key
  # @param versions [Array<Hash>] version rows for that key, oldest first
  # @return [nil]
  # @raise [RuntimeError] when the key has no versions, or its only version is
  #   in a state this service does not understand
  def process_file(filename, versions)
    raise "How can there be no versions for a file? #{filename}" if versions.empty?

    if versions.size == 1
      process_single_version(versions.first)
      return
    end

    @count[:other] += 1
    @size[:other] += versions.sum { |version| version[:size] }

    # TODO: classify each version of a multi-version file. The planned buckets
    # are the four combinations of :is_latest and :delete_marker (live object,
    # deleted object, deleted old version, old version).
    nil
  end

  # Downloads and parses the most recent inventory report.
  #
  # @return [Hash{String => Array<Hash>}] versions grouped by object key
  def get_s3_files
    extract_s3_files(fetch_inventory_csv(find_recent_inventory_dir))
  end

  # Parses the inventory CSV text and groups its rows by object key.
  #
  # @param inventory_csv [String] headerless CSV text of the inventory report
  # @return [Hash{String => Array<Hash>}] per-key versions sorted by
  #   :last_modified (compared as strings)
  def extract_s3_files(inventory_csv)
    versions = CSV.parse(inventory_csv, headers: false).map { |row| build_version(row) }
    puts "We found #{versions.size} versions of files in the inventory"

    s3_files = versions.group_by { |version| version[:filename] }
    s3_files.each_value do |file_versions|
      file_versions.sort_by! { |version| version[:last_modified].to_s }
    end
    puts "We found #{s3_files.size} files in the inventory"

    s3_files
  end

  # Reads the manifest in +inventory_dir+ and returns the decompressed CSV it
  # points at.
  #
  # @param inventory_dir [String] key prefix of the report directory, with
  #   trailing slash
  # @return [String] the uncompressed inventory CSV
  # @raise [RuntimeError] unless the manifest lists exactly one data file
  def fetch_inventory_csv(inventory_dir)
    manifest_json = @s3.get_object(bucket: @meta_bucket, key: "#{inventory_dir}manifest.json").body.read
    files = Array(JSON.parse(manifest_json)['files'])

    raise 'No files in manifest' if files.empty?
    raise 'Multiple files in manifest' if files.size > 1

    file = files.first['key']

    puts "Downloading S3 Inventory CSV file: #{file}"
    inventory_gzipped = @s3.get_object(bucket: @meta_bucket, key: file).body.read

    puts "Unzipping file: #{file}\n\n"
    # wrap closes the reader once the block returns.
    Zlib::GzipReader.wrap(StringIO.new(inventory_gzipped), &:read)
  end

  # Picks the report directory with the newest timestamp in its name.
  #
  # @return [String] key prefix of the most recent report directory
  # @raise [RuntimeError] when no timestamped directory exists
  def find_recent_inventory_dir
    most_recent = fetch_inventory_files.filter_map do |key|
      match = INVENTORY_DIR_PATTERN.match(key)
      next unless match

      # The directory name ends in "Z", so the timestamp is built as UTC.
      { key: key, time: Time.utc(*match.captures.map(&:to_i)) }
    end.max_by { |dir| dir[:time] }

    raise "No inventory directories found under #{@prefix} in #{@meta_bucket}" unless most_recent

    puts "Most recent inventory file: #{most_recent[:key]}"
    most_recent[:key]
  end

  # Lists the immediate "sub-directories" under @prefix in the meta bucket,
  # following continuation tokens until the listing is complete.
  #
  # @return [Array<String>] common prefixes, each with a trailing slash
  def fetch_inventory_files
    inventory_files = []
    next_token = nil

    loop do
      response = @s3.list_objects_v2(
        bucket: @meta_bucket,
        prefix: @prefix,
        delimiter: '/',
        continuation_token: next_token
      )

      inventory_files.concat(response.common_prefixes.map(&:prefix))

      break unless response.is_truncated

      next_token = response.next_continuation_token
    end

    inventory_files
  end

  private

  # Tallies a key that has exactly one version: either a plain upload or a
  # delete marker left behind by an earlier clean-up.
  def process_single_version(version)
    raise "Only version is not the latest: #{version.inspect}" unless version[:is_latest]

    unless version[:delete_marker]
      @count[:single_upload] += 1
      @size[:single_upload] += version[:size]
      return
    end

    # We deleted files before but forgot to kill the delete markers.
    # to_s / to_i: an unquoted empty CSV field is parsed as nil.
    unless version[:version_id].to_s.empty? && version[:size].to_i.zero?
      raise "Unexpected delete marker state: #{version.inspect}"
    end

    @count[:old_delete_marker] += 1
    @remove_delete_markers << version
  end

  # Turns one CSV row into a version hash keyed by INVENTORY_COLUMNS, with the
  # key URL-decoded, the size as an Integer and the two flags as booleans.
  def build_version(row)
    version = INVENTORY_COLUMNS.zip(row).to_h
    version[:filename] = CGI.unescape(version[:filename])
    version[:size] = version[:size].to_i
    version[:is_latest] = version[:is_latest] == 'true'
    version[:delete_marker] = version[:delete_marker] == 'true'
    version
  end
end
216+
# rubocop:enable Metrics/MethodLength,Metrics/BlockLength

0 commit comments

Comments
 (0)