Skip to content

Commit daa5f0e

Browse files
authored
Add S3 media uploader (#526)
1 parent dce0f27 commit daa5f0e

File tree

2 files changed

+248
-0
lines changed

2 files changed

+248
-0
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#
2+
#
3+
# Copyright (C) 2022 David M. Straub
4+
#
5+
# This program is free software; you can redistribute it and/or modify
6+
# it under the terms of the GNU General Public License as published by
7+
# the Free Software Foundation; either version 2 of the License, or
8+
# (at your option) any later version.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU General Public License
16+
# along with this program; if not, write to the Free Software
17+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18+
#
19+
#
20+
# -------------------------
21+
#
22+
# S3 Media Uploader
23+
#
24+
# -----------------------
25+
register(
26+
TOOL,
27+
id="s3uploader",
28+
name=_("S3 Media Uploader"),
29+
category=TOOL_UTILS,
30+
status=STABLE,
31+
fname="S3MediaUploader.py",
32+
toolclass="S3MediaUploader",
33+
optionclass="S3MediaUploaderOptions",
34+
tool_modes=[TOOL_MODE_CLI],
35+
authors=["David M. Straub"],
36+
authors_email=["[email protected]"],
37+
description=_("Upload media files to S3 (or compatible) object-based storage via the command line."),
38+
version="0.1",
39+
gramps_target_version="5.1",
40+
)

S3MediaUploader/S3MediaUploader.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
"""Upload media files to S3 via the command line."""
2+
#
3+
#
4+
# Copyright (C) 2022 David M. Straub
5+
#
6+
# This program is free software; you can redistribute it and/or modify
7+
# it under the terms of the GNU General Public License as published by
8+
# the Free Software Foundation; either version 2 of the License, or
9+
# (at your option) any later version.
10+
#
11+
# This program is distributed in the hope that it will be useful,
12+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
# GNU General Public License for more details.
15+
#
16+
# You should have received a copy of the GNU General Public License
17+
# along with this program; if not, write to the Free Software
18+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19+
#
20+
# $Id$
21+
#
22+
23+
import logging
24+
import os
25+
import sys
26+
27+
from gramps.gen.utils.file import create_checksum, expand_media_path
28+
from gramps.gui.plug import tool
29+
30+
_LOG = logging.getLogger("S3MediaUploader")
31+
32+
try:
33+
import boto3
34+
from botocore.exceptions import BotoCoreError, ClientError
35+
except ImportError:
36+
_LOG.error(
37+
"The S3 media uploader add-on requires the boto3"
38+
"Python library to be installed."
39+
)
40+
sys.exit(1)
41+
42+
43+
class S3MediaUploader(tool.Tool):
44+
def __init__(self, dbstate, user, options_class, name, callback=None):
45+
"""Initialize tool."""
46+
tool.Tool.__init__(self, dbstate, options_class, name)
47+
self.dbstate = dbstate
48+
self.run_tool()
49+
50+
def run_tool(self):
51+
"""Run the tool."""
52+
bucket_name = self.options.handler.options_dict["bucket_name"]
53+
endpoint_url = self.options.handler.options_dict["endpoint_url"] or None
54+
try:
55+
uploader = S3MediaUploadHandler(
56+
db=self.dbstate.db,
57+
bucket_name=bucket_name,
58+
endpoint_url=endpoint_url,
59+
create=True,
60+
logger=_LOG,
61+
)
62+
except (BotoCoreError, ClientError) as err:
63+
_LOG.error(err)
64+
return None
65+
uploader.upload_missing()
66+
67+
68+
class S3MediaUploaderOptions(tool.ToolOptions):
69+
"""Options class for S3 Media Uploader addon."""
70+
71+
def __init__(self, name, person_id=None):
72+
tool.ToolOptions.__init__(self, name, person_id)
73+
74+
self.options_dict = {
75+
"bucket_name": "",
76+
"endpoint_url": "",
77+
}
78+
self.options_help = {
79+
"bucket_name": ("=str", "Name of the bucket to store files in.", "string"),
80+
"endpoint_url": (
81+
"=str",
82+
"Altnerative endpoint URL for S3-compatible storage.",
83+
"string",
84+
),
85+
}
86+
87+
88+
class S3MediaUploadHandler:
89+
"""Class to upload media objects to an S3 bucket.
90+
91+
Based on https://github.com/DavidMStraub/gramps-webapp/.
92+
"""
93+
94+
def __init__(self, db, bucket_name, endpoint_url=None, create=False, logger=None):
95+
"""Initialize the class.
96+
97+
`db` is an instance of an appropriate subclass of `gramps.gen.db.base.DbReadBase`.
98+
`bucket_name` is the S3 bucket name.
99+
If `create` is True, the bucket will be created if it doesn't exist.
100+
"""
101+
self.db = db
102+
self.bucket_name = bucket_name
103+
self.s3 = boto3.resource("s3", endpoint_url=endpoint_url)
104+
self.client = boto3.client("s3", endpoint_url=endpoint_url)
105+
self.logger = logger or logging.getLogger()
106+
if create:
107+
if self.bucket_exists:
108+
self.logger.debug("Bucket {} already exists".format(bucket_name))
109+
else:
110+
self.logger.warning(
111+
"Bucket {} not found. Creating ...".format(bucket_name)
112+
)
113+
region_name = boto3.session.Session().region_name
114+
bucket_config = {}
115+
if region_name:
116+
bucket_config = {"LocationConstraint": region_name}
117+
self.client.create_bucket(
118+
Bucket=bucket_name, CreateBucketConfiguration=bucket_config
119+
)
120+
self.bucket = self.s3.Bucket(self.bucket_name)
121+
self.base_path = expand_media_path(self.db.get_mediapath(), self.db)
122+
123+
@property
124+
def bucket_exists(self):
125+
"""Return boolean if the bucket exists."""
126+
try:
127+
self.client.head_bucket(Bucket=self.bucket_name)
128+
except ClientError as err:
129+
error_code = int(err.response["Error"]["Code"])
130+
if error_code == 404: # bucket does not exist
131+
return False
132+
return True
133+
134+
def get_remote_objects(self):
135+
"""Get a set of all names of objects (media hashes) in the bucket."""
136+
return set(obj.key for obj in self.bucket.objects.all())
137+
138+
def get_local_objects(self):
139+
"""Get a dictionary of handle, hash, and mime types of all media objects
140+
in the database."""
141+
return {
142+
media_obj.handle: {
143+
"checksum": media_obj.get_checksum(),
144+
"mime": media_obj.get_mime_type(),
145+
}
146+
for media_obj in self.db.iter_media()
147+
}
148+
149+
def get_full_path(self, handle):
150+
"""Get the full local path to a media object by handle."""
151+
media_obj = self.db.get_media_from_handle(handle)
152+
return os.path.join(self.base_path, media_obj.path)
153+
154+
def check_checksum(self, handle, checksum):
155+
"""Check the media object's checksum, returning a boolean."""
156+
full_path = self.get_full_path(handle)
157+
new_checksum = create_checksum(full_path)
158+
return new_checksum == checksum
159+
160+
def upload(self, handle, checksum, mime):
161+
"""Upload a media object with given handle, hash, and MIME type."""
162+
path = self.get_full_path(handle)
163+
if not os.path.exists(path):
164+
self.logger.error("File {} not found. Skipping upload".format(path))
165+
return False
166+
if not self.check_checksum(handle, checksum):
167+
self.logger.error(
168+
"Found checksum mismatch for file {}. Skipping upload".format(path)
169+
)
170+
self.logger.error(
171+
"Old: {}, New: {}".format(checksum, create_checksum(path))
172+
)
173+
return False
174+
try:
175+
self.client.upload_file(
176+
path, self.bucket_name, checksum, ExtraArgs={"ContentType": mime}
177+
)
178+
except ClientError as err:
179+
logging.error(err)
180+
return False
181+
return True
182+
183+
def upload_all(self):
184+
"""Upload all media objects (overwriting existing ones)."""
185+
local_objects = self.get_local_objects()
186+
for handle, v in local_objects.items():
187+
self.upload(handle, **v)
188+
189+
def upload_missing(self):
190+
"""Upload the media objects that are not yet in the bucket."""
191+
local_objects_dict = self.get_local_objects()
192+
checksum_dict = {
193+
v["checksum"]: (handle, v["mime"])
194+
for handle, v in local_objects_dict.items()
195+
}
196+
local_checksums = set(obj["checksum"] for obj in local_objects_dict.values())
197+
remote_checksums = self.get_remote_objects()
198+
missing = local_checksums - remote_checksums
199+
num_missing = len(missing)
200+
self.logger.info("Found {} objects to upload.".format(num_missing))
201+
for i, checksum in enumerate(missing):
202+
self.logger.info(
203+
"Uploading file {} of {} ({}%)".format(
204+
i + 1, num_missing, round(100 * i / num_missing)
205+
)
206+
)
207+
handle, mime = checksum_dict[checksum]
208+
self.upload(handle, checksum, mime)

0 commit comments

Comments
 (0)