|
| 1 | +"""Upload media files to S3 via the command line.""" |
| 2 | +# |
| 3 | +# |
| 4 | +# Copyright (C) 2022 David M. Straub |
| 5 | +# |
| 6 | +# This program is free software; you can redistribute it and/or modify |
| 7 | +# it under the terms of the GNU General Public License as published by |
| 8 | +# the Free Software Foundation; either version 2 of the License, or |
| 9 | +# (at your option) any later version. |
| 10 | +# |
| 11 | +# This program is distributed in the hope that it will be useful, |
| 12 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | +# GNU General Public License for more details. |
| 15 | +# |
| 16 | +# You should have received a copy of the GNU General Public License |
| 17 | +# along with this program; if not, write to the Free Software |
| 18 | +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 19 | +# |
| 20 | +# $Id$ |
| 21 | +# |
| 22 | + |
| 23 | +import logging |
| 24 | +import os |
| 25 | +import sys |
| 26 | + |
| 27 | +from gramps.gen.utils.file import create_checksum, expand_media_path |
| 28 | +from gramps.gui.plug import tool |
| 29 | + |
| 30 | +_LOG = logging.getLogger("S3MediaUploader") |
| 31 | + |
| 32 | +try: |
| 33 | + import boto3 |
| 34 | + from botocore.exceptions import BotoCoreError, ClientError |
| 35 | +except ImportError: |
| 36 | + _LOG.error( |
| 37 | + "The S3 media uploader add-on requires the boto3" |
| 38 | + "Python library to be installed." |
| 39 | + ) |
| 40 | + sys.exit(1) |
| 41 | + |
| 42 | + |
| 43 | +class S3MediaUploader(tool.Tool): |
| 44 | + def __init__(self, dbstate, user, options_class, name, callback=None): |
| 45 | + """Initialize tool.""" |
| 46 | + tool.Tool.__init__(self, dbstate, options_class, name) |
| 47 | + self.dbstate = dbstate |
| 48 | + self.run_tool() |
| 49 | + |
| 50 | + def run_tool(self): |
| 51 | + """Run the tool.""" |
| 52 | + bucket_name = self.options.handler.options_dict["bucket_name"] |
| 53 | + endpoint_url = self.options.handler.options_dict["endpoint_url"] or None |
| 54 | + try: |
| 55 | + uploader = S3MediaUploadHandler( |
| 56 | + db=self.dbstate.db, |
| 57 | + bucket_name=bucket_name, |
| 58 | + endpoint_url=endpoint_url, |
| 59 | + create=True, |
| 60 | + logger=_LOG, |
| 61 | + ) |
| 62 | + except (BotoCoreError, ClientError) as err: |
| 63 | + _LOG.error(err) |
| 64 | + return None |
| 65 | + uploader.upload_missing() |
| 66 | + |
| 67 | + |
| 68 | +class S3MediaUploaderOptions(tool.ToolOptions): |
| 69 | + """Options class for S3 Media Uploader addon.""" |
| 70 | + |
| 71 | + def __init__(self, name, person_id=None): |
| 72 | + tool.ToolOptions.__init__(self, name, person_id) |
| 73 | + |
| 74 | + self.options_dict = { |
| 75 | + "bucket_name": "", |
| 76 | + "endpoint_url": "", |
| 77 | + } |
| 78 | + self.options_help = { |
| 79 | + "bucket_name": ("=str", "Name of the bucket to store files in.", "string"), |
| 80 | + "endpoint_url": ( |
| 81 | + "=str", |
| 82 | + "Altnerative endpoint URL for S3-compatible storage.", |
| 83 | + "string", |
| 84 | + ), |
| 85 | + } |
| 86 | + |
| 87 | + |
| 88 | +class S3MediaUploadHandler: |
| 89 | + """Class to upload media objects to an S3 bucket. |
| 90 | +
|
| 91 | + Based on https://github.com/DavidMStraub/gramps-webapp/. |
| 92 | + """ |
| 93 | + |
| 94 | + def __init__(self, db, bucket_name, endpoint_url=None, create=False, logger=None): |
| 95 | + """Initialize the class. |
| 96 | +
|
| 97 | + `db` is an instance of an appropriate subclass of `gramps.gen.db.base.DbReadBase`. |
| 98 | + `bucket_name` is the S3 bucket name. |
| 99 | + If `create` is True, the bucket will be created if it doesn't exist. |
| 100 | + """ |
| 101 | + self.db = db |
| 102 | + self.bucket_name = bucket_name |
| 103 | + self.s3 = boto3.resource("s3", endpoint_url=endpoint_url) |
| 104 | + self.client = boto3.client("s3", endpoint_url=endpoint_url) |
| 105 | + self.logger = logger or logging.getLogger() |
| 106 | + if create: |
| 107 | + if self.bucket_exists: |
| 108 | + self.logger.debug("Bucket {} already exists".format(bucket_name)) |
| 109 | + else: |
| 110 | + self.logger.warning( |
| 111 | + "Bucket {} not found. Creating ...".format(bucket_name) |
| 112 | + ) |
| 113 | + region_name = boto3.session.Session().region_name |
| 114 | + bucket_config = {} |
| 115 | + if region_name: |
| 116 | + bucket_config = {"LocationConstraint": region_name} |
| 117 | + self.client.create_bucket( |
| 118 | + Bucket=bucket_name, CreateBucketConfiguration=bucket_config |
| 119 | + ) |
| 120 | + self.bucket = self.s3.Bucket(self.bucket_name) |
| 121 | + self.base_path = expand_media_path(self.db.get_mediapath(), self.db) |
| 122 | + |
| 123 | + @property |
| 124 | + def bucket_exists(self): |
| 125 | + """Return boolean if the bucket exists.""" |
| 126 | + try: |
| 127 | + self.client.head_bucket(Bucket=self.bucket_name) |
| 128 | + except ClientError as err: |
| 129 | + error_code = int(err.response["Error"]["Code"]) |
| 130 | + if error_code == 404: # bucket does not exist |
| 131 | + return False |
| 132 | + return True |
| 133 | + |
| 134 | + def get_remote_objects(self): |
| 135 | + """Get a set of all names of objects (media hashes) in the bucket.""" |
| 136 | + return set(obj.key for obj in self.bucket.objects.all()) |
| 137 | + |
| 138 | + def get_local_objects(self): |
| 139 | + """Get a dictionary of handle, hash, and mime types of all media objects |
| 140 | + in the database.""" |
| 141 | + return { |
| 142 | + media_obj.handle: { |
| 143 | + "checksum": media_obj.get_checksum(), |
| 144 | + "mime": media_obj.get_mime_type(), |
| 145 | + } |
| 146 | + for media_obj in self.db.iter_media() |
| 147 | + } |
| 148 | + |
| 149 | + def get_full_path(self, handle): |
| 150 | + """Get the full local path to a media object by handle.""" |
| 151 | + media_obj = self.db.get_media_from_handle(handle) |
| 152 | + return os.path.join(self.base_path, media_obj.path) |
| 153 | + |
| 154 | + def check_checksum(self, handle, checksum): |
| 155 | + """Check the media object's checksum, returning a boolean.""" |
| 156 | + full_path = self.get_full_path(handle) |
| 157 | + new_checksum = create_checksum(full_path) |
| 158 | + return new_checksum == checksum |
| 159 | + |
| 160 | + def upload(self, handle, checksum, mime): |
| 161 | + """Upload a media object with given handle, hash, and MIME type.""" |
| 162 | + path = self.get_full_path(handle) |
| 163 | + if not os.path.exists(path): |
| 164 | + self.logger.error("File {} not found. Skipping upload".format(path)) |
| 165 | + return False |
| 166 | + if not self.check_checksum(handle, checksum): |
| 167 | + self.logger.error( |
| 168 | + "Found checksum mismatch for file {}. Skipping upload".format(path) |
| 169 | + ) |
| 170 | + self.logger.error( |
| 171 | + "Old: {}, New: {}".format(checksum, create_checksum(path)) |
| 172 | + ) |
| 173 | + return False |
| 174 | + try: |
| 175 | + self.client.upload_file( |
| 176 | + path, self.bucket_name, checksum, ExtraArgs={"ContentType": mime} |
| 177 | + ) |
| 178 | + except ClientError as err: |
| 179 | + logging.error(err) |
| 180 | + return False |
| 181 | + return True |
| 182 | + |
| 183 | + def upload_all(self): |
| 184 | + """Upload all media objects (overwriting existing ones).""" |
| 185 | + local_objects = self.get_local_objects() |
| 186 | + for handle, v in local_objects.items(): |
| 187 | + self.upload(handle, **v) |
| 188 | + |
| 189 | + def upload_missing(self): |
| 190 | + """Upload the media objects that are not yet in the bucket.""" |
| 191 | + local_objects_dict = self.get_local_objects() |
| 192 | + checksum_dict = { |
| 193 | + v["checksum"]: (handle, v["mime"]) |
| 194 | + for handle, v in local_objects_dict.items() |
| 195 | + } |
| 196 | + local_checksums = set(obj["checksum"] for obj in local_objects_dict.values()) |
| 197 | + remote_checksums = self.get_remote_objects() |
| 198 | + missing = local_checksums - remote_checksums |
| 199 | + num_missing = len(missing) |
| 200 | + self.logger.info("Found {} objects to upload.".format(num_missing)) |
| 201 | + for i, checksum in enumerate(missing): |
| 202 | + self.logger.info( |
| 203 | + "Uploading file {} of {} ({}%)".format( |
| 204 | + i + 1, num_missing, round(100 * i / num_missing) |
| 205 | + ) |
| 206 | + ) |
| 207 | + handle, mime = checksum_dict[checksum] |
| 208 | + self.upload(handle, checksum, mime) |
0 commit comments