Skip to content

Commit b4ab037

Browse files
authored
[SDBELGA-1051] improve(s3): Use ObjectId and store filename in metadata (#3169)
* [SDBELGA-1051] improve(s3): Use ObjectId and store filename in metadata * fix s3 tests * remove commented test code * Update pydocs, convert filename to ASCII, fix test mock args order * fix unicode filename test
1 parent b235511 commit b4ab037

2 files changed

Lines changed: 75 additions & 67 deletions

File tree

superdesk/storage/amazon_media_storage.py

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,10 @@
1818
import time
1919
import unidecode
2020

21-
from os.path import splitext
22-
from urllib.parse import urlparse
2321
from botocore.client import Config
22+
from bson import ObjectId
2423

25-
from superdesk.media.media_operations import download_file_from_url, guess_media_extension
24+
from superdesk.media.media_operations import download_file_from_url
2625
from .superdesk_file import SuperdeskFile
2726
from superdesk.utc import query_datetime
2827
from . import SuperdeskMediaStorage
@@ -35,7 +34,7 @@
3534

3635

3736
class AmazonObjectWrapper(SuperdeskFile):
38-
def __init__(self, s3_object, name, metadata):
37+
def __init__(self, s3_object, key, metadata):
3938
super().__init__()
4039

4140
s3_body = s3_object["Body"]
@@ -48,12 +47,11 @@ def __init__(self, s3_object, name, metadata):
4847
self.seek(0)
4948
self.content_type = s3_object["ContentType"]
5049
self.length = int(s3_object["ContentLength"])
51-
self._name = name
52-
self.filename = name
53-
self.metadata = metadata
50+
self.metadata = metadata or {}
51+
self._name = self.filename = self.metadata.get("filename") or key
5452
self.upload_date = s3_object["LastModified"]
5553
self.md5 = s3_object["ETag"][1:-1]
56-
self._id = name
54+
self._id = key
5755

5856

5957
class AmazonMediaStorage(SuperdeskMediaStorage):
@@ -121,19 +119,21 @@ def get_translation_table():
121119

122120
return unidecode.unidecode(str(_id)).translate(get_translation_table())
123121

124-
def media_id(self, filename, content_type=None, version=True):
125-
"""Get the ``media_id`` path for the given ``filename``.
126-
127-
if filename doesn't have an extension one is guessed,
128-
and additional *version* option to have automatic version or not to have,
129-
or to send a `string` one.
122+
def media_id(self, version: bool | str = True) -> str:
123+
"""
124+
Generates a media ID string using a configurable time-based prefix and a unique identifier.
125+
126+
This method generates a media ID based on a configurable time granularity or provided
127+
custom version. The time granularity settings can be 'hourly', 'daily', or 'none'. A unique
128+
identifier is appended to the generated string, ensuring a unique media ID is produced
129+
each time the method is called.
130+
131+
:param version: Determines the versioning format. If True, an automatic time-based
132+
version is generated based on the 'AMAZON_MEDIA_ID_TIME_PREFIX' configuration
133+
from the application. If False, no version prefix is included. If a string is
134+
provided, it is used as a custom prefix after stripping any surrounding slashes.
135+
:returns: A unique media ID string composed of the specified versioning format and a unique identifier.
130136
"""
131-
path = urlparse(filename).path.split("/")[-1]
132-
file_extension = splitext(path)[1]
133-
134-
extension = ""
135-
if not file_extension:
136-
extension = str(guess_media_extension(content_type)) if content_type else ""
137137

138138
if version is True:
139139
folder_granularity = str(self.app.config.get("AMAZON_MEDIA_ID_TIME_PREFIX", "hourly")).lower()
@@ -150,7 +150,7 @@ def media_id(self, filename, content_type=None, version=True):
150150
else:
151151
version = "%s/" % version.strip("/")
152152

153-
return "%s%s%s" % (version, self._make_s3_safe(filename), extension)
153+
return f"{version}{ObjectId()}"
154154

155155
def fetch_rendition(self, rendition):
156156
stream, name, mime = download_file_from_url(rendition.get("href"))
@@ -211,7 +211,9 @@ def _get_all_keys_in_batches(self):
211211
def extract_metadata_from_headers(self, request_headers):
212212
headers = {}
213213
for key, value in request_headers.items():
214-
if self.user_metadata_header in key:
214+
if key == "filename":
215+
headers["filename"] = value
216+
elif self.user_metadata_header in key:
215217
new_key = key.split(self.user_metadata_header)[1]
216218
if value:
217219
try:
@@ -258,7 +260,7 @@ def put(
258260
content_type = self._get_mimetype(content, filename, content_type)
259261

260262
if not _id:
261-
_id = self.media_id(filename, content_type=content_type, version=version)
263+
_id = self.media_id(version)
262264

263265
if folder:
264266
_id = "%s/%s" % (folder.rstrip("/"), _id)
@@ -273,6 +275,10 @@ def put(
273275
# probably better to turn on/off public-read on the bucket instead
274276
kwargs["ACL"] = acl
275277

278+
if filename:
279+
kwargs.setdefault("Metadata", {})
280+
kwargs["Metadata"]["filename"] = self._make_s3_safe(filename)
281+
276282
try:
277283
self.call("put_object", Key=_id, Body=content, ContentType=content_type, **kwargs)
278284
return _id

tests/storage/amazon_media_storage_test.py

Lines changed: 46 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -28,41 +28,40 @@ def setUp(self):
2828
p.start()
2929
self.addCleanup(p.stop)
3030

31-
def test_media_url(self):
32-
filename = "test"
31+
@patch("superdesk.storage.amazon_media_storage.ObjectId", return_value="507f1f77bcf86cd799439011")
32+
def test_media_url(self, _mock_object_id):
3333
# automatic version is set on hourly granularity.
3434
time_id = time.strftime("%Y%m%d%H")
35-
media_id = self.amazon.media_id(filename)
36-
self.assertEqual("%s/%s" % (time_id, filename), media_id)
35+
media_id = self.amazon.media_id()
36+
self.assertEqual(f"{time_id}/507f1f77bcf86cd799439011", media_id)
3737
self.assertEqual(self.amazon.url_for_media(media_id), "https://acname.s3-us-east-1.amazonaws.com/%s" % media_id)
3838
sub = "test-sub"
3939
settings = {"AMAZON_S3_SUBFOLDER": sub, "MEDIA_PREFIX": "https://acname.s3-us-east-1.amazonaws.com/" + sub}
4040
with patch.dict(self.app.config, settings):
41-
media_id = self.amazon.media_id(filename)
42-
self.assertEqual("%s/%s" % (time_id, filename), media_id)
41+
media_id = self.amazon.media_id()
42+
self.assertEqual(f"{time_id}/507f1f77bcf86cd799439011", media_id)
4343
path = "%s/%s" % (sub, media_id)
4444
self.assertEqual(self.amazon.url_for_media(media_id), "https://acname.s3-us-east-1.amazonaws.com/%s" % path)
4545
with patch.object(self.amazon, "client") as s3:
4646
self.amazon.get(media_id)
4747
self.assertTrue(s3.get_object.called)
4848
self.assertEqual(s3.get_object.call_args[1], dict(Bucket="acname", Key=path))
4949

50-
def test_media_id_time_prefix(self):
51-
filename = "test"
52-
50+
@patch("superdesk.storage.amazon_media_storage.ObjectId", return_value="507f1f77bcf86cd799439011")
51+
def test_media_id_time_prefix(self, _mock_object_id):
5352
with patch.dict(self.app.config, {"AMAZON_MEDIA_ID_TIME_PREFIX": "none"}):
54-
media_id = self.amazon.media_id(filename)
55-
self.assertEqual(filename, media_id)
53+
media_id = self.amazon.media_id()
54+
self.assertEqual("507f1f77bcf86cd799439011", media_id)
5655

5756
with patch("superdesk.storage.amazon_media_storage.time.strftime", return_value="20260102"):
5857
with patch.dict(self.app.config, {"AMAZON_MEDIA_ID_TIME_PREFIX": "daily"}):
59-
media_id = self.amazon.media_id(filename)
60-
self.assertEqual("20260102/%s" % filename, media_id)
58+
media_id = self.amazon.media_id()
59+
self.assertEqual("20260102/507f1f77bcf86cd799439011", media_id)
6160

6261
with patch("superdesk.storage.amazon_media_storage.time.strftime", return_value="2026010215"):
6362
with patch.dict(self.app.config, {"AMAZON_MEDIA_ID_TIME_PREFIX": "hourly"}):
64-
media_id = self.amazon.media_id(filename)
65-
self.assertEqual("2026010215/%s" % filename, media_id)
63+
media_id = self.amazon.media_id()
64+
self.assertEqual("2026010215/507f1f77bcf86cd799439011", media_id)
6665

6766
def test_put_and_delete(self):
6867
"""Test amazon if configured.
@@ -83,22 +82,24 @@ def test_put_and_delete(self):
8382
else:
8483
self.assertTrue(True)
8584

86-
def test_put_into_folder(self):
85+
@patch("superdesk.storage.amazon_media_storage.ObjectId", return_value="507f1f77bcf86cd799439011")
86+
@patch("superdesk.storage.amazon_media_storage.time.strftime", return_value="2026010215")
87+
def test_put_into_folder(self, mock_time_strftime, _mock_object_id):
8788
data = b"test data"
8889
folder = "s3test"
8990
filename = "abc123.zip"
9091
content_type = "application/zip"
9192
self.amazon.client.put_object = Mock()
92-
self.amazon.media_id = Mock(return_value=filename)
9393
self.amazon._check_exists = Mock(return_value=False)
9494

9595
self.amazon.put(data, filename, content_type, folder=folder)
9696

9797
kwargs = {
98-
"Key": "{}/{}".format(folder, filename),
98+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
9999
"Body": data,
100100
"Bucket": "acname",
101101
"ContentType": content_type,
102+
"Metadata": {"filename": filename},
102103
}
103104
self.amazon.client.put_object.assert_called_once_with(**kwargs)
104105

@@ -147,30 +148,28 @@ def test_guess_extension(self):
147148
# leave empty when there is no extension
148149
self.assertEqual("", guess_media_extension("audio/foo"))
149150

150-
def test_media_url_none_utf8(self):
151+
@patch("superdesk.storage.amazon_media_storage.ObjectId", return_value="507f1f77bcf86cd799439011")
152+
def test_media_url_none_utf8(self, _mock_object_id):
151153
filename = "[DIARY NOTE] – Victory In The Pacific Day Commemoration - Thursday (1)"
152154
# automatic version is set on hourly granularity.
153155
time_id = time.strftime("%Y%m%d%H")
154-
media_id = self.amazon.media_id(filename)
155-
self.assertEqual(
156-
"%s/%s" % (time_id, "DIARY NOTE - Victory In The Pacific Day Commemoration - Thursday (1)"), media_id
157-
)
156+
media_id = self.amazon.media_id()
157+
self.assertEqual(f"{time_id}/507f1f77bcf86cd799439011", media_id)
158158
self.assertEqual(self.amazon.url_for_media(media_id), "https://acname.s3-us-east-1.amazonaws.com/%s" % media_id)
159159
sub = "test-sub"
160160
settings = {"AMAZON_S3_SUBFOLDER": sub, "MEDIA_PREFIX": "https://acname.s3-us-east-1.amazonaws.com/" + sub}
161161
with patch.dict(self.app.config, settings):
162-
media_id = self.amazon.media_id(filename)
163-
self.assertEqual(
164-
"%s/%s" % (time_id, "DIARY NOTE - Victory In The Pacific Day Commemoration - Thursday (1)"), media_id
165-
)
162+
media_id = self.amazon.media_id()
163+
self.assertEqual(f"{time_id}/507f1f77bcf86cd799439011", media_id)
166164
path = "%s/%s" % (sub, media_id)
167165
self.assertEqual(self.amazon.url_for_media(media_id), "https://acname.s3-us-east-1.amazonaws.com/%s" % path)
168166
with patch.object(self.amazon, "client") as s3:
169167
self.amazon.get(media_id)
170168
self.assertTrue(s3.get_object.called)
171169
self.assertEqual(s3.get_object.call_args[1], dict(Bucket="acname", Key=path))
172170

173-
def test_put_into_folder_none_utf8(self):
171+
@patch("superdesk.storage.amazon_media_storage.ObjectId", return_value="507f1f77bcf86cd799439011")
172+
def test_put_into_folder_none_utf8(self, _mock_object_id):
174173
data = b"test data"
175174
folder = "s3test"
176175
filename = "[DIARY NOTE] – Victory In The Pacific Day Commemoration - Thursday (1).pdf"
@@ -181,10 +180,11 @@ def test_put_into_folder_none_utf8(self):
181180
self.amazon.put(data, filename, content_type, folder=folder, version=False)
182181

183182
kwargs = {
184-
"Key": "{}/{}".format(folder, "DIARY NOTE - Victory In The Pacific Day Commemoration - Thursday (1).pdf"),
183+
"Key": f"{folder}/507f1f77bcf86cd799439011",
185184
"Body": data,
186185
"Bucket": "acname",
187186
"ContentType": content_type,
187+
"Metadata": {"filename": "DIARY NOTE - Victory In The Pacific Day Commemoration - Thursday (1).pdf"},
188188
}
189189
self.amazon.client.put_object.assert_called_once_with(**kwargs)
190190

@@ -197,22 +197,24 @@ def test_get_none_utf8(self):
197197
kwargs = {"Bucket": "acname", "Key": "DIARY NOTE - Victory In The Pacific Day Commemoration - Thursday (1).pdf"}
198198
self.amazon.client.get_object.assert_called_once_with(**kwargs)
199199

200-
def test_mimetype_detect(self):
200+
@patch("superdesk.storage.amazon_media_storage.ObjectId", return_value="507f1f77bcf86cd799439011")
201+
@patch("superdesk.storage.amazon_media_storage.time.strftime", return_value="2026010215")
202+
def test_mimetype_detect(self, mock_time_strftime, _mock_object_id):
201203
# keep default mimetype
202204
content = b"bytes are here"
203205
filename = "extensionless"
204206
content_type = "text/css"
205207
folder = "f1"
206208
self.amazon.client.put_object = Mock()
207209
self.amazon._check_exists = Mock(return_value=False)
208-
self.amazon.media_id = Mock(return_value=filename)
209210
self.amazon.put(content, filename, content_type, folder=folder)
210211
self.amazon.client.put_object.assert_called_once_with(
211212
**{
212-
"Key": "{}/{}".format(folder, filename),
213+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
213214
"Body": content,
214215
"Bucket": "acname",
215216
"ContentType": content_type,
217+
"Metadata": {"filename": filename},
216218
}
217219
)
218220

@@ -223,14 +225,14 @@ def test_mimetype_detect(self):
223225
folder = "f1"
224226
self.amazon.client.put_object = Mock()
225227
self.amazon._check_exists = Mock(return_value=False)
226-
self.amazon.media_id = Mock(return_value=filename)
227228
self.amazon.put(content, filename, content_type, folder=folder)
228229
self.amazon.client.put_object.assert_called_once_with(
229230
**{
230-
"Key": "{}/{}".format(folder, filename),
231+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
231232
"Body": b"bytes are here",
232233
"Bucket": "acname",
233234
"ContentType": "text/css",
235+
"Metadata": {"filename": filename},
234236
}
235237
)
236238

@@ -240,14 +242,14 @@ def test_mimetype_detect(self):
240242
folder = "f1"
241243
self.amazon.client.put_object = Mock()
242244
self.amazon._check_exists = Mock(return_value=False)
243-
self.amazon.media_id = Mock(return_value=filename)
244245
self.amazon.put(content, filename, content_type, folder=folder)
245246
self.amazon.client.put_object.assert_called_once_with(
246247
**{
247-
"Key": "{}/{}".format(folder, filename),
248+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
248249
"Body": b"bytes are here",
249250
"Bucket": "acname",
250251
"ContentType": "image/jpeg",
252+
"Metadata": {"filename": filename},
251253
}
252254
)
253255

@@ -260,14 +262,14 @@ def test_mimetype_detect(self):
260262
folder = "f1"
261263
self.amazon.client.put_object = Mock()
262264
self.amazon._check_exists = Mock(return_value=False)
263-
self.amazon.media_id = Mock(return_value=filename)
264265
self.amazon.put(content, filename, content_type, folder=folder)
265266
self.amazon.client.put_object.assert_called_once_with(
266267
**{
267-
"Key": "{}/{}".format(folder, filename),
268+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
268269
"Body": content,
269270
"Bucket": "acname",
270271
"ContentType": "image/jpeg",
272+
"Metadata": {"filename": filename},
271273
}
272274
)
273275

@@ -277,14 +279,14 @@ def test_mimetype_detect(self):
277279
folder = "f1"
278280
self.amazon.client.put_object = Mock()
279281
self.amazon._check_exists = Mock(return_value=False)
280-
self.amazon.media_id = Mock(return_value=filename)
281282
self.amazon.put(content, filename, content_type, folder=folder)
282283
self.amazon.client.put_object.assert_called_once_with(
283284
**{
284-
"Key": "{}/{}".format(folder, filename),
285+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
285286
"Body": content,
286287
"Bucket": "acname",
287288
"ContentType": "application/vnd.ms-excel",
289+
"Metadata": {"filename": filename},
288290
}
289291
)
290292

@@ -294,14 +296,14 @@ def test_mimetype_detect(self):
294296
folder = "f1"
295297
self.amazon.client.put_object = Mock()
296298
self.amazon._check_exists = Mock(return_value=False)
297-
self.amazon.media_id = Mock(return_value=filename)
298299
self.amazon.put(content, filename, content_type, folder=folder)
299300
self.amazon.client.put_object.assert_called_once_with(
300301
**{
301-
"Key": "{}/{}".format(folder, filename),
302+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
302303
"Body": content,
303304
"Bucket": "acname",
304305
"ContentType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
306+
"Metadata": {"filename": filename},
305307
}
306308
)
307309

@@ -311,13 +313,13 @@ def test_mimetype_detect(self):
311313
folder = "f1"
312314
self.amazon.client.put_object = Mock()
313315
self.amazon._check_exists = Mock(return_value=False)
314-
self.amazon.media_id = Mock(return_value=filename)
315316
self.amazon.put(content, filename, content_type, folder=folder)
316317
self.amazon.client.put_object.assert_called_once_with(
317318
**{
318-
"Key": "{}/{}".format(folder, filename),
319+
"Key": f"{folder}/2026010215/507f1f77bcf86cd799439011",
319320
"Body": content,
320321
"Bucket": "acname",
321322
"ContentType": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
323+
"Metadata": {"filename": filename},
322324
}
323325
)

0 commit comments

Comments
 (0)