Skip to content

Commit 8dc9d6d

Browse files
committed
🚚(back) serve legacy documents from Scaleway S3
Documents are now served from the `aws/` directory in Scaleway S3. They are served using the django-storage already in place. As content headers cannot be set to Scaleway Edge services URLs, legacy documents need to be renamed to their filenames. Adding a management command to do that.
1 parent c94c87b commit 8dc9d6d

File tree

7 files changed

+212
-37
lines changed

7 files changed

+212
-37
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Versioning](https://semver.org/spec/v2.0.0.html).
1111
### Changed
1212

1313
- Serve legacy deposited files from Scaleway S3 after AWS migration
14+
- Serve legacy documents from Scaleway S3 after AWS migration
1415

1516
## [5.10.0] - 2025-07-09
1617

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""Rename documents in Scaleway S3 for serving them with the right name."""
2+
3+
import logging
4+
5+
from django.conf import settings
6+
from django.core.management.base import BaseCommand
7+
8+
import boto3
9+
10+
from marsha.core.defaults import AWS_S3, AWS_STORAGE_BASE_DIRECTORY, READY
11+
from marsha.core.models.file import Document
12+
from marsha.core.storage.storage_class import file_storage
13+
from marsha.core.utils import time_utils
14+
15+
16+
logger = logging.getLogger(__name__)
17+
18+
scw_credentials = {
19+
"aws_access_key_id": settings.STORAGE_S3_ACCESS_KEY,
20+
"aws_secret_access_key": settings.STORAGE_S3_SECRET_KEY,
21+
"region_name": settings.STORAGE_S3_REGION_NAME,
22+
"endpoint_url": settings.STORAGE_S3_ENDPOINT_URL,
23+
}
24+
25+
# Configure the S3 client used to copy objects in the Scaleway S3 bucket
26+
s3_client = boto3.client("s3", **scw_credentials)
27+
28+
29+
class Command(BaseCommand):
    """Copy legacy documents in Scaleway S3 to a key named after their filename."""

    help = "Rename documents in Scaleway S3 to their filename."

    def validate_filename(self, value):
        """Return ``value`` with ``/`` and ``\\`` turned into ``_`` and leading dots removed."""

        without_separators = value.replace("/", "_").replace("\\", "_")
        return without_separators.lstrip(".")

    def handle(self, *args, **options):
        """Copy every ready document located on AWS S3 to its filename-based key.

        For each matching document, the source key is built from the upload
        timestamp (plus the extension when there is one) and the destination key
        from the sanitized filename, both under ``AWS_STORAGE_BASE_DIRECTORY``.
        When sanitizing changes the filename, the new value is saved on the
        document. Nothing is copied when the destination key already exists.
        """

        legacy_documents = Document.objects.filter(
            storage_location=AWS_S3, upload_state=READY
        )

        for document in legacy_documents:
            # Source object: stored under `aws/` and named after the upload timestamp
            stamp = time_utils.to_timestamp(document.uploaded_on)
            suffix = f".{document.extension}" if document.extension else ""
            file_key_src = document.get_storage_key(
                filename=f"{stamp}{suffix}", base_dir=AWS_STORAGE_BASE_DIRECTORY
            )

            # Persist the sanitized filename so the document matches the object key
            safe_name = self.validate_filename(document.filename)
            if safe_name != document.filename:
                document.filename = safe_name
                document.save()

            # Destination object: same directory, named after the document filename
            file_key_dest = document.get_storage_key(
                safe_name, base_dir=AWS_STORAGE_BASE_DIRECTORY
            )

            if file_storage.exists(file_key_dest):
                logger.info("Object %s already exists", file_key_dest)
            else:
                logger.info("Copying %s to %s", file_key_src, file_key_dest)
                s3_client.copy_object(
                    Bucket=settings.STORAGE_S3_BUCKET_NAME,
                    CopySource={
                        "Bucket": settings.STORAGE_S3_BUCKET_NAME,
                        "Key": file_key_src,
                    },
                    Key=file_key_dest,
                )

        logger.info("Finished copying!")

src/backend/marsha/core/models/file.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from django.utils.translation import gettext_lazy as _
55

66
from marsha.core.defaults import (
7+
AWS_STORAGE_BASE_DIRECTORY,
78
DELETED_STORAGE_BASE_DIRECTORY,
89
DOCUMENT_STORAGE_BASE_DIRECTORY,
910
PENDING,
@@ -227,6 +228,9 @@ def get_storage_key(
227228
passed.
228229
"""
229230
base = base_dir
231+
if base == AWS_STORAGE_BASE_DIRECTORY:
232+
return f"{base}/{self.pk}/document/{filename}"
233+
230234
if base == DELETED_STORAGE_BASE_DIRECTORY:
231235
base = f"{base}/{DOCUMENT_STORAGE_BASE_DIRECTORY}"
232236

src/backend/marsha/core/serializers/file.py

Lines changed: 5 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,22 @@
11
"""Structure of Document related models API responses with Django Rest Framework serializers."""
22

3-
from datetime import timedelta
43
import mimetypes
54
from os.path import splitext
6-
from urllib.parse import quote_plus
75

86
from django.conf import settings
97
from django.core.exceptions import ImproperlyConfigured
108
from django.urls import reverse
11-
from django.utils import timezone
129

13-
from botocore.signers import CloudFrontSigner
1410
from rest_framework import serializers
1511

16-
from marsha.core.defaults import SCW_S3
12+
from marsha.core.defaults import AWS_STORAGE_BASE_DIRECTORY, SCW_S3
1713
from marsha.core.models import Document
1814
from marsha.core.serializers.base import (
1915
TimestampField,
2016
UploadableFileWithExtensionSerializerMixin,
2117
)
2218
from marsha.core.serializers.playlist import PlaylistLiteSerializer
2319
from marsha.core.storage.storage_class import file_storage
24-
from marsha.core.utils import cloudfront_utils, time_utils
2520

2621

2722
class DocumentSerializer(
@@ -117,29 +112,12 @@ def get_url(self, obj):
117112

118113
if obj.storage_location == SCW_S3:
119114
file_key = obj.get_storage_key(self.get_filename(obj))
120-
121-
return file_storage.url(file_key)
122-
123-
# Default AWS fallback:
124-
url = (
125-
f"{settings.AWS_S3_URL_PROTOCOL}://{settings.CLOUDFRONT_DOMAIN}/{obj.pk}/document/"
126-
f"{time_utils.to_timestamp(obj.uploaded_on)}{self._get_extension_string(obj)}?response"
127-
f"-content-disposition={quote_plus('attachment; filename=' + self.get_filename(obj))}"
128-
)
129-
130-
# Sign the document urls only if the functionality is activated
131-
if settings.CLOUDFRONT_SIGNED_URLS_ACTIVE:
132-
date_less_than = timezone.now() + timedelta(
133-
seconds=settings.CLOUDFRONT_SIGNED_URLS_VALIDITY
134-
)
135-
cloudfront_signer = CloudFrontSigner(
136-
settings.CLOUDFRONT_SIGNED_PUBLIC_KEY_ID, cloudfront_utils.rsa_signer
137-
)
138-
url = cloudfront_signer.generate_presigned_url(
139-
url, date_less_than=date_less_than
115+
else:
116+
file_key = obj.get_storage_key(
117+
self.get_filename(obj), base_dir=AWS_STORAGE_BASE_DIRECTORY
140118
)
141119

142-
return url
120+
return file_storage.url(file_key)
143121

144122

145123
class DocumentSelectLTISerializer(serializers.ModelSerializer):
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""Test the ``rename_documents`` management command."""
2+
3+
from datetime import datetime, timezone
4+
from unittest import mock
5+
6+
from django.core.management import call_command
7+
from django.test import TestCase
8+
9+
from botocore.stub import Stubber
10+
11+
from marsha import settings
12+
from marsha.core.defaults import AWS_S3, PENDING, READY, SCW_S3
13+
from marsha.core.factories import DocumentFactory
14+
from marsha.core.management.commands import rename_documents
15+
from marsha.core.models.file import Document
16+
from marsha.core.utils import time_utils
17+
18+
19+
class RenameDocumentsTestCase(TestCase):
    """
    Test the ``rename_documents`` command.
    """

    @mock.patch("marsha.core.storage.storage_class.file_storage.exists")
    def test_rename_documents(self, mock_exists):
        """Command should rename document S3 objects to their filename."""

        # No destination object exists yet, so every selected document is copied
        mock_exists.return_value = False

        now = datetime(2018, 8, 8, tzinfo=timezone.utc)

        with Stubber(rename_documents.s3_client) as s3_client_stubber:
            # Generate some documents
            # (<original filename>, <expected and cleaned>)
            filenames = [
                ("normal_filename.pdf", "normal_filename.pdf"),
                ("weird\\file/name.pdf", "weird_file_name.pdf"),
                (".hidden_file", "hidden_file"),
            ]

            documents = []
            # Expected filename per document, taken from the table above rather
            # than recomputed with the code under test
            expected_by_pk = {}
            for filename_src, filename_expected in filenames:
                document = DocumentFactory(
                    filename=filename_src,
                    uploaded_on=now,
                    upload_state=READY,
                    storage_location=AWS_S3,
                )
                documents.append(document)
                expected_by_pk[document.pk] = filename_expected

            # Create mocks for copy_objects with Stubber
            # Note: Stubber requires that its mocks are called in the exact order they
            # were created, so we must iterate over objects.all() in the same sequence
            for document in Document.objects.all():
                stamp = time_utils.to_timestamp(document.uploaded_on)
                extension = "." + document.extension if document.extension else ""

                file_key_src = f"aws/{document.id}/document/{stamp}{extension}"

                expected_filename = expected_by_pk[document.pk]
                file_key_dest = f"aws/{document.id}/document/{expected_filename}"

                expected_params = {
                    "Bucket": settings.STORAGE_S3_BUCKET_NAME,
                    "CopySource": {
                        "Bucket": settings.STORAGE_S3_BUCKET_NAME,
                        "Key": file_key_src,
                    },
                    "Key": file_key_dest,
                }
                s3_client_stubber.add_response("copy_object", {}, expected_params)

            # Create some documents that should not be concerned
            DocumentFactory(
                upload_state=READY,
                storage_location=SCW_S3,
            )
            DocumentFactory(
                upload_state=PENDING,
                storage_location=AWS_S3,
            )

            call_command("rename_documents")

            s3_client_stubber.assert_no_pending_responses()

            # Check that each document.filename has been updated with the clean
            # S3-compatible filename
            for document, (_, expected_filename) in zip(documents, filenames):
                document.refresh_from_db()
                assert document.filename == expected_filename

    @mock.patch("marsha.core.storage.storage_class.file_storage.exists")
    def test_rename_documents_file_exists(self, mock_exists):
        """Command should not copy document if file already exists."""

        mock_exists.return_value = True

        now = datetime(2018, 8, 8, tzinfo=timezone.utc)

        # The document must match the command's filter (ready and located on
        # AWS S3), otherwise it is never processed and the "already exists"
        # branch is not exercised at all.
        document = DocumentFactory(
            filename="filename.pdf",
            uploaded_on=now,
            upload_state=READY,
            storage_location=AWS_S3,
        )

        # No response is queued on the stubber, so a copy_object call would fail
        with Stubber(rename_documents.s3_client) as s3_client_stubber:
            call_command("rename_documents")
            s3_client_stubber.assert_no_pending_responses()

        # The command must have checked the destination key before skipping
        mock_exists.assert_called_with(f"aws/{document.pk}/document/filename.pdf")

src/backend/marsha/core/tests/test_api_document.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def test_api_document_fetch_student(self):
5656
content, {"detail": "You do not have permission to perform this action."}
5757
)
5858

59-
@override_settings(CLOUDFRONT_SIGNED_URLS_ACTIVE=False)
59+
@override_settings(MEDIA_URL="https://abc.svc.edge.scw.cloud/")
6060
def test_api_document_fetch_instructor_on_aws(self):
6161
"""An instructor should be able to fetch a document."""
6262
document = DocumentFactory(
@@ -88,9 +88,8 @@ def test_api_document_fetch_instructor_on_aws(self):
8888
"is_ready_to_show": True,
8989
"title": document.title,
9090
"upload_state": "ready",
91-
"url": "https://abc.cloudfront.net/4c51f469-f91e-4998-b438-e31ee3bd3ea6/"
92-
"document/1533686400.pdf"
93-
"?response-content-disposition=attachment%3B+filename%3Dfoo_bar-baz.pdf",
91+
"url": "https://abc.svc.edge.scw.cloud/aws/4c51f469-f91e-4998-b438-e31ee3bd3ea6/"
92+
"document/foo_bar-baz.pdf",
9493
"show_download": True,
9594
"playlist": {
9695
"id": str(document.playlist.id),

src/backend/marsha/core/tests/views/test_public_document.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def tearDown(self):
2727
super().tearDown()
2828
cache.clear()
2929

30+
@override_settings(MEDIA_URL="https://abc.svc.edge.scw.cloud/")
3031
def test_document_publicly_accessible_on_aws(self):
3132
"""Validate to access to a public document."""
3233
document = DocumentFactory(
@@ -85,9 +86,8 @@ def test_document_publicly_accessible_on_aws(self):
8586
"lti_id": "course-v1:ufr+mathematics+00001",
8687
},
8788
"url": (
88-
"https://abc.cloudfront.net/301b5f4f-b9f1-4a5f-897d-f8f1bf22c396"
89-
"/document/1569309880?response-content-disposition=attachment%3B"
90-
"+filename%3Dplaylist-003_document-001"
89+
"https://abc.svc.edge.scw.cloud/aws/301b5f4f-b9f1-4a5f-897d-f8f1bf22c396"
90+
"/document/playlist-003_document-001"
9191
),
9292
},
9393
)
@@ -192,6 +192,7 @@ def test_document_publicly_accessible_on_scw(self):
192192
"frame-ancestors trusted_domain.com *.trusted_domain.com;",
193193
)
194194

195+
@override_settings(MEDIA_URL="https://abc.svc.edge.scw.cloud/")
195196
def test_public_document_without_consumer_site(self):
196197
"""Public document without consumer site should have x-frame-options header"""
197198
organization = OrganizationFactory()
@@ -249,9 +250,8 @@ def test_public_document_without_consumer_site(self):
249250
"lti_id": None,
250251
},
251252
"url": (
252-
f"https://abc.cloudfront.net/{document.pk}"
253-
"/document/1569309880?response-content-disposition=attachment%3B"
254-
"+filename%3Dplaylist-003_document-001"
253+
f"https://abc.svc.edge.scw.cloud/aws/{document.pk}"
254+
"/document/playlist-003_document-001"
255255
),
256256
},
257257
)

0 commit comments

Comments
 (0)