Skip to content

Commit 6daf144

Browse files
authored
fix: FIT-720: Fix binary octet processing in image caching (#9378)
1 parent 1d80f3b commit 6daf144

File tree

9 files changed

+438
-7
lines changed

9 files changed

+438
-7
lines changed

.pre-commit-dev.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ repos:
1414
rev: "v0.1.0"
1515
hooks:
1616
- id: biome-check
17-
args: [ --config-path, ./web ]
17+
args: [ --config-path, ./web, --diagnostic-level=error ]
1818
additional_dependencies: [ "@biomejs/biome@1.9.4" ]
1919
files: ^web/.*
2020
- repo: local

label_studio/data_manager/actions/experimental.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ def add_data_field(project, queryset, **kwargs):
165165
value = request.data.get('value')
166166
size = queryset.count()
167167

168+
# Save a task ID before the update, because queryset filters may no longer
169+
# match after we modify the data field (e.g. changing the filtered column value).
170+
first_task_id = queryset.values_list('id', flat=True).first()
171+
168172
cast = {'String': str, 'Number': float, 'Expression': str}
169173
assert value_type in cast.keys()
170174
value = cast[value_type](value)
@@ -192,7 +196,13 @@ def add_data_field(project, queryset, **kwargs):
192196
)
193197
)
194198

195-
project.summary.update_data_columns([queryset.first()])
199+
# Fetch the task by saved ID since the original queryset filters
200+
# may no longer match after the data field was modified.
201+
if first_task_id is not None:
202+
first_task = Task.objects.filter(id=first_task_id).first()
203+
if first_task is not None:
204+
project.summary.update_data_columns([first_task])
205+
196206
return {'response_code': 200, 'detail': f'Updated {size} tasks'}
197207

198208

label_studio/io_storages/proxy_api.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import base64
22
import logging
3+
import mimetypes
34
import time
45
from typing import Union
56
from urllib.parse import unquote
@@ -246,6 +247,13 @@ def proxy_data_from_storage(self, request, uri, project, storage):
246247
status=status.HTTP_424_FAILED_DEPENDENCY,
247248
)
248249

250+
# Detect content type from URI file extension as a safety net for all storage types
251+
# when the storage returns a generic type like binary/octet-stream or application/octet-stream
252+
if not content_type or 'octet-stream' in content_type:
253+
guessed_type, _ = mimetypes.guess_type(uri)
254+
if guessed_type:
255+
content_type = guessed_type
256+
249257
# Create time-limited stream
250258
time_limited_stream = self.time_limited_chunker(stream)
251259

label_studio/io_storages/s3/tests/__init__.py

Whitespace-only changes.
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""Tests for resolve_s3_url content type handling.
2+
3+
Verifies that presigned URLs include ResponseContentType based on file extension,
4+
so S3 returns the correct MIME type even if objects were uploaded without one.
5+
"""
6+
import unittest
7+
from unittest.mock import MagicMock
8+
9+
from io_storages.s3.utils import resolve_s3_url
10+
11+
12+
class TestResolveS3UrlContentType(unittest.TestCase):
13+
"""Test that resolve_s3_url adds ResponseContentType to presigned URL params."""
14+
15+
def setUp(self):
16+
self.mock_client = MagicMock()
17+
self.mock_client.generate_presigned_url.return_value = 'https://bucket.s3.amazonaws.com/presigned'
18+
19+
def test_jpeg_gets_response_content_type(self):
20+
"""Presigned URL for .jpg should include ResponseContentType=image/jpeg"""
21+
resolve_s3_url('s3://bucket/path/image.jpg', self.mock_client, presign=True)
22+
23+
call_args = self.mock_client.generate_presigned_url.call_args
24+
params = (
25+
call_args[1]['Params']
26+
if 'Params' in call_args[1]
27+
else call_args[0][1]
28+
if len(call_args[0]) > 1
29+
else call_args[1].get('Params')
30+
)
31+
assert params['ResponseContentType'] == 'image/jpeg'
32+
33+
def test_png_gets_response_content_type(self):
34+
"""Presigned URL for .png should include ResponseContentType=image/png"""
35+
resolve_s3_url('s3://bucket/path/image.png', self.mock_client, presign=True)
36+
37+
call_args = self.mock_client.generate_presigned_url.call_args
38+
params = call_args[1]['Params']
39+
assert params['ResponseContentType'] == 'image/png'
40+
41+
def test_mp4_gets_response_content_type(self):
42+
"""Presigned URL for .mp4 should include ResponseContentType=video/mp4"""
43+
resolve_s3_url('s3://bucket/videos/clip.mp4', self.mock_client, presign=True)
44+
45+
call_args = self.mock_client.generate_presigned_url.call_args
46+
params = call_args[1]['Params']
47+
assert params['ResponseContentType'] == 'video/mp4'
48+
49+
def test_unknown_extension_no_response_content_type(self):
50+
"""Presigned URL for unknown extension should NOT include ResponseContentType"""
51+
resolve_s3_url('s3://bucket/data/file.xyz123', self.mock_client, presign=True)
52+
53+
call_args = self.mock_client.generate_presigned_url.call_args
54+
params = call_args[1]['Params']
55+
assert 'ResponseContentType' not in params
56+
57+
def test_no_extension_no_response_content_type(self):
58+
"""Presigned URL for file without extension should NOT include ResponseContentType"""
59+
resolve_s3_url('s3://bucket/data/README', self.mock_client, presign=True)
60+
61+
call_args = self.mock_client.generate_presigned_url.call_args
62+
params = call_args[1]['Params']
63+
assert 'ResponseContentType' not in params
64+
65+
def test_json_gets_response_content_type(self):
66+
"""Presigned URL for .json should include ResponseContentType=application/json"""
67+
resolve_s3_url('s3://bucket/data/tasks.json', self.mock_client, presign=True)
68+
69+
call_args = self.mock_client.generate_presigned_url.call_args
70+
params = call_args[1]['Params']
71+
assert params['ResponseContentType'] == 'application/json'
72+
73+
def test_tiff_gets_response_content_type(self):
74+
"""Presigned URL for .tiff should include ResponseContentType=image/tiff"""
75+
resolve_s3_url('s3://bucket/scans/scan.tiff', self.mock_client, presign=True)
76+
77+
call_args = self.mock_client.generate_presigned_url.call_args
78+
params = call_args[1]['Params']
79+
assert params['ResponseContentType'] == 'image/tiff'
80+
81+
def test_presign_false_does_not_use_response_content_type(self):
82+
"""When presign=False, blob is returned directly - no presigned URL generated"""
83+
mock_body = MagicMock()
84+
mock_body.read.return_value = b'fake image data'
85+
self.mock_client.get_object.return_value = {
86+
'ResponseMetadata': {'HTTPHeaders': {'content-type': 'binary/octet-stream'}},
87+
'Body': mock_body,
88+
}
89+
90+
result = resolve_s3_url('s3://bucket/image.jpg', self.mock_client, presign=False)
91+
92+
# Should return base64 data, not call generate_presigned_url
93+
self.mock_client.generate_presigned_url.assert_not_called()
94+
assert result.startswith('data:binary/octet-stream;base64,')
95+
96+
def test_bucket_and_key_always_present(self):
97+
"""Bucket and Key should always be in params regardless of content type detection"""
98+
resolve_s3_url('s3://my-bucket/path/to/file.jpg', self.mock_client, presign=True)
99+
100+
call_args = self.mock_client.generate_presigned_url.call_args
101+
params = call_args[1]['Params']
102+
assert params['Bucket'] == 'my-bucket'
103+
assert params['Key'] == 'path/to/file.jpg'
104+
105+
def test_expires_in_passed_through(self):
106+
"""ExpiresIn parameter should be passed through to generate_presigned_url"""
107+
resolve_s3_url('s3://bucket/file.jpg', self.mock_client, presign=True, expires_in=120)
108+
109+
call_args = self.mock_client.generate_presigned_url.call_args
110+
assert call_args[1]['ExpiresIn'] == 120

label_studio/io_storages/s3/utils.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import base64
44
import fnmatch
55
import logging
6+
import mimetypes
67
import re
78
from urllib.parse import urlparse
89

@@ -55,9 +56,17 @@ def resolve_s3_url(url, client, presign=True, expires_in=3600):
5556

5657
# Otherwise try to generate presigned url
5758
try:
58-
presigned_url = client.generate_presigned_url(
59-
ClientMethod='get_object', Params={'Bucket': bucket_name, 'Key': key}, ExpiresIn=expires_in
60-
)
59+
params = {'Bucket': bucket_name, 'Key': key}
60+
61+
# Override response Content-Type based on file extension so that S3 returns
62+
# the correct MIME type even if the object was uploaded without one.
63+
# Without this, S3 may return binary/octet-stream which causes browsers
64+
# to download files instead of displaying them inline (e.g. images, audio, video).
65+
content_type, _ = mimetypes.guess_type(key)
66+
if content_type:
67+
params['ResponseContentType'] = content_type
68+
69+
presigned_url = client.generate_presigned_url(ClientMethod='get_object', Params=params, ExpiresIn=expires_in)
6170
except ClientError as exc:
6271
logger.warning(f"Can't generate presigned URL. Reason: {exc}")
6372
return url

label_studio/io_storages/tests/test_proxy_api.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,117 @@ def test_redirect_to_presign_url_exception(self):
112112
result = self.mixin.redirect_to_presign_url('fileuri', self.task, 'Task')
113113
assert result.status_code == status.HTTP_404_NOT_FOUND
114114

115+
def test_proxy_data_from_storage_content_type_fallback_for_octet_stream(self):
116+
"""Test that proxy detects correct content type from URI when storage returns octet-stream.
117+
118+
S3 objects uploaded without explicit Content-Type often have binary/octet-stream.
119+
The proxy should detect the correct type from the URI file extension.
120+
"""
121+
mock_storage = MagicMock()
122+
mock_stream = MagicMock()
123+
mock_metadata = {
124+
'StatusCode': 200,
125+
'ContentLength': 1000,
126+
'LastModified': datetime.now(),
127+
'ETag': '"abcdef123456"',
128+
}
129+
# Storage returns binary/octet-stream (common S3 default for missing Content-Type)
130+
mock_storage.get_bytes_stream.return_value = (mock_stream, 'binary/octet-stream', mock_metadata)
131+
mock_project = MagicMock()
132+
133+
with patch('io_storages.proxy_api.StreamingHttpResponse') as mock_response_class, patch(
134+
'io_storages.proxy_api.settings'
135+
) as mock_settings:
136+
mock_settings.RESOLVER_PROXY_MAX_RANGE_SIZE = 1024 * 1024
137+
mock_settings.RESOLVER_PROXY_BUFFER_SIZE = 8192
138+
mock_settings.RESOLVER_PROXY_CACHE_TIMEOUT = 3600
139+
mock_settings.RESOLVER_PROXY_TIMEOUT = 20
140+
mock_settings.RESOLVER_PROXY_ENABLE_ETAG_CACHE = False
141+
142+
mock_response = MagicMock()
143+
mock_response.headers = {}
144+
mock_response_class.return_value = mock_response
145+
self.request.headers = {}
146+
147+
# URI with .jpg extension - should be detected as image/jpeg
148+
self.mixin.proxy_data_from_storage(self.request, 's3://bucket/photo.jpg', mock_project, mock_storage)
149+
150+
# Verify StreamingHttpResponse was called with image/jpeg, not binary/octet-stream
151+
call_args, call_kwargs = mock_response_class.call_args
152+
assert call_kwargs.get('content_type') == 'image/jpeg' or (
153+
len(call_args) > 1 and call_args[1] == 'image/jpeg'
154+
), f'Expected content_type=image/jpeg, got: args={call_args}, kwargs={call_kwargs}'
155+
156+
def test_proxy_data_from_storage_content_type_fallback_for_application_octet_stream(self):
157+
"""Test fallback for application/octet-stream (another generic type)."""
158+
mock_storage = MagicMock()
159+
mock_stream = MagicMock()
160+
mock_metadata = {
161+
'StatusCode': 200,
162+
'ContentLength': 5000,
163+
'LastModified': datetime.now(),
164+
'ETag': '"xyz789"',
165+
}
166+
mock_storage.get_bytes_stream.return_value = (mock_stream, 'application/octet-stream', mock_metadata)
167+
mock_project = MagicMock()
168+
169+
with patch('io_storages.proxy_api.StreamingHttpResponse') as mock_response_class, patch(
170+
'io_storages.proxy_api.settings'
171+
) as mock_settings:
172+
mock_settings.RESOLVER_PROXY_MAX_RANGE_SIZE = 1024 * 1024
173+
mock_settings.RESOLVER_PROXY_BUFFER_SIZE = 8192
174+
mock_settings.RESOLVER_PROXY_CACHE_TIMEOUT = 3600
175+
mock_settings.RESOLVER_PROXY_TIMEOUT = 20
176+
mock_settings.RESOLVER_PROXY_ENABLE_ETAG_CACHE = False
177+
178+
mock_response = MagicMock()
179+
mock_response.headers = {}
180+
mock_response_class.return_value = mock_response
181+
self.request.headers = {}
182+
183+
self.mixin.proxy_data_from_storage(self.request, 's3://bucket/video.mp4', mock_project, mock_storage)
184+
185+
call_args, call_kwargs = mock_response_class.call_args
186+
assert call_kwargs.get('content_type') == 'video/mp4' or (
187+
len(call_args) > 1 and call_args[1] == 'video/mp4'
188+
), f'Expected content_type=video/mp4, got: args={call_args}, kwargs={call_kwargs}'
189+
190+
def test_proxy_data_from_storage_preserves_correct_content_type(self):
191+
"""When storage returns a proper content type, it should not be overridden."""
192+
mock_storage = MagicMock()
193+
mock_stream = MagicMock()
194+
mock_metadata = {
195+
'StatusCode': 200,
196+
'ContentLength': 1000,
197+
'LastModified': datetime.now(),
198+
'ETag': '"test"',
199+
}
200+
# Storage returns correct content type
201+
mock_storage.get_bytes_stream.return_value = (mock_stream, 'image/webp', mock_metadata)
202+
mock_project = MagicMock()
203+
204+
with patch('io_storages.proxy_api.StreamingHttpResponse') as mock_response_class, patch(
205+
'io_storages.proxy_api.settings'
206+
) as mock_settings:
207+
mock_settings.RESOLVER_PROXY_MAX_RANGE_SIZE = 1024 * 1024
208+
mock_settings.RESOLVER_PROXY_BUFFER_SIZE = 8192
209+
mock_settings.RESOLVER_PROXY_CACHE_TIMEOUT = 3600
210+
mock_settings.RESOLVER_PROXY_TIMEOUT = 20
211+
mock_settings.RESOLVER_PROXY_ENABLE_ETAG_CACHE = False
212+
213+
mock_response = MagicMock()
214+
mock_response.headers = {}
215+
mock_response_class.return_value = mock_response
216+
self.request.headers = {}
217+
218+
self.mixin.proxy_data_from_storage(self.request, 's3://bucket/photo.jpg', mock_project, mock_storage)
219+
220+
# Should preserve the original image/webp, not override with image/jpeg
221+
call_args, call_kwargs = mock_response_class.call_args
222+
assert call_kwargs.get('content_type') == 'image/webp' or (
223+
len(call_args) > 1 and call_args[1] == 'image/webp'
224+
), f'Expected content_type=image/webp, got: args={call_args}, kwargs={call_kwargs}'
225+
115226
def test_proxy_data_from_storage_success(self):
116227
mock_storage = MagicMock()
117228
# Ensure get_bytes_stream returns a three-tuple, metadata can be empty initially

0 commit comments

Comments
 (0)