Skip to content
This repository was archived by the owner on May 5, 2025. It is now read-only.

Commit 1ad7fa7

Browse files
committed
feat: support zstd compression in miniostorage
We want to use zstd compression when compressing files for storage in object storage because it performs better than gzip, which is what we were using before.
These changes are only being made to the minio storage service because we want to consolidate the storage service functionality into this one, so both worker and API will be using this backend in the future (API was already using this one).
We have to manually decompress the zstd-compressed files in read_file, but HTTPResponse takes care of it for us if the content encoding of the file is gzip.
The is_already_gzipped argument is being deprecated in favour of compression_type and is_compressed; the ability to pass a str to write_file is also being deprecated. We're keeping track of the use of these deprecated paths using Sentry's capture_message.
1 parent 12361de commit 1ad7fa7

File tree

5 files changed

+465
-263
lines changed

5 files changed

+465
-263
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ dependencies = [
3434
"requests>=2.32.3",
3535
"sentry-sdk>=2.18.0",
3636
"sqlalchemy<2",
37+
"zstandard==0.23.0",
3738
]
3839

3940
[build-system]

shared/storage/minio.py

Lines changed: 128 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
import gzip
1+
import datetime
22
import json
33
import logging
44
import os
5-
import shutil
65
import sys
76
import tempfile
87
from io import BytesIO
9-
from typing import BinaryIO, overload
8+
from typing import BinaryIO, Protocol, overload
109

10+
import sentry_sdk
11+
import sentry_sdk.scope
12+
import zstandard
1113
from minio import Minio
1214
from minio.credentials import (
1315
ChainedProvider,
@@ -17,13 +19,29 @@
1719
)
1820
from minio.deleteobjects import DeleteObject
1921
from minio.error import MinioException, S3Error
22+
from urllib3.response import HTTPResponse
2023

21-
from shared.storage.base import CHUNK_SIZE, BaseStorageService
24+
from shared.storage.base import BaseStorageService
2225
from shared.storage.exceptions import BucketAlreadyExistsError, FileNotInStorageError
2326

2427
log = logging.getLogger(__name__)
2528

2629

30+
class Readable(Protocol):
31+
def read(self, size: int = -1) -> bytes: ...
32+
33+
34+
class GetObjectToFileResponse(Protocol):
35+
bucket_name: str
36+
object_name: str
37+
last_modified: datetime.datetime | None
38+
etag: str
39+
size: int
40+
content_type: str | None
41+
metadata: dict[str, str]
42+
version_id: str | None
43+
44+
2745
# Service class for interfacing with codecov's underlying storage layer, minio
2846
class MinioStorageService(BaseStorageService):
2947
def __init__(self, minio_config):
@@ -57,20 +75,21 @@ def init_minio_client(
5775
region: str = None,
5876
):
5977
"""
60-
Initialize the minio client
78+
Initialize the minio client
6179
6280
`iam_auth` adds support for IAM base authentication in a fallback pattern.
63-
The following will be checked in order:
81+
The following will be checked in order:
6482
6583
* EC2 metadata -- a custom endpoint can be provided, default is None.
66-
* AWS env vars, specifically AWS_ACCESS_KEY and AWS_SECRECT_KEY
6784
* Minio env vars, specifically MINIO_ACCESS_KEY and MINIO_SECRET_KEY
85+
* AWS env vars, specifically AWS_ACCESS_KEY and AWS_SECRET_KEY
6886
69-
to support backward compatibility, the iam_auth setting should be used in the installation
70-
configuration
87+
to support backward compatibility, the iam_auth setting should be used
88+
in the installation configuration
7189
7290
Args:
7391
host (str): The address of the host where minio lives
92+
7493
port (str): The port number (as str or int should be ok)
7594
access_key (str, optional): The access key (optional if IAM is being used)
7695
secret_key (str, optional): The secret key (optional if IAM is being used)
@@ -143,50 +162,64 @@ def create_root_storage(self, bucket_name="archive", region="us-east-1"):
143162
# Writes a file to storage; compresses it with zstd first unless the caller marks it as already compressed
144163
def write_file(
145164
self,
146-
bucket_name,
147-
path,
148-
data,
149-
reduced_redundancy=False,
165+
bucket_name: str,
166+
path: str,
167+
data: BinaryIO,
168+
reduced_redundancy: bool = False,
150169
*,
151-
is_already_gzipped: bool = False,
170+
is_already_gzipped: bool = False, # deprecated
171+
is_compressed: bool = False,
172+
compression_type: str = "zstd",
152173
):
174+
if is_already_gzipped:
175+
log.warning(
176+
"is_already_gzipped is deprecated and will be removed in a future version, instead compress using zstd and use the is_already_zstd_compressed argument"
177+
)
178+
with sentry_sdk.new_scope() as scope:
179+
scope.set_extra("bucket_name", bucket_name)
180+
scope.set_extra("path", path)
181+
sentry_sdk.capture_message("is_already_gzipped passed with True")
182+
is_compressed = True
183+
compression_type = "gzip"
184+
153185
if isinstance(data, str):
154-
data = data.encode()
186+
log.warning(
187+
"passing data as a str to write_file is deprecated and will be removed in a future version, instead pass an object compliant with the BinaryIO type"
188+
)
189+
with sentry_sdk.new_scope() as scope:
190+
scope.set_extra("bucket_name", bucket_name)
191+
scope.set_extra("path", path)
192+
sentry_sdk.capture_message("write_file data argument passed as str")
155193

156-
if isinstance(data, bytes):
157-
if not is_already_gzipped:
158-
out = BytesIO()
159-
with gzip.GzipFile(fileobj=out, mode="w", compresslevel=9) as gz:
160-
gz.write(data)
161-
else:
162-
out = BytesIO(data)
163-
164-
# get file size
165-
out.seek(0, os.SEEK_END)
166-
out_size = out.tell()
167-
else:
168-
# data is already a file-like object
169-
if not is_already_gzipped:
170-
_, filename = tempfile.mkstemp()
171-
with gzip.open(filename, "wb") as f:
172-
shutil.copyfileobj(data, f)
173-
out = open(filename, "rb")
174-
else:
175-
out = data
194+
data = BytesIO(data.encode())
176195

177-
out_size = os.stat(filename).st_size
196+
if not is_compressed:
197+
cctx = zstandard.ZstdCompressor()
198+
reader: zstandard.ZstdCompressionReader = cctx.stream_reader(data)
199+
_, filepath = tempfile.mkstemp()
200+
with open(filepath, "wb") as f:
201+
while chunk := reader.read(16384):
202+
f.write(chunk)
203+
data = open(filepath, "rb")
178204

179205
try:
180-
# reset pos for minio reading.
181-
out.seek(0)
206+
out_size = data.seek(0, os.SEEK_END)
207+
data.seek(0)
208+
209+
if compression_type == "gzip":
210+
content_encoding = "gzip"
211+
elif compression_type == "zstd":
212+
content_encoding = "zstd"
213+
214+
headers = {"Content-Encoding": content_encoding}
182215

183-
headers = {"Content-Encoding": "gzip"}
184216
if reduced_redundancy:
185217
headers["x-amz-storage-class"] = "REDUCED_REDUNDANCY"
218+
186219
self.minio_client.put_object(
187220
bucket_name,
188221
path,
189-
out,
222+
data,
190223
out_size,
191224
metadata=headers,
192225
content_type="text/plain",
@@ -195,25 +228,65 @@ def write_file(
195228

196229
except MinioException:
197230
raise
231+
finally:
232+
if not is_compressed:
233+
data.close()
234+
os.unlink(filepath)
198235

199236
@overload
200-
def read_file(self, bucket_name: str, path: str) -> bytes: ...
237+
def read_file(
238+
self, bucket_name: str, path: str, file_obj: None = None
239+
) -> bytes: ...
201240

202241
@overload
203-
def read_file(self, bucket_name: str, path: str, file_obj: BinaryIO) -> None: ...
242+
def read_file(self, bucket_name: str, path: str, file_obj: str) -> None: ...
204243

205244
def read_file(self, bucket_name, path, file_obj=None) -> bytes | None:
206245
try:
207-
res = self.minio_client.get_object(bucket_name, path)
208-
if file_obj is None:
209-
data = BytesIO()
210-
for d in res.stream(CHUNK_SIZE):
211-
data.write(d)
212-
data.seek(0)
213-
return data.getvalue()
246+
headers = {"Accept-Encoding": "gzip, zstd"}
247+
if file_obj:
248+
_, tmpfilepath = tempfile.mkstemp()
249+
to_file_response: GetObjectToFileResponse = (
250+
self.minio_client.fget_object(
251+
bucket_name, path, tmpfilepath, request_headers=headers
252+
)
253+
)
254+
data = open(tmpfilepath, "rb")
255+
content_encoding = to_file_response.metadata.get(
256+
"Content-Encoding", None
257+
)
258+
else:
259+
response: HTTPResponse = self.minio_client.get_object(
260+
bucket_name, path, request_headers=headers
261+
)
262+
data = response
263+
content_encoding = response.headers.get("Content-Encoding", None)
264+
265+
reader: Readable | None = None
266+
if content_encoding == "gzip":
267+
# HTTPResponse automatically decodes gzipped data for us
268+
# minio_client.fget_object uses HTTPResponse under the hood,
269+
# so this applies to both get_object and fget_object
270+
reader = data
271+
elif content_encoding == "zstd":
272+
# we have to manually decompress zstandard compressed data
273+
cctx = zstandard.ZstdDecompressor()
274+
reader = cctx.stream_reader(data)
275+
else:
276+
with sentry_sdk.new_scope() as scope:
277+
scope.set_extra("bucket_name", bucket_name)
278+
scope.set_extra("path", path)
279+
raise ValueError("Blob does not have Content-Encoding set")
280+
281+
if file_obj:
282+
while chunk := reader.read(16384):
283+
file_obj.write(chunk)
284+
return None
214285
else:
215-
for d in res.stream(CHUNK_SIZE):
216-
file_obj.write(d)
286+
res = BytesIO()
287+
while chunk := reader.read(16384):
288+
res.write(chunk)
289+
return res.getvalue()
217290
except S3Error as e:
218291
if e.code == "NoSuchKey":
219292
raise FileNotInStorageError(
@@ -222,6 +295,10 @@ def read_file(self, bucket_name, path, file_obj=None) -> bytes | None:
222295
raise e
223296
except MinioException:
224297
raise
298+
finally:
299+
if file_obj:
300+
data.close()
301+
os.unlink(tmpfilepath)
225302

226303
"""
227304
Deletes file url in specified bucket.

tests/unit/storage/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)