Commit d80f72e

SNOW-922991: fix file chunks size for files over 80GB (#1777)
1 parent: 829101d

File tree

4 files changed: +76, -3 lines


DESCRIPTION.md

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
 
 - Added support for `use_logical_type` in `write_pandas`.
 - Removed dependencies on pycryptodomex and oscrypto. All connections now go through OpenSSL via the cryptography library, which was already a dependency.
+- Fixed issue with ingesting files over 80 GB to S3.
 - Added the `backoff_policy` argument to `snowflake.connector.connect` allowing for configurable backoff policy between retries of failed requests. See available implementations in the `backoff_policies` module.
 - Added the `socket_timeout` argument to `snowflake.connector.connect` specifying socket read and connect timeout.
 - Fixed `login_timeout` and `network_timeout` behaviour. Retries of login and network requests are now properly halted after these timeouts expire.

src/snowflake/connector/constants.py

Lines changed: 8 additions & 0 deletions

@@ -318,6 +318,14 @@ class IterUnit(Enum):
     TABLE_UNIT = "table"
 
 
+# Amazon S3 multipart upload limits
+# https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
+S3_DEFAULT_CHUNK_SIZE = 8 * 1024**2
+S3_MAX_OBJECT_SIZE = 5 * 1024**4
+S3_MAX_PART_SIZE = 5 * 1024**3
+S3_MIN_PART_SIZE = 5 * 1024**2
+S3_MAX_PARTS = 10000
+
 S3_CHUNK_SIZE = 8388608  # boto3 default
 AZURE_CHUNK_SIZE = 4 * megabyte
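
These limits explain the 80 GB threshold in the commit title: with the previous fixed 8 MiB chunk size, an upload runs out of parts at 8 MiB × 10,000 ≈ 78.1 GiB (about 83.9 GB). A small back-of-the-envelope sketch in plain Python, independent of the connector code:

```python
# Back-of-the-envelope check: largest object uploadable with a fixed 8 MiB part size.
DEFAULT_CHUNK = 8 * 1024**2  # boto3 default chunk size (8 MiB)
MAX_PARTS = 10_000           # S3 multipart upload part limit

ceiling = DEFAULT_CHUNK * MAX_PARTS
print(ceiling)            # 83886080000 bytes
print(ceiling / 1024**3)  # ~78.1 GiB
print(ceiling / 10**9)    # ~83.9 GB; files beyond this needed a larger chunk size
```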

src/snowflake/connector/file_transfer_agent.py

Lines changed: 29 additions & 3 deletions

@@ -7,6 +7,7 @@
 
 import binascii
 import glob
+import math
 import mimetypes
 import os
 import sys

@@ -21,12 +22,17 @@
 from .azure_storage_client import SnowflakeAzureRestClient
 from .compat import GET_CWD, IS_WINDOWS
 from .constants import (
+    AZURE_CHUNK_SIZE,
     AZURE_FS,
     CMD_TYPE_DOWNLOAD,
     CMD_TYPE_UPLOAD,
     GCS_FS,
     LOCAL_FS,
+    S3_DEFAULT_CHUNK_SIZE,
     S3_FS,
+    S3_MAX_OBJECT_SIZE,
+    S3_MAX_PARTS,
+    S3_MIN_PART_SIZE,
     ResultStatus,
     megabyte,
 )

@@ -183,6 +189,28 @@ def _update_progress(
     return progress == 1.0
 
 
+def _chunk_size_calculator(file_size: int) -> int:
+    # S3 limits the number of parts in a multipart upload; this helper recalculates the chunk size when needed.
+    if file_size > S3_MAX_OBJECT_SIZE:
+        # Reject files that exceed the S3 maximum object size of 5 TiB.
+        raise ValueError(
+            f"File size {file_size} exceeds the maximum file size {S3_MAX_OBJECT_SIZE} allowed in S3."
+        )
+
+    # num_parts = math.ceil(file_size / S3_DEFAULT_CHUNK_SIZE)
+    # If num_parts exceeds S3_MAX_PARTS, enlarge the chunk size; otherwise keep the default.
+    calculated_chunk_size = (
+        max(math.ceil(file_size / S3_MAX_PARTS), S3_MIN_PART_SIZE)
+        if math.ceil(file_size / S3_DEFAULT_CHUNK_SIZE) > S3_MAX_PARTS
+        else S3_DEFAULT_CHUNK_SIZE
+    )
+    if calculated_chunk_size != S3_DEFAULT_CHUNK_SIZE:
+        logger.debug(
+            f"Setting chunksize to {calculated_chunk_size} instead of the default {S3_DEFAULT_CHUNK_SIZE}."
+        )
+    return calculated_chunk_size
+
+
 def percent(seen_so_far: int, size: float) -> float:
     return 1.0 if seen_so_far >= size or size <= 0 else float(seen_so_far / size)

@@ -630,8 +658,6 @@ def function_and_callback_wrapper(
     def _create_file_transfer_client(
         self, meta: SnowflakeFileMeta
     ) -> SnowflakeStorageClient:
-        from .constants import AZURE_CHUNK_SIZE, S3_CHUNK_SIZE
-
         if self._stage_location_type == LOCAL_FS:
             return SnowflakeLocalStorageClient(
                 meta,

@@ -651,7 +677,7 @@ def _create_file_transfer_client(
                 meta,
                 self._credentials,
                 self._stage_info,
-                S3_CHUNK_SIZE,
+                _chunk_size_calculator(meta.src_file_size),
                 use_accelerate_endpoint=self._use_accelerate_endpoint,
                 use_s3_regional_url=self._use_s3_regional_url,
             )
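
For a sense of what the new calculation yields in practice, here is a minimal standalone sketch that mirrors the logic of `_chunk_size_calculator` above. The constant values are copied from `constants.py`; the helper name `chunk_size_for` is a local stand-in for illustration, not connector API:

```python
import math

# Values mirroring the constants added in constants.py.
S3_DEFAULT_CHUNK_SIZE = 8 * 1024**2  # 8 MiB
S3_MIN_PART_SIZE = 5 * 1024**2       # 5 MiB
S3_MAX_PARTS = 10000


def chunk_size_for(file_size: int) -> int:
    # Same decision as _chunk_size_calculator: keep the default chunk size unless
    # it would need more than 10,000 parts, in which case grow it just enough.
    if math.ceil(file_size / S3_DEFAULT_CHUNK_SIZE) > S3_MAX_PARTS:
        return max(math.ceil(file_size / S3_MAX_PARTS), S3_MIN_PART_SIZE)
    return S3_DEFAULT_CHUNK_SIZE


print(chunk_size_for(2 * 1024**3))    # 8388608  -> a 2 GiB file keeps the 8 MiB default
print(chunk_size_for(100 * 1024**3))  # 10737419 -> a 100 GiB file gets ~10.2 MiB parts
```

Growing the chunk size only when necessary keeps small uploads on boto3's default, while a file at the 5 TiB object-size cap still fits in 10,000 parts of roughly 525 MiB, well under S3's 5 GiB per-part limit.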

test/unit/test_compute_chunk_size.py

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+#
+# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
+#
+
+import pytest
+
+pytestmark = pytest.mark.skipolddriver
+
+
+def test_check_chunk_size():
+    from snowflake.connector.constants import (
+        S3_MAX_OBJECT_SIZE,
+        S3_MAX_PART_SIZE,
+        S3_MIN_PART_SIZE,
+    )
+    from snowflake.connector.file_transfer_agent import _chunk_size_calculator
+
+    expected_chunk_size = 8 * 1024**2
+    sample_file_size_2gb = 2 * 1024**3
+    sample_file_size_under_5tb = 4.9 * 1024**4
+    sample_file_size_6tb = 6 * 1024**4
+    sample_chunk_size_4mb = 4 * 1024**2
+
+    chunk_size_1 = _chunk_size_calculator(sample_file_size_2gb)
+    assert chunk_size_1 == expected_chunk_size
+
+    chunk_size_2 = _chunk_size_calculator(int(sample_file_size_under_5tb))
+    assert chunk_size_2 <= S3_MAX_PART_SIZE
+
+    with pytest.raises(ValueError) as exc:
+        _chunk_size_calculator(sample_file_size_6tb)
+    assert (
+        f"File size {sample_file_size_6tb} exceeds the maximum file size {S3_MAX_OBJECT_SIZE} allowed in S3."
+        in str(exc)
+    )
+
+    chunk_size_1 = _chunk_size_calculator(sample_chunk_size_4mb)
+    assert chunk_size_1 >= S3_MIN_PART_SIZE
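
A possible follow-up check, not part of this commit, would assert the invariant the fix is meant to guarantee: the calculated chunk size always keeps an upload within S3's 10,000-part and 5 GiB per-part limits. A hedged sketch, assuming the same imports the committed test uses:

```python
import math

import pytest


# Hypothetical follow-up test, not in this commit: verify that the calculated
# chunk size keeps every upload within S3's multipart limits.
@pytest.mark.parametrize(
    "file_size",
    [
        1,                    # tiny file
        8 * 1024**2 * 10000,  # exactly the old 10,000-part ceiling (~78 GiB)
        100 * 1024**3,        # 100 GiB, the regression case
        5 * 1024**4,          # 5 TiB, the S3 maximum object size
    ],
)
def test_chunk_size_respects_s3_limits(file_size):
    from snowflake.connector.constants import S3_MAX_PART_SIZE, S3_MAX_PARTS
    from snowflake.connector.file_transfer_agent import _chunk_size_calculator

    chunk_size = _chunk_size_calculator(file_size)
    assert chunk_size <= S3_MAX_PART_SIZE
    assert math.ceil(file_size / chunk_size) <= S3_MAX_PARTS
```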
