Skip to content

Commit 3fcee3b

Browse files
committed
[901] Large file uploads
1 parent 92d72cf commit 3fcee3b

File tree

3 files changed

+1884
-4
lines changed

3 files changed

+1884
-4
lines changed

databricks/sdk/config.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import configparser
22
import copy
3+
import datetime
34
import logging
45
import os
56
import pathlib
@@ -97,6 +98,45 @@ class Config:
9798
files_api_client_download_max_total_recovers = None
9899
files_api_client_download_max_total_recovers_without_progressing = 1
99100

101+
# File multipart upload parameters
102+
# ----------------------
103+
104+
# Minimal input stream size (bytes) to use multipart / resumable uploads.
105+
# For small files it's more efficient to make one single-shot upload request.
106+
# When uploading a file, SDK will initially buffer this many bytes from input stream.
107+
# This parameter can be smaller or larger than multipart_upload_chunk_size.
108+
multipart_upload_min_stream_size: int = 5 * 1024 * 1024
109+
110+
# Maximum number of presigned URLs that can be requested at a time.
111+
#
112+
# The more URLs we request at once, the higher the chance that some of the URLs will expire
113+
# before we get to use them. We discover that a presigned URL has expired *after* sending the
114+
# input stream partition to the server. So to retry the upload of this partition we must rewind
115+
# the stream back. In case of a non-seekable stream we cannot rewind, so we'll abort
116+
# the upload. To reduce the chance of this, we're requesting presigned URLs one by one
117+
# and using them immediately.
118+
multipart_upload_batch_url_count: int = 1
119+
120+
# Size of the chunk to use for multipart uploads.
121+
#
122+
# The smaller the chunk, the lower the chance of network errors (or of the URL expiring),
123+
# but the more requests we'll make.
124+
# For AWS, the minimum is 5 MiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
125+
# For GCP, minimum is 256 KiB (and also recommended multiple is 256 KiB)
126+
# boto uses 8 MiB: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
127+
multipart_upload_chunk_size: int = 10 * 1024 * 1024
128+
129+
# Use the maximum allowed URL expiration duration of 1 hour.
130+
multipart_upload_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
131+
132+
# This is not a "wall time" cutoff for the whole upload request,
133+
# but a maximum time between consecutive data reception events (even 1 byte) from the server
134+
multipart_upload_single_chunk_upload_timeout_seconds: float = 60
135+
136+
# Limit of retries during multipart upload.
137+
# Retry counter is reset when progressing along the stream.
138+
multipart_upload_max_retries = 3
139+
100140
def __init__(
101141
self,
102142
*,

0 commit comments

Comments
 (0)