Skip to content

Commit cb773f1

Browse files
Public Preview of Large File Upload through Files API
1 parent 736a39a commit cb773f1

File tree

6 files changed

+3393
-531
lines changed

6 files changed

+3393
-531
lines changed

NEXT_CHANGELOG.md

Lines changed: 256 additions & 0 deletions
Large diffs are not rendered by default.

databricks/sdk/config.py

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pathlib
77
import sys
88
import urllib.parse
9-
from typing import Dict, Iterable, Optional
9+
from typing import Dict, Iterable, List, Optional
1010

1111
import requests
1212

@@ -111,17 +111,24 @@ class Config:
111111
disable_async_token_refresh: bool = ConfigAttribute(env="DATABRICKS_DISABLE_ASYNC_TOKEN_REFRESH")
112112

113113
enable_experimental_files_api_client: bool = ConfigAttribute(env="DATABRICKS_ENABLE_EXPERIMENTAL_FILES_API_CLIENT")
114-
files_api_client_download_max_total_recovers = None
115-
files_api_client_download_max_total_recovers_without_progressing = 1
116114

117-
# File multipart upload parameters
115+
files_ext_client_download_streaming_chunk_size: int = 2 * 1024 * 1024 # 2 MiB
116+
117+
# When downloading a file, the maximum number of attempts to retry downloading the whole file. Default is no limit.
118+
files_ext_client_download_max_total_recovers: Optional[int] = None
119+
120+
# When downloading a file, the maximum number of attempts to retry downloading from the same offset without progressing.
121+
# This is to avoid infinite retrying when the download is not making any progress. Default is 1.
122+
files_ext_client_download_max_total_recovers_without_progressing = 1
123+
124+
# File multipart upload/download parameters
118125
# ----------------------
119126

120127
# Minimal input stream size (bytes) to use multipart / resumable uploads.
121128
# For small files it's more efficient to make one single-shot upload request.
122129
# When uploading a file, SDK will initially buffer this many bytes from input stream.
123130
# This parameter can be smaller or larger than multipart_upload_chunk_size.
124-
multipart_upload_min_stream_size: int = 5 * 1024 * 1024
131+
files_ext_multipart_upload_min_stream_size: int = 50 * 1024 * 1024
125132

126133
# Maximum number of presigned URLs that can be requested at a time.
127134
#
@@ -131,31 +138,70 @@ class Config:
131138
# the stream back. In case of a non-seekable stream we cannot rewind, so we'll abort
132139
# the upload. To reduce the chance of this, we're requesting presigned URLs one by one
133140
# and using them immediately.
134-
multipart_upload_batch_url_count: int = 1
141+
files_ext_multipart_upload_batch_url_count: int = 1
135142

136-
# Size of the chunk to use for multipart uploads.
143+
# Size of the chunk to use for multipart uploads & downloads.
137144
#
138145
# The smaller the chunk is, the lower the chance of network errors (or of the URL expiring),
139146
# but the more requests we'll make.
140147
# For AWS, minimum is 5 MiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
141148
# For GCP, minimum is 256 KiB (and also recommended multiple is 256 KiB)
142149
# boto uses 8 MiB: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
143-
multipart_upload_chunk_size: int = 10 * 1024 * 1024
144-
145-
# use maximum duration of 1 hour
146-
multipart_upload_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
150+
files_ext_multipart_upload_default_part_size: int = 10 * 1024 * 1024 # 10 MiB
151+
152+
# List of multipart upload part sizes that can be automatically selected
153+
files_ext_multipart_upload_part_size_options: List[int] = [
154+
10 * 1024 * 1024, # 10 MiB
155+
20 * 1024 * 1024, # 20 MiB
156+
50 * 1024 * 1024, # 50 MiB
157+
100 * 1024 * 1024, # 100 MiB
158+
200 * 1024 * 1024, # 200 MiB
159+
500 * 1024 * 1024, # 500 MiB
160+
1 * 1024 * 1024 * 1024, # 1 GiB
161+
2 * 1024 * 1024 * 1024, # 2 GiB
162+
4 * 1024 * 1024 * 1024, # 4 GiB
163+
]
164+
165+
# Maximum size of a single part in multipart upload.
166+
# For AWS, maximum is 5 GiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
167+
# For Azure, maximum is 4 GiB: https://learn.microsoft.com/en-us/rest/api/storageservices/put-block
168+
# For CloudFlare R2, maximum is 5 GiB: https://developers.cloudflare.com/r2/objects/multipart-objects/
169+
files_ext_multipart_upload_max_part_size: int = 4 * 1024 * 1024 * 1024 # 4 GiB
170+
171+
# Default parallel multipart upload concurrency. Set to 10 because experiment results show that it
172+
# gives good performance.
173+
files_ext_multipart_upload_default_parallelism: int = 10
174+
175+
# The expiration duration for presigned URLs used in multipart uploads and downloads.
176+
# The client will request new presigned URLs if the previous one has expired. The duration should be long enough
177+
# to complete the upload or download of a single part.
178+
files_ext_multipart_upload_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
179+
files_ext_presigned_download_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
180+
181+
# When downloading a file in parallel, how many worker threads to use.
182+
files_ext_parallel_download_default_parallelism: int = 10
183+
184+
# When downloading a file, if the file size is smaller than this threshold,
185+
# We'll use a single-threaded download even if the parallel download is enabled.
186+
files_ext_parallel_download_min_file_size: int = 50 * 1024 * 1024 # 50 MiB
187+
188+
# Default chunk size to use when downloading a file in parallel. Not effective for single threaded download.
189+
files_ext_parallel_download_default_part_size: int = 10 * 1024 * 1024 # 10 MiB
147190

148191
# This is not a "wall time" cutoff for the whole upload request,
149192
# but a maximum time between consecutive data reception events (even 1 byte) from the server
150-
multipart_upload_single_chunk_upload_timeout_seconds: float = 60
193+
files_ext_network_transfer_inactivity_timeout_seconds: float = 60
151194

152195
# Cap on the number of custom retries during incremental uploads:
153196
# 1) multipart: upload part URL is expired, so new upload URLs must be requested to continue upload
154197
# 2) resumable: chunk upload produced a retryable response (or exception), so upload status must be
155198
# retrieved to continue the upload.
156199
# In these two cases standard SDK retries (which are capped by the `retry_timeout_seconds` option) are not used.
157200
# Note that retry counter is reset when upload is successfully resumed.
158-
multipart_upload_max_retries = 3
201+
files_ext_multipart_upload_max_retries = 3
202+
203+
# Cap on the number of custom retries during parallel downloads.
204+
files_ext_parallel_download_max_retries = 3
159205

160206
def __init__(
161207
self,

0 commit comments

Comments
 (0)