@@ -111,17 +111,24 @@ class Config:
111111 disable_async_token_refresh : bool = ConfigAttribute (env = "DATABRICKS_DISABLE_ASYNC_TOKEN_REFRESH" )
112112
113113 enable_experimental_files_api_client : bool = ConfigAttribute (env = "DATABRICKS_ENABLE_EXPERIMENTAL_FILES_API_CLIENT" )
114- files_api_client_download_max_total_recovers = None
115- files_api_client_download_max_total_recovers_without_progressing = 1
116114
117- # File multipart upload parameters
115+ files_ext_client_download_streaming_chunk_size : int = 2 * 1024 * 1024 # 2 MiB
116+
117+ # When downloading a file, the maximum number of attempts to retry downloading the whole file. Default is no limit.
118+ files_ext_client_download_max_total_recovers : Optional [int ] = None
119+
120+ # When downloading a file, the maximum number of attempts to retry downloading from the same offset without progressing.
121+ # This is to avoid infinite retrying when the download is not making any progress. Default is 1.
122+ files_ext_client_download_max_total_recovers_without_progressing = 1
123+
124+ # File multipart upload/download parameters
118125 # ----------------------
119126
120127 # Minimal input stream size (bytes) to use multipart / resumable uploads.
121128 # For small files it's more efficient to make one single-shot upload request.
122129 # When uploading a file, SDK will initially buffer this many bytes from input stream.
123130 # This parameter can be less or bigger than multipart_upload_chunk_size.
124- multipart_upload_min_stream_size : int = 5 * 1024 * 1024
131+ files_ext_multipart_upload_min_stream_size : int = 50 * 1024 * 1024
125132
126133 # Maximum number of presigned URLs that can be requested at a time.
127134 #
@@ -131,31 +138,70 @@ class Config:
131138 # the stream back. In case of a non-seekable stream we cannot rewind, so we'll abort
132139 # the upload. To reduce the chance of this, we're requesting presigned URLs one by one
133140 # and using them immediately.
134- multipart_upload_batch_url_count : int = 1
141+ files_ext_multipart_upload_batch_url_count : int = 1
135142
136- # Size of the chunk to use for multipart uploads.
143 # Size of the chunk to use for multipart uploads & downloads.
137144 #
138145 # The smaller chunk is, the less chance for network errors (or URL get expired),
139146 # but the more requests we'll make.
140147 # For AWS, minimum is 5 MiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
141148 # For GCP, minimum is 256 KiB (and the chunk size must be a multiple of 256 KiB)
142149 # boto uses 8 MiB: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
143- multipart_upload_chunk_size : int = 10 * 1024 * 1024
144-
145- # use maximum duration of 1 hour
146- multipart_upload_url_expiration_duration : datetime .timedelta = datetime .timedelta (hours = 1 )
150+ files_ext_multipart_upload_default_part_size : int = 10 * 1024 * 1024 # 10 MiB
151+
152+ # List of multipart upload part sizes that can be automatically selected
153+ files_ext_multipart_upload_part_size_options : list = [
154+ 10 * 1024 * 1024 , # 10 MiB
155+ 20 * 1024 * 1024 , # 20 MiB
156+ 50 * 1024 * 1024 , # 50 MiB
157+ 100 * 1024 * 1024 , # 100 MiB
158+ 200 * 1024 * 1024 , # 200 MiB
159+ 500 * 1024 * 1024 , # 500 MiB
160+ 1 * 1024 * 1024 * 1024 , # 1 GiB
161+ 2 * 1024 * 1024 * 1024 , # 2 GiB
162+ 4 * 1024 * 1024 * 1024 , # 4 GiB
163+ ]
164+
165+ # Maximum size of a single part in multipart upload.
166+ # For AWS, maximum is 5 GiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
167+ # For Azure, maximum is 4 GiB: https://learn.microsoft.com/en-us/rest/api/storageservices/put-block
168+ # For CloudFlare R2, maximum is 5 GiB: https://developers.cloudflare.com/r2/objects/multipart-objects/
169+ files_ext_multipart_upload_max_part_size : int = 4 * 1024 * 1024 * 1024 # 4 GiB
170+
171+ # Default parallel multipart upload concurrency. Set to 10 because experiment results show that it
172+ # gives good performance results.
173+ files_ext_multipart_upload_default_parallelism : int = 10
174+
175+ # The expiration duration for presigned URLs used in multipart uploads and downloads.
176+ # The client will request new presigned URLs if the previous one is expired. The duration should be long enough
177+ # to complete the upload or download of a single part.
178+ files_ext_multipart_upload_url_expiration_duration : datetime .timedelta = datetime .timedelta (hours = 1 )
179+ files_ext_presigned_download_url_expiration_duration : datetime .timedelta = datetime .timedelta (hours = 1 )
180+
181+ # When downloading a file in parallel, how many worker threads to use.
182+ files_ext_parallel_download_default_parallelism : int = 10
183+
184+ # When downloading a file, if the file size is smaller than this threshold,
185+ # we'll use a single-threaded download even if parallel download is enabled.
186+ files_ext_parallel_download_min_file_size : int = 50 * 1024 * 1024 # 50 MiB
187+
188+ # Default chunk size to use when downloading a file in parallel. Not effective for single-threaded downloads.
189+ files_ext_parallel_download_default_part_size : int = 10 * 1024 * 1024 # 10 MiB
147190
148191 # This is not a "wall time" cutoff for the whole upload request,
149192 # but a maximum time between consecutive data reception events (even 1 byte) from the server
150- multipart_upload_single_chunk_upload_timeout_seconds : float = 60
193+ files_ext_network_transfer_inactivity_timeout_seconds : float = 60
151194
152195 # Cap on the number of custom retries during incremental uploads:
153196 # 1) multipart: upload part URL is expired, so new upload URLs must be requested to continue upload
154197 # 2) resumable: chunk upload produced a retryable response (or exception), so upload status must be
155198 # retrieved to continue the upload.
156199 # In these two cases standard SDK retries (which are capped by the `retry_timeout_seconds` option) are not used.
157200 # Note that retry counter is reset when upload is successfully resumed.
158- multipart_upload_max_retries = 3
201+ files_ext_multipart_upload_max_retries = 3
202+
203+ # Cap on the number of custom retries during parallel downloads.
204+ files_ext_parallel_download_max_retries = 3
159205
160206 def __init__ (
161207 self ,
0 commit comments