16 changes: 16 additions & 0 deletions NEXT_CHANGELOG.md
@@ -4,10 +4,26 @@

### New Features and Improvements

* Add a new interface `upload_from` to `databricks.sdk.mixins.FilesExt` to support uploading from a file in the local filesystem.
* Improve `databricks.sdk.mixins.FilesExt` upload throughput by uploading data in parallel by default.
* Add a new interface `download_to` to `databricks.sdk.mixins.FilesExt` to support downloading to a file in the local filesystem. This interface also downloads the file in parallel by default; parallel downloading is currently unavailable on Windows. A usage sketch of both new interfaces follows this list.
* Improve `databricks.sdk.mixins.FilesExt.upload` to support uploads when presigned URLs are not enabled for the workspace by falling back to single-part upload.
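
A minimal usage sketch of the two new interfaces, assuming `w.files` resolves to `FilesExt` and that the remote path comes first in both signatures; the volume and local paths are illustrative:

from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

# Upload a local file; runs as a parallel multipart upload by default.
w.files.upload_from(
    "/Volumes/main/default/my_volume/data.csv",  # remote destination (illustrative)
    "/tmp/data.csv",                             # local source file
)

# Download to a local file; parallel by default (unavailable on Windows).
w.files.download_to(
    "/Volumes/main/default/my_volume/data.csv",  # remote source (illustrative)
    "/tmp/data_copy.csv",                        # local destination file
)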

### Bug Fixes

### Documentation

### Internal Changes

### API Changes

* Add `upload_from()` and `download_to()` methods to `databricks.sdk.mixins.FilesExt`.
* Add `use_parallel`, `parallelism`, and `part_size` parameters to `databricks.sdk.mixins.FilesExt.upload`.
* [Breaking] Rename `files_api_client_download_max_total_recovers` to `files_ext_client_download_max_total_recovers` in `databricks.sdk.Config`.
* [Breaking] Rename `files_api_client_download_max_total_recovers_without_progressing` to `files_ext_client_download_max_total_recovers_without_progressing` in `databricks.sdk.Config`.
* [Breaking] Rename `multipart_upload_min_stream_size` to `files_ext_multipart_upload_min_stream_size` in `databricks.sdk.Config`.
* [Breaking] Rename `multipart_upload_batch_url_count` to `files_ext_multipart_upload_batch_url_count` in `databricks.sdk.Config`.
* [Breaking] Rename `multipart_upload_chunk_size` to `files_ext_multipart_upload_default_part_size` in `databricks.sdk.Config`.
* [Breaking] Rename `multipart_upload_url_expiration_duration` to `files_ext_multipart_upload_url_expiration_duration` in `databricks.sdk.Config`.
* [Breaking] Rename `multipart_upload_max_retries` to `files_ext_multipart_upload_max_retries` in `databricks.sdk.Config`.
* Add `files_ext_client_download_streaming_chunk_size`, `files_ext_multipart_upload_part_size_options`, `files_ext_multipart_upload_max_part_size`, `files_ext_multipart_upload_default_parallelism`, `files_ext_presigned_download_url_expiration_duration`, `files_ext_parallel_download_default_parallelism`, `files_ext_parallel_download_min_file_size`, `files_ext_parallel_download_default_part_size`, and `files_ext_parallel_download_max_retries` options to `databricks.sdk.Config`. A migration sketch for the renamed options follows this list.
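
For code that set the renamed options explicitly, a minimal migration sketch; the values shown are the defaults and purely illustrative:

from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # auth resolved from the environment in this sketch

# Before: w.config.multipart_upload_chunk_size = 10 * 1024 * 1024
# After (renamed; now the *default* part size, since larger part sizes can
# also be selected automatically):
w.config.files_ext_multipart_upload_default_part_size = 10 * 1024 * 1024

# Before: w.config.multipart_upload_max_retries = 3
w.config.files_ext_multipart_upload_max_retries = 3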
16 changes: 6 additions & 10 deletions databricks/sdk/__init__.py

(Generated file; diff not rendered by default.)

76 changes: 62 additions & 14 deletions databricks/sdk/config.py
@@ -6,7 +6,7 @@
import pathlib
import sys
import urllib.parse
from typing import Dict, Iterable, Optional
from typing import Dict, Iterable, List, Optional

import requests

@@ -110,18 +110,27 @@ class Config:

disable_async_token_refresh: bool = ConfigAttribute(env="DATABRICKS_DISABLE_ASYNC_TOKEN_REFRESH")

enable_experimental_files_api_client: bool = ConfigAttribute(env="DATABRICKS_ENABLE_EXPERIMENTAL_FILES_API_CLIENT")
files_api_client_download_max_total_recovers = None
files_api_client_download_max_total_recovers_without_progressing = 1
disable_experimental_files_api_client: bool = ConfigAttribute(
env="DATABRICKS_DISABLE_EXPERIMENTAL_FILES_API_CLIENT"
)

files_ext_client_download_streaming_chunk_size: int = 2 * 1024 * 1024 # 2 MiB

# When downloading a file, the maximum number of attempts to retry downloading the whole file. Default is no limit.
files_ext_client_download_max_total_recovers: Optional[int] = None

# File multipart upload parameters
# When downloading a file, the maximum number of attempts to retry downloading from the same offset without progressing.
# This is to avoid infinite retrying when the download is not making any progress. Default is 1.
files_ext_client_download_max_total_recovers_without_progressing = 1

# File multipart upload/download parameters
# ----------------------

# Minimum input stream size (bytes) to use multipart / resumable uploads.
# For small files it's more efficient to make a single-shot upload request.
# When uploading a file, the SDK will initially buffer this many bytes from the input stream.
# This parameter can be smaller or larger than files_ext_multipart_upload_default_part_size.
multipart_upload_min_stream_size: int = 5 * 1024 * 1024
files_ext_multipart_upload_min_stream_size: int = 50 * 1024 * 1024

# Maximum number of presigned URLs that can be requested at a time.
#
@@ -131,31 +140,70 @@ class Config:
# the stream back. In case of a non-seekable stream we cannot rewind, so we'll abort
# the upload. To reduce the chance of this, we're requesting presigned URLs one by one
# and using them immediately.
multipart_upload_batch_url_count: int = 1
files_ext_multipart_upload_batch_url_count: int = 1

# Size of the chunk to use for multipart uploads.
# Size of the chunk to use for multipart uploads & downloads.
#
# The smaller the chunk, the lower the chance of network errors (or of a URL expiring),
# but the more requests we'll make.
# For AWS, the minimum is 5 MiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
# For GCP, the minimum is 256 KiB (and the recommended multiple is also 256 KiB)
# boto3 uses 8 MiB: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
multipart_upload_chunk_size: int = 10 * 1024 * 1024

# use maximum duration of 1 hour
multipart_upload_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
files_ext_multipart_upload_default_part_size: int = 10 * 1024 * 1024 # 10 MiB

# List of multipart upload part sizes that can be automatically selected
files_ext_multipart_upload_part_size_options: List[int] = [
10 * 1024 * 1024, # 10 MiB
20 * 1024 * 1024, # 20 MiB
50 * 1024 * 1024, # 50 MiB
100 * 1024 * 1024, # 100 MiB
200 * 1024 * 1024, # 200 MiB
500 * 1024 * 1024, # 500 MiB
1 * 1024 * 1024 * 1024, # 1 GiB
2 * 1024 * 1024 * 1024, # 2 GiB
4 * 1024 * 1024 * 1024, # 4 GiB
]

# Maximum size of a single part in multipart upload.
# For AWS, maximum is 5 GiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
# For Azure, maximum is 4 GiB: https://learn.microsoft.com/en-us/rest/api/storageservices/put-block
# For CloudFlare R2, maximum is 5 GiB: https://developers.cloudflare.com/r2/objects/multipart-objects/
files_ext_multipart_upload_max_part_size: int = 4 * 1024 * 1024 * 1024 # 4 GiB

# Default parallel multipart upload concurrency. Set to 10 because experiment results
# show that it gives good performance.
files_ext_multipart_upload_default_parallelism: int = 10

# The expiration duration for presigned URLs used in multipart uploads and downloads.
# The client will request a new presigned URL if the previous one has expired. The duration should be long enough
# to complete the upload or download of a single part.
files_ext_multipart_upload_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
files_ext_presigned_download_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)

# When downloading a file in parallel, how many worker threads to use.
files_ext_parallel_download_default_parallelism: int = 10

# When downloading a file, if the file size is smaller than this threshold,
# we'll use a single-threaded download even if parallel download is enabled.
files_ext_parallel_download_min_file_size: int = 50 * 1024 * 1024 # 50 MiB

# Default chunk size to use when downloading a file in parallel. Has no effect on single-threaded downloads.
files_ext_parallel_download_default_part_size: int = 10 * 1024 * 1024 # 10 MiB

# This is not a "wall time" cutoff for the whole request,
# but the maximum time between consecutive data reception events (even 1 byte) from the server.
multipart_upload_single_chunk_upload_timeout_seconds: float = 60
files_ext_network_transfer_inactivity_timeout_seconds: float = 60

# Cap on the number of custom retries during incremental uploads:
# 1) multipart: an upload part URL has expired, so new upload URLs must be requested to continue the upload
# 2) resumable: a chunk upload produced a retryable response (or exception), so the upload status must be
#    retrieved to continue the upload.
# In these two cases the standard SDK retries (which are capped by the `retry_timeout_seconds` option) are not used.
# Note that the retry counter is reset when the upload is successfully resumed.
multipart_upload_max_retries = 3
files_ext_multipart_upload_max_retries = 3

# Cap on the number of custom retries during parallel downloads.
files_ext_parallel_download_max_retries = 3

def __init__(
self,
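
The `files_ext_multipart_upload_part_size_options` list above implies an automatic part-size selection step. A minimal sketch of how such a selection might work, assuming a 10,000-part cap (AWS S3's documented limit); the helper names are hypothetical, not the SDK's actual logic:

from typing import List, Optional

# Hypothetical helper: pick the smallest configured part size that keeps the
# upload within `max_parts` parts (10,000 is AWS S3's documented limit).
def pick_part_size(file_size: int, options: List[int], max_parts: int = 10_000) -> Optional[int]:
    for part_size in sorted(options):
        if (file_size + part_size - 1) // part_size <= max_parts:
            return part_size
    return None  # too large even for the biggest configured part size

# Hypothetical helper mirroring files_ext_parallel_download_min_file_size:
# files below the threshold are downloaded single-threaded even when
# parallel download is enabled.
def should_download_in_parallel(file_size: int, min_file_size: int = 50 * 1024 * 1024) -> bool:
    return file_size >= min_file_size

options = [10 * 1024 * 1024, 100 * 1024 * 1024, 4 * 1024 * 1024 * 1024]
# 200 GiB / 10 MiB = 20,480 parts (> 10,000), so 10 MiB is rejected;
# 200 GiB / 100 MiB = 2,048 parts, so 100 MiB is selected.
print(pick_part_size(200 * 1024 * 1024 * 1024, options))  # 104857600 (100 MiB)
print(should_download_in_parallel(1024 * 1024))  # False: 1 MiB is below 50 MiB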