updated default block size #509
@@ -69,7 +69,7 @@
     "is_current_version",
 ]
 _ROOT_PATH = "/"
-_DEFAULT_BLOCK_SIZE = 4 * 1024 * 1024
+_DEFAULT_BLOCK_SIZE = 50 * 2**20

 _SOCKET_TIMEOUT_DEFAULT = object()

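For context on the size of this change: Azure block blobs allow at most 50,000 committed blocks, so the default block size bounds the largest blob a chunked upload can produce. Rough arithmetic only, not part of the diff:

    # Back-of-the-envelope check of the maximum blob size per default block size.
    MAX_BLOCKS = 50_000                       # Azure limit on committed blocks per block blob

    old_default = 4 * 1024 * 1024             # previous default: 4 MiB
    new_default = 50 * 2**20                  # new default: 50 MiB

    print(old_default * MAX_BLOCKS / 2**40)   # ~0.19 TiB maximum blob with 4 MiB blocks
    print(new_default * MAX_BLOCKS / 2**40)   # ~2.38 TiB maximum blob with 50 MiB blocks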
@@ -177,8 +177,7 @@ class AzureBlobFileSystem(AsyncFileSystem):
         The credentials with which to authenticate. Optional if the account URL already has a SAS token.
         Can include an instance of TokenCredential class from azure.identity.aio.
     blocksize: int
-        The block size to use for download/upload operations. Defaults to hardcoded value of
-        ``BlockBlobService.MAX_BLOCK_SIZE``
+        The block size to use for download/upload operations. Defaults to 50 MiB
     client_id: str
         Client ID to use when authenticating using an AD Service Principal client/secret.
     client_secret: str
@@ -1879,6 +1878,8 @@ def _open(
         is versioning aware and blob versioning is enabled on the releveant container.
         """
         logger.debug(f"_open: {path}")
+        if block_size is None:
+            block_size = self.blocksize
         if not self.version_aware and version_id:
             raise ValueError(
                 "version_id cannot be specified if the filesystem "

Review comment on the added lines: "We should make sure to update the"
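With this fallback, a file opened without an explicit block size inherits the filesystem-level blocksize, while an explicit value still wins. A minimal usage sketch, assuming fsspec's standard block_size argument to open(); the account name and connection string are placeholders, not from the PR:

    from adlfs import AzureBlobFileSystem

    fs = AzureBlobFileSystem(
        account_name="myaccount",                 # placeholder
        connection_string="<connection string>",  # placeholder
        blocksize=50 * 2**20,                     # 50 MiB, matching the new default
    )

    with fs.open("container/path/file.bin", "wb") as f:
        assert f.blocksize == 50 * 2**20          # inherited from the filesystem

    with fs.open("container/path/file.bin", "wb", block_size=2**20) as f:
        assert f.blocksize == 2**20               # explicit per-file value still wins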
@@ -1901,7 +1902,7 @@ def _open(
 class AzureBlobFile(AbstractBufferedFile):
     """File-like operations on Azure Blobs"""

-    DEFAULT_BLOCK_SIZE = 5 * 2**20
+    DEFAULT_BLOCK_SIZE = _DEFAULT_BLOCK_SIZE

     def __init__(
         self,
@@ -2146,7 +2147,7 @@ async def _async_initiate_upload(self, **kwargs):

     _initiate_upload = sync_wrapper(_async_initiate_upload)

-    def _get_chunks(self, data, chunk_size=1024**3):  # Keeping the chunk size as 1 GB
+    def _get_chunks(self, data, chunk_size):
         start = 0
         length = len(data)
         while start < length:
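The diff truncates the body of _get_chunks. A hypothetical, self-contained equivalent of the chunking generator (the real method may differ in details):

    def get_chunks(data: bytes, chunk_size: int):
        """Yield successive chunk_size-byte slices of data (sketch, not the library code)."""
        start = 0
        length = len(data)
        while start < length:
            end = min(start + chunk_size, length)
            yield data[start:end]
            start = end

    # 11 bytes split into 4-byte chunks -> lengths 4, 4, 3
    assert [len(c) for c in get_chunks(b"x" * 11, 4)] == [4, 4, 3]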
@@ -2173,7 +2174,7 @@ async def _async_upload_chunk(self, final: bool = False, **kwargs):
             commit_kw["headers"] = {"If-None-Match": "*"}
         if self.mode in {"wb", "xb"}:
             try:
-                for chunk in self._get_chunks(data):
+                for chunk in self._get_chunks(data, self.blocksize):
                     async with self.container_client.get_blob_client(
                         blob=self.blob
                     ) as bc:
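Passing self.blocksize here is the substantive behavior change: previously _get_chunks fell back to its 1 GiB default, so any buffered write smaller than 1 GiB was staged as a single block regardless of the configured blocksize. Rough arithmetic to illustrate, not part of the diff:

    import math

    blocksize = 50 * 2**20          # configured block size (50 MiB)
    buffered = 120 * 2**20          # size of one flushed write buffer (example value)

    print(math.ceil(buffered / 1024**3))    # 1 staged block with the old 1 GiB default
    print(math.ceil(buffered / blocksize))  # 3 staged blocks with blocksize passed through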
@@ -1,7 +1,9 @@
 import datetime
+import math
 import os
 import tempfile
 from unittest import mock
+from unittest.mock import patch

 import azure.storage.blob.aio
 import dask.dataframe as dd
@@ -2045,3 +2047,37 @@ def test_open_file_x(storage: azure.storage.blob.BlobServiceClient, tmpdir):
     with fs.open("data/afile", "xb") as f:
         pass
     assert fs.cat_file("data/afile") == b"data"
+
+
+@pytest.mark.parametrize("blocksize", [5 * 2**20, 50 * 2**20, 100 * 2**20])
+def test_number_of_blocks(storage, mocker, blocksize):
+    fs = AzureBlobFileSystem(
+        account_name=storage.account_name,
+        connection_string=CONN_STR,
+        blocksize=blocksize,
+    )
+
+    content = b"1" * (blocksize * 2 + 1)
+    with fs.open("data/root/a/file.txt", "wb", blocksize=blocksize) as f:
+        mocker.patch(
+            "azure.storage.blob.aio.BlobClient.commit_block_list", autospec=True
+        )
+        with patch(
+            "azure.storage.blob.aio.BlobClient.stage_block", autospec=True
+        ) as mock_stage_block:
+            f.write(content)
+            expected_blocks = math.ceil(len(content) / blocksize)
+            actual_blocks = mock_stage_block.call_count
+            assert actual_blocks == expected_blocks
+
+
+def test_block_size(storage):
+    fs = AzureBlobFileSystem(
+        account_name=storage.account_name,
+        connection_string=CONN_STR,
+        blocksize=5 * 2**20,
+    )
+
+    with fs.open("data/root/a/file.txt", "wb") as f:
+        assert f.blocksize == 5 * 2**20
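To run only the tests added above, selecting by keyword avoids assuming the test file's path; the storage fixture still needs the project's usual test setup (for example a local storage emulator):

    import pytest

    # Select the new block-size tests by name; -q keeps output terse.
    pytest.main(["-q", "-k", "number_of_blocks or block_size"])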
Review comment: We should also make sure to update the changelog in this PR. I'm thinking we can just use the three bullet points from the comment I made at #509 (comment) and make the wording a bit more succinct.