Skip to content

Commit 56742ca

Browse files
committed
Add one_shot_download strategy.
1 parent b0e6e97 commit 56742ca

File tree

3 files changed: 16 additions (+16) and 29 deletions (-29).

awswrangler/s3/_fs.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -192,8 +192,13 @@ def __init__(
192192
if mode not in {"rb", "wb", "r", "w"}:
193193
raise NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode)
194194
self._mode: str = "rb" if mode is None else mode
195-
if s3_block_size < 2:
196-
raise exceptions.InvalidArgumentValue("s3_block_size MUST > 1")
195+
self._one_shot_download: bool = False
196+
if s3_block_size == 1:
197+
raise exceptions.InvalidArgumentValue("s3_block_size MUST > 1 to define a valid size or "
198+
"< 1 to avoid blocks and always execute one shot downloads.")
199+
elif s3_block_size < 1:
200+
_logger.debug(f"s3_block_size of %d, enabling one_shot_download.", s3_block_size)
201+
self._one_shot_download = True
197202
self._s3_block_size: int = s3_block_size
198203
self._s3_half_block_size: int = s3_block_size // 2
199204
self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
@@ -304,6 +309,12 @@ def _fetch(self, start: int, end: int) -> None:
304309
self._end = end
305310
return None
306311

312+
if self._one_shot_download:
313+
self._start = 0
314+
self._end = self._size
315+
self._cache = self._fetch_range_proxy(self._start, self._end)
316+
return None
317+
307318
# Calculating block START and END positions
308319
_logger.debug("Downloading: %s (start) / %s (end)", start, end)
309320
mid: int = int(math.ceil((start + (end - 1)) / 2))
@@ -525,7 +536,7 @@ def open_s3_object(
525536
mode: str,
526537
use_threads: bool = False,
527538
s3_additional_kwargs: Optional[Dict[str, str]] = None,
528-
s3_block_size: int = 4_194_304, # 4 MB (4 * 2**20)
539+
s3_block_size: int = -1, # One shot download
529540
boto3_session: Optional[boto3.Session] = None,
530541
newline: Optional[str] = "\n",
531542
encoding: Optional[str] = "utf-8",

awswrangler/s3/_read_parquet.py

Lines changed: 1 addition & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -318,7 +318,7 @@ def _read_parquet_file(
318318
path=path,
319319
mode="rb",
320320
use_threads=use_threads,
321-
s3_block_size=134_217_728, # 128 MB (128 * 2**20)
321+
s3_block_size=-1, # One shot download
322322
s3_additional_kwargs=s3_additional_kwargs,
323323
boto3_session=boto3_session,
324324
) as f:
@@ -348,30 +348,6 @@ def _count_row_groups(
348348
return n
349349

350350

351-
def _read_parquet_row_group(
352-
row_group: int,
353-
path: str,
354-
columns: Optional[List[str]],
355-
categories: Optional[List[str]],
356-
boto3_primitives: _utils.Boto3PrimitivesType,
357-
s3_additional_kwargs: Optional[Dict[str, str]],
358-
use_threads: bool,
359-
) -> pa.Table:
360-
boto3_session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
361-
with open_s3_object(
362-
path=path,
363-
mode="rb",
364-
use_threads=use_threads,
365-
s3_block_size=10_485_760, # 10 MB (10 * 2**20)
366-
s3_additional_kwargs=s3_additional_kwargs,
367-
boto3_session=boto3_session,
368-
) as f:
369-
pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
370-
num_row_groups: int = pq_file.num_row_groups
371-
_logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, num_row_groups)
372-
return pq_file.read_row_group(i=row_group, columns=columns, use_threads=False, use_pandas_metadata=False)
373-
374-
375351
def _read_parquet(
376352
path: str,
377353
columns: Optional[List[str]],

awswrangler/s3/_read_text.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ def _read_text_file(
7777
path=path,
7878
mode=mode,
7979
use_threads=use_threads,
80-
s3_block_size=134_217_728, # 128 MB (128 * 2**20)
80+
s3_block_size=-1, # One shot download
8181
encoding=encoding,
8282
s3_additional_kwargs=s3_additional_kwargs,
8383
newline=newline,

0 commit comments

Comments
 (0)