Commit 3f3a9b7

Fix s3 filesystem abstraction bugs.
1 parent 5b39a17 commit 3f3a9b7

4 files changed: +23 −8 lines


awswrangler/_utils.py

Lines changed: 1 addition & 2 deletions
@@ -282,8 +282,7 @@ def get_even_chunks_sizes(total_size: int, chunk_size: int, upper_bound: bool) -
     """Calculate even chunks sizes (Best effort)."""
     round_func: Callable[[float], float] = math.ceil if upper_bound is True else math.floor
     num_chunks: int = int(round_func(float(total_size) / float(chunk_size)))
-    if num_chunks < 1:
-        raise ValueError("Invalid chunks size requirements.")
+    num_chunks = 1 if num_chunks < 1 else num_chunks
     base_size: int = int(total_size / num_chunks)
     rest: int = total_size % num_chunks
     sizes: List[int] = list(itertools.repeat(base_size, num_chunks))
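
Net effect: get_even_chunks_sizes now clamps to a single chunk when the requested chunk size exceeds the total size, instead of raising ValueError. A minimal standalone sketch of the patched logic (the helper name and the remainder-distribution loop below are illustrative, not the library's exact implementation):

    import itertools
    import math
    from typing import List

    def even_chunk_sizes(total_size: int, chunk_size: int, upper_bound: bool) -> List[int]:
        # Mirrors the patched branch: never fewer than one chunk.
        round_func = math.ceil if upper_bound else math.floor
        num_chunks = int(round_func(float(total_size) / float(chunk_size)))
        num_chunks = 1 if num_chunks < 1 else num_chunks
        base_size = int(total_size / num_chunks)
        rest = total_size % num_chunks
        sizes: List[int] = list(itertools.repeat(base_size, num_chunks))
        for i in range(rest):  # spread the remainder over the first chunks
            sizes[i] += 1
        return sizes

    # Before the fix this case raised ValueError("Invalid chunks size requirements.");
    # now it degrades to a single chunk covering the whole object.
    assert even_chunk_sizes(10, 1_048_576, upper_bound=False) == [10]
    assert even_chunk_sizes(10, 3, upper_bound=False) == [4, 3, 3]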

awswrangler/s3/_fs.py

Lines changed: 5 additions & 3 deletions
@@ -332,9 +332,11 @@ def _fetch(self, start: int, end: int) -> None:
         )

         # Calculating missing bytes in cache
-        if (new_block_start < self._start and new_block_end > self._end) or (
-            new_block_start > self._end and new_block_end < self._start
-        ):  # Full block download
+        if (  # Full block download
+            (new_block_start < self._start and new_block_end > self._end)
+            or new_block_start > self._end
+            or new_block_end < self._start
+        ):
             self._cache = self._fetch_range_proxy(new_block_start, new_block_end)
         elif new_block_end > self._end:
             prune_diff: int = new_block_start - self._start
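
The old predicate's second clause required new_block_start > self._end and new_block_end < self._start at the same time, which no valid request (start <= end, against a cache window with self._start <= self._end) can satisfy, so a read falling entirely outside the cached window never took the full-download path. The rewritten condition fetches the whole block whenever the request spans past both ends of the cache or lies completely before or after it. A minimal sketch of that decision as a pure function (the name needs_full_download is illustrative):

    def needs_full_download(cache_start: int, cache_end: int, new_start: int, new_end: int) -> bool:
        # Full fetch when the new block covers the cache on both sides,
        # or sits entirely after it, or sits entirely before it.
        return (
            (new_start < cache_start and new_end > cache_end)
            or new_start > cache_end
            or new_end < cache_start
        )

    assert needs_full_download(0, 1_000, 999_000, 1_000_000)   # seek far past the cache
    assert not needs_full_download(0, 1_000, 500, 1_500)       # tail overlap -> partial refresh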

awswrangler/s3/_read_parquet.py

Lines changed: 5 additions & 3 deletions
@@ -40,7 +40,7 @@ def _read_parquet_metadata_file(
         path=path,
         mode="rb",
         use_threads=use_threads,
-        s3_block_size=1_048_576,  # 1 MB (1 * 2**20)
+        s3_block_size=131_072,  # 128 KB (128 * 2**10)
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
@@ -339,12 +339,14 @@ def _count_row_groups(
         path=path,
         mode="rb",
         use_threads=use_threads,
-        s3_block_size=1_048_576,  # 1 MB (1 * 2**20)
+        s3_block_size=131_072,  # 128 KB (128 * 2**10)
         s3_additional_kwargs=s3_additional_kwargs,
         boto3_session=boto3_session,
     ) as f:
         pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
-        return cast(int, pq_file.num_row_groups)
+        n: int = cast(int, pq_file.num_row_groups)
+        _logger.debug("Row groups count: %d", n)
+        return n


 def _read_parquet_row_group(
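
Both call sites feed pyarrow.parquet.ParquetFile, which reads metadata from the Parquet file footer, so the commit shrinks the per-request block from 1 MiB to 128 KiB. A quick check of the arithmetic spelled out in the inline comments above:

    # Values from the inline comments above.
    OLD_BLOCK = 1 * 2**20     # 1_048_576 bytes, 1 MiB
    NEW_BLOCK = 128 * 2**10   # 131_072 bytes, 128 KiB

    assert OLD_BLOCK == 1_048_576
    assert NEW_BLOCK == 131_072
    assert OLD_BLOCK // NEW_BLOCK == 8  # each footer read now uses a block 8x smaller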

tests/test_fs.py

Lines changed: 12 additions & 0 deletions
@@ -174,3 +174,15 @@ def test_cache(path, use_threads, block_size, text):
             assert value == text[i].encode("utf-8")
             assert len(s3obj._cache) in (block_size, block_size - 1, len(text))
     assert s3obj._cache == b""
+
+
+def test_cache_seek(path):
+    client_s3 = boto3.client("s3")
+    path = f"{path}0.txt"
+    bucket, key = wr._utils.parse_path(path)
+    text = "0" * 1_000_000 + "1" * 4
+    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
+    with open_s3_object(path, mode="rb", s3_block_size=1_000) as s3obj:
+        s3obj.seek(1_000_000)
+        assert s3obj.read(100).decode("utf-8") == "1" * 4
+    assert s3obj._cache == b""
