Tuning the default_block_size for s3fs

igorborgest · igorborgest · commit f100749ebb91 · 2020-04-08T19:51:31.000-03:00
diff --git a/README.md b/README.md
@@ -34,6 +34,10 @@
   - [3 - Amazon S3](https://github.com/awslabs/aws-data-wrangler/blob/dev-1.0.0/tutorials/3%20-%20Amazon%20S3.ipynb)
   - [4 - Parquet Datasets](https://github.com/awslabs/aws-data-wrangler/blob/dev-1.0.0/tutorials/4%20-%20Parquet%20Datasets.ipynb)
   - [5 - Glue Catalog](https://github.com/awslabs/aws-data-wrangler/blob/dev-1.0.0/tutorials/5%20-%20Glue%20Catalog.ipynb)
+  - [6 - Amazon Athena](https://github.com/awslabs/aws-data-wrangler/blob/dev-1.0.0/tutorials/6%20-%20Amazon%20Athena.ipynb)
+  - [7 - Databases (Redshift, MySQL and PostgreSQL)](https://github.com/awslabs/aws-data-wrangler/blob/dev-1.0.0/tutorials/7%20-%20Redshift%2C%20MySQL%2C%20PostgreSQL.ipynb)
+  - [8 - Redshift Copy & Unload](https://github.com/awslabs/aws-data-wrangler/blob/dev-1.0.0/tutorials/8%20-%20Redshift%20Copy%20%26%20Unload.ipynb)
+  - [9 - Parquet Crawler](https://github.com/awslabs/aws-data-wrangler/blob/dev-1.0.0/tutorials/9%20-%20Parquet%20Crawler.ipynb)
 - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/api.html)
   - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/api.html#amazon-s3)
   - [AWS Glue Catalog](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/api.html#aws-glue-catalog)
diff --git a/awswrangler/_utils.py b/awswrangler/_utils.py
@@ -133,7 +133,7 @@ def get_fs(
         use_ssl=True,
         default_cache_type="none",
         default_fill_cache=False,
-        default_block_size=52_428_800,  # 50 MB (50 * 2**20)
+        default_block_size=134_217_728,  # 128 MB (128 * 2**20)
         config_kwargs={"retries": {"mode": "adaptive", "max_attempts": 10}},
         session=ensure_session(session=session)._session,  # pylint: disable=protected-access
         s3_additional_kwargs=s3_additional_kwargs,
diff --git a/awswrangler/athena.py b/awswrangler/athena.py
@@ -340,15 +340,15 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
 
     1 - `ctas_approach=True` (`Default`):
     Wrap the query with a CTAS and then reads the table data as parquet directly from s3.
-    PROS: Faster and can handle some level of nested types
+    PROS: Faster and can handle some level of nested types.
     CONS: Requires create/delete table permissions on Glue and Does not support timestamp with time zone
     (A temporary table will be created and then deleted immediately).
 
     2 - `ctas_approach False`:
     Does a regular query on Athena and parse the regular CSV result on s3.
-    PROS: Does not require create/delete table permissions on Glue and give support timestamp with time zone.
+    PROS: Does not require create/delete table permissions on Glue and supports timestamp with time zone.
     CONS: Slower (But stills faster than other libraries that uses the regular Athena API)
-    and does not handle nested types at all
+    and does not handle nested types at all.
 
     Note
     ----
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -10,7 +10,7 @@ pytest~=5.4.1
 pytest-cov~=2.8.1
 pytest-xdist~=1.31.0
 scikit-learn~=0.22.1
-awscli~=1.18.37
+awscli~=1.18.39
 cfn-lint~=0.29.4
 twine~=3.1.1
 wheel~=0.34.2