Commit 68e7ec1

Updating branch
1 parent f11ff15 commit 68e7ec1

File tree

4 files changed: +55 -1 lines changed


README.md

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@
 - [AWS Glue Wheel](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/install.html#aws-glue-wheel)
 - [Amazon SageMaker Notebook](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/install.html#amazon-sagemaker-notebook)
 - [Amazon SageMaker Notebook Lifecycle](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/install.html#amazon-sagemaker-notebook-lifecycle)
+- [EMR](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/install.html#emr)
+- [From source](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/install.html#from-source)
 - [**Tutorials**](https://github.com/awslabs/aws-data-wrangler/tree/dev-1.0.0/tutorials)
 - [**API Reference**](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/api.html)
 - [Amazon S3](https://aws-data-wrangler.readthedocs.io/en/dev-1.0.0/api.html#amazon-s3)

awswrangler/s3.py

Lines changed: 2 additions & 1 deletion
@@ -1198,10 +1198,11 @@ def _read_parquet_init(
     """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset."""
     if dataset is False:
         path_or_paths: Union[str, List[str]] = _path2list(path=path, boto3_session=boto3_session)
+    elif isinstance(path, str):
+        path_or_paths = path[:-1] if path.endswith("/") else path
     else:
         path_or_paths = path
     _logger.debug(f"path_or_paths: {path_or_paths}")
-    print(f"path_or_paths: {path_or_paths}")
     fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs)
     cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
     data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
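The new `elif` branch trims a trailing `/` from a single string path before it reaches `pyarrow.parquet.ParquetDataset`, presumably so the dataset prefix resolves cleanly, and the leftover debug `print` is dropped in favor of the existing `_logger.debug` call. A minimal standalone sketch of the same normalization; the function name here is illustrative, not part of the library:

    from typing import List, Union

    def _normalize_parquet_path(path: Union[str, List[str]]) -> Union[str, List[str]]:
        """Drop a trailing '/' from a single path; lists of explicit paths pass through."""
        if isinstance(path, str):
            return path[:-1] if path.endswith("/") else path
        return path

    assert _normalize_parquet_path("s3://bucket/dataset/") == "s3://bucket/dataset"
    assert _normalize_parquet_path(["s3://bucket/a.parquet"]) == ["s3://bucket/a.parquet"]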

docs/source/install.rst

Lines changed: 49 additions & 0 deletions
@@ -5,6 +5,10 @@ Install
 and on several platforms (AWS Lambda, AWS Glue Python Shell, EMR, EC2,
 on-premises, Amazon SageMaker, local, etc).
 
+Some good practices for most of the methods below are:
+
+- Use a new, dedicated virtual environment for each project (`venv <https://docs.python.org/3/library/venv.html>`_).
+- In notebooks, always restart your kernel after installing.
+
 PyPI (pip)
 ----------
 
@@ -86,3 +90,48 @@ SageMaker kernels (`Reference <https://github.com/aws-samples/amazon-sagemaker-n
     done
 
     EOF
+
+EMR
+---
+
+Even though it is not a distributed library, AWS Data Wrangler can be a good
+helper for complementing Big Data pipelines.
+
+- Configure Python 3 as the default interpreter for PySpark under your cluster
+  configuration:
+
+.. code-block:: json
+
+    [
+        {
+            "Classification": "spark-env",
+            "Configurations": [
+                {
+                    "Classification": "export",
+                    "Properties": {
+                        "PYSPARK_PYTHON": "/usr/bin/python3"
+                    }
+                }
+            ]
+        }
+    ]
+
+- Keep the bootstrap script below on S3 and reference it in your cluster
+  configuration:
+
+.. code-block:: sh
+
+    #!/usr/bin/env bash
+    set -ex
+
+    sudo pip-3.6 install awswrangler
+
+.. note:: Make sure to pin the Wrangler version in the bootstrap script for
+    production environments (e.g. ``awswrangler==1.0.0``).
+
+From Source
+-----------
+
+>>> git clone https://github.com/awslabs/aws-data-wrangler.git
+>>> cd aws-data-wrangler
+>>> pip install .
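For context on where the two snippets above plug in: the ``spark-env`` classification belongs in the cluster's ``Configurations``, and the bootstrap script is referenced as a bootstrap action at cluster creation. A minimal boto3 sketch under assumed values; the cluster name, bucket, subnet, and instance type below are placeholders, not taken from this commit:

    import boto3

    emr = boto3.client("emr")

    response = emr.run_job_flow(
        Name="wrangler-cluster",  # placeholder name
        ReleaseLabel="emr-5.28.0",
        Applications=[{"Name": "Spark"}],
        # Same spark-env classification shown in the docs above.
        Configurations=[
            {
                "Classification": "spark-env",
                "Configurations": [
                    {
                        "Classification": "export",
                        "Properties": {"PYSPARK_PYTHON": "/usr/bin/python3"},
                    }
                ],
            }
        ],
        # Points at the bootstrap script kept on S3 (placeholder path).
        BootstrapActions=[
            {
                "Name": "install-awswrangler",
                "ScriptBootstrapAction": {"Path": "s3://my-bucket/bootstrap.sh"},
            }
        ],
        Instances={
            "InstanceGroups": [
                {"InstanceRole": "MASTER", "InstanceType": "m5.xlarge", "InstanceCount": 1}
            ],
            "Ec2SubnetId": "subnet-0123456789abcdef0",  # placeholder subnet
            "KeepJobFlowAliveWhenNoSteps": True,
        },
        JobFlowRole="EMR_EC2_DefaultRole",
        ServiceRole="EMR_DefaultRole",
    )
    print(response["JobFlowId"])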

testing/test_awswrangler/test_emr.py

Lines changed: 2 additions & 0 deletions
@@ -83,6 +83,7 @@ def test_cluster(bucket, cloudformation_outputs):
     step_state = wr.emr.get_step_state(cluster_id=cluster_id, step_id=step_id)
     assert step_state == "PENDING"
     wr.emr.terminate_cluster(cluster_id=cluster_id)
+    wr.s3.delete_objects(f"s3://{bucket}/emr-logs/")
 
 
 def test_cluster_single_node(bucket, cloudformation_outputs):
@@ -144,3 +145,4 @@ def test_cluster_single_node(bucket, cloudformation_outputs):
     steps.append(wr.emr.build_step(name=cmd, command=cmd))
     wr.emr.submit_steps(cluster_id=cluster_id, steps=steps)
     wr.emr.terminate_cluster(cluster_id=cluster_id)
+    wr.s3.delete_objects(f"s3://{bucket}/emr-logs/")
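Both tests now delete the EMR logs they leave under ``s3://{bucket}/emr-logs/`` after terminating the cluster. If this teardown keeps recurring, a pytest fixture could centralize it; a sketch assuming the same ``wr`` API used in the diff, with a hypothetical fixture name:

    import pytest
    import awswrangler as wr

    @pytest.fixture
    def emr_log_path(bucket):
        """Hand the test its log prefix, then clean up whatever landed there."""
        path = f"s3://{bucket}/emr-logs/"
        yield path
        wr.s3.delete_objects(path)  # same cleanup call the tests add inline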
