
Commit 9ce624b

Add support for EMR with Docker
1 parent 2f91a50 commit 9ce624b

File tree

13 files changed: +869 additions, -79 deletions


awswrangler/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -9,5 +9,6 @@

 from awswrangler import athena, catalog, cloudwatch, db, emr, exceptions, s3  # noqa
 from awswrangler.__metadata__ import __description__, __license__, __title__, __version__  # noqa
+from awswrangler._utils import get_account_id  # noqa

 logging.getLogger("awswrangler").addHandler(logging.NullHandler())

awswrangler/_utils.py

Lines changed: 13 additions & 0 deletions
@@ -166,3 +166,16 @@ def ensure_postgresql_casts():
 def get_directory(path: str) -> str:
     """Extract directory path."""
     return path.rsplit(sep="/", maxsplit=1)[0] + "/"
+
+
+def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str:
+    """Get Account ID."""
+    session: boto3.Session = ensure_session(session=boto3_session)
+    return client(service_name="sts", session=session).get_caller_identity().get("Account")
+
+
+def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session] = None) -> str:
+    """Extract region from Subnet ID."""
+    session: boto3.Session = ensure_session(session=boto3_session)
+    client_ec2: boto3.client = client(service_name="ec2", session=session)
+    return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9]
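
For reference, a minimal usage sketch of the two new helpers (the account id, subnet id, and returned region below are hypothetical placeholders). Note that the [:9] slice assumes a nine-character region name such as us-east-1; a longer region name (e.g. ap-southeast-1) would be truncated.

>>> import awswrangler as wr
>>> wr.get_account_id()  # now re-exported at package level (see __init__.py above)
'123456789012'
>>> wr._utils.get_region_from_subnet(subnet_id='subnet-0abc1234')
'us-east-1'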

awswrangler/athena.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def create_athena_bucket(boto3_session: Optional[boto3.Session] = None) -> str:

     """
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
-    account_id: str = _utils.client(service_name="sts", session=session).get_caller_identity().get("Account")
+    account_id: str = _utils.get_account_id(boto3_session=session)
     region_name: str = str(session.region_name).lower()
     s3_output = f"s3://aws-athena-query-results-{account_id}-{region_name}/"
     s3_resource = session.resource("s3")
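
The behavior is unchanged; only the STS lookup moves into the shared helper. A usage sketch, with an illustrative account id and region:

>>> import awswrangler as wr
>>> wr.athena.create_athena_bucket()
's3://aws-athena-query-results-123456789012-us-east-1/'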

awswrangler/emr.py

Lines changed: 271 additions & 64 deletions
Large diffs are not rendered by default.
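
Although the emr.py diff is collapsed, its new surface can be inferred from testing/test_awswrangler/test_emr.py below. A hedged sketch of the Docker-related additions, using only parameter and function names that appear in that test (the subnet id and image URI are placeholders); since ECR authorization tokens expire after about 12 hours, exposing update_ecr_credentials as a re-runnable call makes sense for long-lived clusters:

import awswrangler as wr

# Sketch only: names are taken from test_emr.py below; the real
# implementation lives in the collapsed emr.py diff.
cluster_id = wr.emr.create_cluster(
    subnet_id="subnet-0abc1234",  # placeholder subnet
    docker=True,  # enable the YARN Docker runtime
    spark_docker=True,  # run Spark jobs inside containers
    spark_docker_image="<account>.dkr.ecr.<region>.amazonaws.com/repo:tag",
    hive_docker=True,
    ecr_credentials_step=True,  # adds a step that logs the cluster in to ECR
)
wr.emr.update_ecr_credentials(cluster_id=cluster_id)  # refresh before the token lapses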

awswrangler/s3.py

Lines changed: 58 additions & 7 deletions
@@ -111,6 +111,40 @@ def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None)
         raise ex  # pragma: no cover


+def list_directories(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]:
+    """List Amazon S3 directories (common prefixes) under a prefix.
+
+    Parameters
+    ----------
+    path : str
+        S3 path (e.g. s3://bucket/prefix).
+    boto3_session : boto3.Session(), optional
+        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
+
+    Returns
+    -------
+    List[str]
+        List of directory paths.
+
+    Examples
+    --------
+    Using the default boto3 session
+
+    >>> import awswrangler as wr
+    >>> wr.s3.list_directories('s3://bucket/prefix/')
+    ['s3://bucket/prefix/dir0/', 's3://bucket/prefix/dir1/', 's3://bucket/prefix/dir2/']
+
+    Using a custom boto3 session
+
+    >>> import boto3
+    >>> import awswrangler as wr
+    >>> wr.s3.list_directories('s3://bucket/prefix/', boto3_session=boto3.Session())
+    ['s3://bucket/prefix/dir0/', 's3://bucket/prefix/dir1/', 's3://bucket/prefix/dir2/']
+
+    """
+    return _list_objects(path=path, delimiter="/", boto3_session=boto3_session)
+
+
 def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]:
     """List Amazon S3 objects from a prefix.
@@ -142,20 +176,37 @@ def list_objects(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]:
     ['s3://bucket/prefix0', 's3://bucket/prefix1', 's3://bucket/prefix2']

     """
+    return _list_objects(path=path, delimiter=None, boto3_session=boto3_session)
+
+
+def _list_objects(
+    path: str, delimiter: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
+) -> List[str]:
     client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
     paginator = client_s3.get_paginator("list_objects_v2")
     bucket: str
     prefix: str
     bucket, prefix = _utils.parse_path(path=path)
-    response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix, PaginationConfig={"PageSize": 1000})
+    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}}
+    if delimiter is not None:
+        args["Delimiter"] = delimiter
+    response_iterator = paginator.paginate(**args)
     paths: List[str] = []
     for page in response_iterator:
-        contents: Optional[List] = page.get("Contents")
-        if contents is not None:
-            for content in contents:
-                if (content is not None) and ("Key" in content):
-                    key: str = content["Key"]
-                    paths.append(f"s3://{bucket}/{key}")
+        if delimiter is None:
+            contents: Optional[List[Optional[Dict[str, str]]]] = page.get("Contents")
+            if contents is not None:
+                for content in contents:
+                    if (content is not None) and ("Key" in content):
+                        key: str = content["Key"]
+                        paths.append(f"s3://{bucket}/{key}")
+        else:
+            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
+            if prefixes is not None:
+                for pfx in prefixes:
+                    if (pfx is not None) and ("Prefix" in pfx):
+                        key = pfx["Prefix"]
+                        paths.append(f"s3://{bucket}/{key}")
     return paths
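
For context, the new delimiter switch maps directly onto S3 ListObjectsV2 semantics: with Delimiter="/", keys sharing the next path segment are rolled up into CommonPrefixes instead of being listed one by one under Contents. A minimal boto3 sketch (bucket and prefix are hypothetical):

import boto3

client = boto3.client("s3")
paginator = client.get_paginator("list_objects_v2")
# Each "directory" appears exactly once in CommonPrefixes, no matter how
# many objects live under it.
for page in paginator.paginate(Bucket="my-bucket", Prefix="prefix/", Delimiter="/"):
    for pfx in page.get("CommonPrefixes", []):
        print(pfx["Prefix"])  # e.g. "prefix/dir0/"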

docs/source/api.rst

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@ Amazon S3
     does_object_exist
     get_bucket_region
     list_objects
+    list_directories
     read_csv
     read_fwf
     read_json
@@ -115,6 +116,7 @@ EMR
     submit_steps
     build_step
     get_step_state
+    update_ecr_credentials

 CloudWatch Logs
 ---------------

requirements-dev.txt

Lines changed: 2 additions & 1 deletion
@@ -17,4 +17,5 @@ twine~=3.1.1
 wheel~=0.34.2
 sphinx~=3.0.1
 sphinx_bootstrap_theme~=0.7.1
-moto~=1.3.14
+moto~=1.3.14
+jupyterlab~=2.1.1

testing/test_awswrangler/test_cloudwatch.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def loggroup(cloudformation_outputs):
 def test_query_cancelled(loggroup):
     client_logs = boto3.client("logs")
     query_id = wr.cloudwatch.start_query(
-        log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc | limit 5"
+        log_group_names=[loggroup], query="fields @timestamp, @message | sort @timestamp desc"
     )
     client_logs.stop_query(queryId=query_id)
     with pytest.raises(exceptions.QueryCancelled):

testing/test_awswrangler/test_data_lake.py

Lines changed: 3 additions & 0 deletions
@@ -127,6 +127,9 @@ def test_athena_ctas(bucket, database, kms_key):
         partition_cols=["par0", "par1"],
     )["paths"]
     wr.s3.wait_objects_exist(paths=paths)
+    dirs = wr.s3.list_directories(path=f"s3://{bucket}/test_athena_ctas/")
+    for d in dirs:
+        assert d.startswith(f"s3://{bucket}/test_athena_ctas/par0=")
     df = wr.s3.read_parquet_table(table="test_athena_ctas", database=database)
     assert len(df.index) == 3
     ensure_data_types(df=df, has_list=True)

testing/test_awswrangler/test_emr.py

Lines changed: 33 additions & 0 deletions
@@ -146,3 +146,36 @@ def test_cluster_single_node(bucket, cloudformation_outputs):
     wr.emr.submit_steps(cluster_id=cluster_id, steps=steps)
     wr.emr.terminate_cluster(cluster_id=cluster_id)
     wr.s3.delete_objects(f"s3://{bucket}/emr-logs/")
+
+
+def test_default_logging_path(cloudformation_outputs):
+    path = wr.emr._get_default_logging_path(subnet_id=cloudformation_outputs["SubnetId"])
+    assert path.startswith("s3://aws-logs-")
+    assert path.endswith("/elasticmapreduce/")
+    with pytest.raises(wr.exceptions.InvalidArgumentCombination):
+        wr.emr._get_default_logging_path()
+
+
+def test_docker(cloudformation_outputs):
+    cluster_id = wr.emr.create_cluster(
+        subnet_id=cloudformation_outputs["SubnetId"],
+        docker=True,
+        spark_docker=True,
+        spark_docker_image="787535711150.dkr.ecr.us-east-1.amazonaws.com/docker-emr:docker-emr",
+        hive_docker=True,
+        ecr_credentials_step=True,
+        custom_classifications=[
+            {
+                "Classification": "livy-conf",
+                "Properties": {
+                    "livy.spark.master": "yarn",
+                    "livy.spark.deploy-mode": "cluster",
+                    "livy.server.session.timeout": "16h",
+                },
+            }
+        ],
+        steps=[wr.emr.build_step("spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")],
+    )
+    wr.emr.submit_step(cluster_id=cluster_id, command="spark-submit --deploy-mode cluster s3://igor-tavares/emr.py")
+    wr.emr.update_ecr_credentials(cluster_id=cluster_id)
+    wr.emr.terminate_cluster(cluster_id=cluster_id)
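
The two asserts in test_default_logging_path pin down the shape of the default log location. A hedged reconstruction, assuming the collapsed emr.py code simply combines the new get_account_id and get_region_from_subnet helpers (the exact format string is not visible here):

# Not the verbatim source; this just satisfies the startswith/endswith asserts above.
def default_logging_path_sketch(account_id: str, region: str) -> str:
    return f"s3://aws-logs-{account_id}-{region}/elasticmapreduce/"

assert default_logging_path_sketch("123456789012", "us-east-1") == (
    "s3://aws-logs-123456789012-us-east-1/elasticmapreduce/"
)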
