Commit 311563e

feat: Apache Spark on Amazon Athena - wr.athena.create_spark_session & wr.athena.run_spark_calculation (#2314)
* feat: Spark on Athena - checkpoint
* feat: Spark on Athena - temp remove result handling, add create_spark_session & fix types
* [skip ci] Docstrings
* [skip ci] Fix output types & add test case spark code
* Remove comments
* Upd api docs
* [skip ci] Add tutorial
* [skip ci] Add IAM role
* [skip ci] Update docstrings
* [skip ci] Add examples
* [skip ci] Reuse inline LF policy
1 parent 9f4d83d commit 311563e

File tree: 7 files changed (+492, -20 lines)

awswrangler/athena/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,7 @@
     start_query_execution,
     wait_query,
 )
+from awswrangler.athena._spark import create_spark_session, run_spark_calculation
 from awswrangler.athena._read import (  # noqa
     get_query_results,
     read_sql_query,
@@ -42,6 +43,8 @@
     "generate_create_query",
     "list_query_executions",
     "repair_table",
+    "create_spark_session",
+    "run_spark_calculation",
     "create_ctas_table",
     "show_create_table",
     "start_query_execution",

awswrangler/athena/_spark.py

Lines changed: 227 additions & 0 deletions
@@ -0,0 +1,227 @@
"""Apache Spark on Amazon Athena Module."""
# pylint: disable=too-many-lines
import logging
import time
from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast

import boto3

from awswrangler import _utils, exceptions

_logger: logging.Logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from mypy_boto3_athena.type_defs import (
        EngineConfigurationTypeDef,
        GetCalculationExecutionResponseTypeDef,
        GetCalculationExecutionStatusResponseTypeDef,
        GetSessionStatusResponseTypeDef,
    )

_SESSION_FINAL_STATES: List[str] = ["IDLE", "TERMINATED", "DEGRADED", "FAILED"]
_CALCULATION_EXECUTION_FINAL_STATES: List[str] = ["COMPLETED", "FAILED", "CANCELED"]
_SESSION_WAIT_POLLING_DELAY: float = 5.0  # SECONDS
_CALCULATION_EXECUTION_WAIT_POLLING_DELAY: float = 5.0  # SECONDS


def _wait_session(
    session_id: str,
    boto3_session: Optional[boto3.Session] = None,
    athena_session_wait_polling_delay: float = _SESSION_WAIT_POLLING_DELAY,
) -> "GetSessionStatusResponseTypeDef":
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    response: "GetSessionStatusResponseTypeDef" = client_athena.get_session_status(SessionId=session_id)
    state: str = response["Status"]["State"]

    while state not in _SESSION_FINAL_STATES:
        time.sleep(athena_session_wait_polling_delay)
        response = client_athena.get_session_status(SessionId=session_id)
        state = response["Status"]["State"]
    _logger.debug("Session state: %s", state)
    _logger.debug("Session state change reason: %s", response["Status"].get("StateChangeReason"))
    if state in ["FAILED", "DEGRADED", "TERMINATED"]:
        raise exceptions.SessionFailed(response["Status"].get("StateChangeReason"))
    return response


def _wait_calculation_execution(
    calculation_execution_id: str,
    boto3_session: Optional[boto3.Session] = None,
    athena_calculation_execution_wait_polling_delay: float = _CALCULATION_EXECUTION_WAIT_POLLING_DELAY,
) -> "GetCalculationExecutionStatusResponseTypeDef":
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    response: "GetCalculationExecutionStatusResponseTypeDef" = client_athena.get_calculation_execution_status(
        CalculationExecutionId=calculation_execution_id
    )
    state: str = response["Status"]["State"]

    while state not in _CALCULATION_EXECUTION_FINAL_STATES:
        time.sleep(athena_calculation_execution_wait_polling_delay)
        response = client_athena.get_calculation_execution_status(CalculationExecutionId=calculation_execution_id)
        state = response["Status"]["State"]
    _logger.debug("Calculation execution state: %s", state)
    _logger.debug("Calculation execution state change reason: %s", response["Status"].get("StateChangeReason"))
    if state in ["CANCELED", "FAILED"]:
        raise exceptions.CalculationFailed(response["Status"].get("StateChangeReason"))
    return response


def _get_calculation_execution_results(
    calculation_execution_id: str,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    _wait_calculation_execution(
        calculation_execution_id=calculation_execution_id,
        boto3_session=boto3_session,
    )

    response: "GetCalculationExecutionResponseTypeDef" = client_athena.get_calculation_execution(
        CalculationExecutionId=calculation_execution_id,
    )
    return cast(Dict[str, Any], response)


def create_spark_session(
    workgroup: str,
    coordinator_dpu_size: int = 1,
    max_concurrent_dpus: int = 5,
    default_executor_dpu_size: int = 1,
    additional_configs: Optional[Dict[str, Any]] = None,
    idle_timeout: int = 15,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """
    Create a session and wait until it is ready to accept calculations.

    Parameters
    ----------
    workgroup : str
        Athena workgroup name. Must be Spark-enabled.
    coordinator_dpu_size : int, optional
        The number of DPUs to use for the coordinator. A coordinator is a special executor that orchestrates
        processing work and manages other executors in a notebook session. The default is 1.
    max_concurrent_dpus : int, optional
        The maximum number of DPUs that can run concurrently. The default is 5.
    default_executor_dpu_size : int, optional
        The default number of DPUs to use for executors. The default is 1.
    additional_configs : Dict[str, Any], optional
        Contains additional engine parameter mappings in the form of key-value pairs.
    idle_timeout : int, optional
        The idle timeout in minutes for the session. The default is 15.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if boto3_session receives None.

    Returns
    -------
    str
        Session id

    Examples
    --------
    >>> import awswrangler as wr
    >>> session_id = wr.athena.create_spark_session(workgroup="...", max_concurrent_dpus=10)

    """
    client_athena = _utils.client(service_name="athena", session=boto3_session)
    engine_configuration: "EngineConfigurationTypeDef" = {
        "CoordinatorDpuSize": coordinator_dpu_size,
        "MaxConcurrentDpus": max_concurrent_dpus,
        "DefaultExecutorDpuSize": default_executor_dpu_size,
    }
    if additional_configs:
        engine_configuration["AdditionalConfigs"] = additional_configs
    response = client_athena.start_session(
        WorkGroup=workgroup,
        EngineConfiguration=engine_configuration,
        SessionIdleTimeoutInMinutes=idle_timeout,
    )
    _logger.info("Session info:\n%s", response)
    session_id: str = response["SessionId"]
    # Wait for the session to reach IDLE state to be able to accept calculations
    _wait_session(
        session_id=session_id,
        boto3_session=boto3_session,
    )
    return session_id


def run_spark_calculation(
    code: str,
    workgroup: str,
    session_id: Optional[str] = None,
    coordinator_dpu_size: int = 1,
    max_concurrent_dpus: int = 5,
    default_executor_dpu_size: int = 1,
    additional_configs: Optional[Dict[str, Any]] = None,
    idle_timeout: int = 15,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """
    Execute Spark Calculation and wait for completion.

    Parameters
    ----------
    code : str
        A string that contains the code for the calculation.
    workgroup : str
        Athena workgroup name. Must be Spark-enabled.
    session_id : str, optional
        The session id. If not passed, a session will be started.
    coordinator_dpu_size : int, optional
        The number of DPUs to use for the coordinator. A coordinator is a special executor that orchestrates
        processing work and manages other executors in a notebook session. The default is 1.
    max_concurrent_dpus : int, optional
        The maximum number of DPUs that can run concurrently. The default is 5.
    default_executor_dpu_size : int, optional
        The default number of DPUs to use for executors. The default is 1.
    additional_configs : Dict[str, Any], optional
        Contains additional engine parameter mappings in the form of key-value pairs.
    idle_timeout : int, optional
        The idle timeout in minutes for the session. The default is 15.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Calculation response

    Examples
    --------
    >>> import awswrangler as wr
    >>> result = wr.athena.run_spark_calculation(
    ...     code="print(spark)",
    ...     workgroup="...",
    ... )

    """
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    session_id = (
        create_spark_session(
            workgroup=workgroup,
            coordinator_dpu_size=coordinator_dpu_size,
            max_concurrent_dpus=max_concurrent_dpus,
            default_executor_dpu_size=default_executor_dpu_size,
            additional_configs=additional_configs,
            idle_timeout=idle_timeout,
            boto3_session=boto3_session,
        )
        if not session_id
        else session_id
    )

    response = client_athena.start_calculation_execution(
        SessionId=session_id,
        CodeBlock=code,
    )
    _logger.info("Calculation execution info:\n%s", response)

    return _get_calculation_execution_results(
        calculation_execution_id=response["CalculationExecutionId"],
        boto3_session=boto3_session,
    )
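A minimal usage sketch of the two new functions (the workgroup name and configuration values below are placeholders, not taken from this commit): a session is created once and then reused across calculations by passing session_id.

import awswrangler as wr

# Placeholder workgroup: must be an existing Spark-enabled Athena workgroup.
session_id = wr.athena.create_spark_session(
    workgroup="my-spark-workgroup",
    max_concurrent_dpus=10,
    idle_timeout=30,  # minutes
)

# Passing session_id reuses the session; omitting it makes
# run_spark_calculation start a fresh session for the call.
result = wr.athena.run_spark_calculation(
    code="print(spark.version)",
    workgroup="my-spark-workgroup",
    session_id=session_id,
)
print(result["Status"]["State"])  # "COMPLETED" on success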

awswrangler/exceptions.py

Lines changed: 8 additions & 0 deletions
@@ -49,6 +49,14 @@ class QueryCancelled(Exception):
     """QueryCancelled exception."""


+class SessionFailed(Exception):
+    """SessionFailed exception."""
+
+
+class CalculationFailed(Exception):
+    """CalculationFailed exception."""
+
+
 class EmptyDataFrame(Exception):
     """EmptyDataFrame exception."""
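These exceptions are raised by the polling helpers in _spark.py when a session or calculation reaches a terminal failure state. A sketch of how a caller might handle them (the workgroup name is a placeholder):

import awswrangler as wr
from awswrangler import exceptions

try:
    result = wr.athena.run_spark_calculation(
        code="print(spark)",
        workgroup="my-spark-workgroup",  # placeholder
    )
except exceptions.SessionFailed as err:
    # Session ended in FAILED, DEGRADED, or TERMINATED state
    print(f"Spark session could not be used: {err}")
except exceptions.CalculationFailed as err:
    # Calculation ended in FAILED or CANCELED state
    print(f"Spark calculation did not complete: {err}")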

docs/source/api.rst

Lines changed: 2 additions & 0 deletions
@@ -121,6 +121,7 @@ Amazon Athena
    :toctree: stubs

    create_athena_bucket
+   create_spark_session
    create_ctas_table
    generate_create_query
    get_query_columns_types
@@ -133,6 +134,7 @@ Amazon Athena
    read_sql_query
    read_sql_table
    repair_table
+   run_spark_calculation
    start_query_execution
    stop_query_execution
    to_iceberg

test_infra/stacks/base_stack.py

Lines changed: 25 additions & 20 deletions
@@ -87,6 +87,16 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None:
             resource_arn=self.bucket.bucket_arn,
             use_service_linked_role=True,
         )
+        inline_lf_policies = {
+            "GetDataAccess": iam.PolicyDocument(
+                statements=[
+                    iam.PolicyStatement(
+                        actions=["lakeformation:GetDataAccess"],
+                        resources=["*"],
+                    ),
+                ]
+            ),
+        }
         glue_data_quality_role = iam.Role(
             self,
             "aws-sdk-pandas-glue-data-quality-role",
@@ -96,16 +106,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None:
                 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"),
                 iam.ManagedPolicy.from_aws_managed_policy_name("AWSGlueConsoleFullAccess"),
             ],
-            inline_policies={
-                "GetDataAccess": iam.PolicyDocument(
-                    statements=[
-                        iam.PolicyStatement(
-                            actions=["lakeformation:GetDataAccess"],
-                            resources=["*"],
-                        ),
-                    ]
-                ),
-            },
+            inline_policies=inline_lf_policies,
         )
         emr_serverless_exec_role = iam.Role(
             self,
@@ -116,16 +117,19 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None:
                 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"),
                 iam.ManagedPolicy.from_aws_managed_policy_name("AWSGlueConsoleFullAccess"),
             ],
-            inline_policies={
-                "GetDataAccess": iam.PolicyDocument(
-                    statements=[
-                        iam.PolicyStatement(
-                            actions=["lakeformation:GetDataAccess"],
-                            resources=["*"],
-                        ),
-                    ]
-                ),
-            },
+            inline_policies=inline_lf_policies,
+        )
+        athena_spark_exec_role = iam.Role(
+            self,
+            "aws-sdk-pandas-athena-spark-exec-role",
+            role_name="AthenaSparkExecutionRole",
+            assumed_by=iam.ServicePrincipal("athena.amazonaws.com"),
+            managed_policies=[
+                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"),
+                iam.ManagedPolicy.from_aws_managed_policy_name("AWSGlueConsoleFullAccess"),
+                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonAthenaFullAccess"),
+            ],
+            inline_policies=inline_lf_policies,
         )
         glue_db = glue.Database(
             self,
@@ -199,6 +203,7 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs: str) -> None:
         CfnOutput(self, "GlueDatabaseName", value=glue_db.database_name)
         CfnOutput(self, "GlueDataQualityRole", value=glue_data_quality_role.role_arn)
         CfnOutput(self, "EMRServerlessExecutionRoleArn", value=emr_serverless_exec_role.role_arn)
+        CfnOutput(self, "AthenaSparkExecutionRoleArn", value=athena_spark_exec_role.role_arn)
         CfnOutput(self, "LogGroupName", value=log_group.log_group_name)
         CfnOutput(self, "LogStream", value=log_stream.log_stream_name)

tests/unit/test_athena_spark.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
import pytest

import awswrangler as wr
from tests._utils import create_workgroup


@pytest.fixture(scope="session")
def athena_spark_execution_role_arn(cloudformation_outputs):
    return cloudformation_outputs["AthenaSparkExecutionRoleArn"]


@pytest.fixture(scope="session")
def workgroup_spark(bucket, kms_key, athena_spark_execution_role_arn):
    return create_workgroup(
        wkg_name="aws_sdk_pandas_spark",
        config={
            "EngineVersion": {
                "SelectedEngineVersion": "PySpark engine version 3",
            },
            "ExecutionRole": athena_spark_execution_role_arn,
            "ResultConfiguration": {"OutputLocation": f"s3://{bucket}/athena_workgroup_spark/"},
        },
    )


@pytest.mark.parametrize(
    "code",
    [
        "print(spark)",
        """
input_path = "s3://athena-examples-us-east-1/notebooks/yellow_tripdata_2016-01.parquet"
output_path = "$PATH"

taxi_df = spark.read.format("parquet").load(input_path)

taxi_passenger_counts = taxi_df.groupBy("VendorID", "passenger_count").count()
taxi_passenger_counts.coalesce(1).write.mode('overwrite').csv(output_path)
""",
    ],
)
def test_athena_spark_calculation(code, path, workgroup_spark):
    code = code.replace("$PATH", path)

    result = wr.athena.run_spark_calculation(
        code=code,
        workgroup=workgroup_spark,
    )

    assert result["Status"]["State"] == "COMPLETED"
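The test relies on the create_workgroup helper and the CDK stack above; outside that infrastructure, a Spark-enabled workgroup could be created directly with boto3. A sketch under assumed placeholder values (account id, role name, and bucket are illustrative):

import boto3

athena = boto3.client("athena")

# Placeholders: the execution role ARN and output bucket are illustrative only.
athena.create_work_group(
    Name="aws_sdk_pandas_spark",
    Configuration={
        "EngineVersion": {"SelectedEngineVersion": "PySpark engine version 3"},
        "ExecutionRole": "arn:aws:iam::123456789012:role/AthenaSparkExecutionRole",
        "ResultConfiguration": {"OutputLocation": "s3://my-bucket/athena_workgroup_spark/"},
    },
)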
