|
| 1 | +"""Apache Spark on Amazon Athena Module.""" |
| 2 | +# pylint: disable=too-many-lines |
| 3 | +import logging |
| 4 | +import time |
| 5 | +from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast |
| 6 | + |
| 7 | +import boto3 |
| 8 | + |
| 9 | +from awswrangler import _utils, exceptions |
| 10 | + |
| 11 | +_logger: logging.Logger = logging.getLogger(__name__) |
| 12 | + |
| 13 | +if TYPE_CHECKING: |
| 14 | + from mypy_boto3_athena.type_defs import ( |
| 15 | + EngineConfigurationTypeDef, |
| 16 | + GetCalculationExecutionResponseTypeDef, |
| 17 | + GetCalculationExecutionStatusResponseTypeDef, |
| 18 | + GetSessionStatusResponseTypeDef, |
| 19 | + ) |
| 20 | + |
| 21 | +_SESSION_FINAL_STATES: List[str] = ["IDLE", "TERMINATED", "DEGRADED", "FAILED"] |
| 22 | +_CALCULATION_EXECUTION_FINAL_STATES: List[str] = ["COMPLETED", "FAILED", "CANCELED"] |
| 23 | +_SESSION_WAIT_POLLING_DELAY: float = 5.0 # SECONDS |
| 24 | +_CALCULATION_EXECUTION_WAIT_POLLING_DELAY: float = 5.0 # SECONDS |
| 25 | + |
| 26 | + |
def _wait_session(
    session_id: str,
    boto3_session: Optional[boto3.Session] = None,
    athena_session_wait_polling_delay: float = _SESSION_WAIT_POLLING_DELAY,
) -> "GetSessionStatusResponseTypeDef":
    """Block until the Athena session reaches a final state, raising if it did not end up usable."""
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    status_response: "GetSessionStatusResponseTypeDef" = client_athena.get_session_status(SessionId=session_id)
    current_state: str = status_response["Status"]["State"]

    # Poll while the session is still transitioning (e.g. being created) toward a final state.
    while current_state not in _SESSION_FINAL_STATES:
        time.sleep(athena_session_wait_polling_delay)
        status_response = client_athena.get_session_status(SessionId=session_id)
        current_state = status_response["Status"]["State"]
    _logger.debug("Session state: %s", current_state)
    _logger.debug("Session state change reason: %s", status_response["Status"].get("StateChangeReason"))
    # Any final state other than IDLE means the session cannot accept calculations.
    if current_state in ["FAILED", "DEGRADED", "TERMINATED"]:
        raise exceptions.SessionFailed(status_response["Status"].get("StateChangeReason"))
    return status_response
| 46 | + |
| 47 | + |
def _wait_calculation_execution(
    calculation_execution_id: str,
    boto3_session: Optional[boto3.Session] = None,
    athena_calculation_execution_wait_polling_delay: float = _CALCULATION_EXECUTION_WAIT_POLLING_DELAY,
) -> "GetCalculationExecutionStatusResponseTypeDef":
    """Block until the calculation execution finishes, raising if it was canceled or failed."""
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    status_response: "GetCalculationExecutionStatusResponseTypeDef" = client_athena.get_calculation_execution_status(
        CalculationExecutionId=calculation_execution_id
    )
    current_state: str = status_response["Status"]["State"]

    # Keep polling until Athena reports a terminal state for the calculation.
    while True:
        if current_state in _CALCULATION_EXECUTION_FINAL_STATES:
            break
        time.sleep(athena_calculation_execution_wait_polling_delay)
        status_response = client_athena.get_calculation_execution_status(
            CalculationExecutionId=calculation_execution_id
        )
        current_state = status_response["Status"]["State"]
    _logger.debug("Calculation execution state: %s", current_state)
    _logger.debug("Calculation execution state change reason: %s", status_response["Status"].get("StateChangeReason"))
    # COMPLETED is the only successful terminal state.
    if current_state in ["CANCELED", "FAILED"]:
        raise exceptions.CalculationFailed(status_response["Status"].get("StateChangeReason"))
    return status_response
| 69 | + |
| 70 | + |
def _get_calculation_execution_results(
    calculation_execution_id: str,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """Wait for a calculation execution to finish, then fetch and return its full description."""
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    # Ensure the calculation has reached a terminal state before fetching results.
    _wait_calculation_execution(
        calculation_execution_id=calculation_execution_id,
        boto3_session=boto3_session,
    )

    execution_response: "GetCalculationExecutionResponseTypeDef" = client_athena.get_calculation_execution(
        CalculationExecutionId=calculation_execution_id,
    )
    return cast(Dict[str, Any], execution_response)
| 86 | + |
| 87 | + |
def create_spark_session(
    workgroup: str,
    coordinator_dpu_size: int = 1,
    max_concurrent_dpus: int = 5,
    default_executor_dpu_size: int = 1,
    additional_configs: Optional[Dict[str, Any]] = None,
    idle_timeout: int = 15,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """
    Create session and wait until ready to accept calculations.

    Parameters
    ----------
    workgroup : str
        Athena workgroup name. Must be Spark-enabled.
    coordinator_dpu_size : int, optional
        The number of DPUs to use for the coordinator. A coordinator is a special executor that orchestrates
        processing work and manages other executors in a notebook session. The default is 1.
    max_concurrent_dpus : int, optional
        The maximum number of DPUs that can run concurrently. The default is 5.
    default_executor_dpu_size: int, optional
        The default number of DPUs to use for executors. The default is 1.
    additional_configs : Dict[str, Any], optional
        Contains additional engine parameter mappings in the form of key-value pairs.
    idle_timeout : int, optional
        The idle timeout in minutes for the session. The default is 15.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    str
        Session id

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.create_spark_session(workgroup="...", max_concurrent_dpus=10)

    """
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    # Assemble the engine configuration; AdditionalConfigs is only included when provided.
    engine_configuration: "EngineConfigurationTypeDef" = {
        "CoordinatorDpuSize": coordinator_dpu_size,
        "MaxConcurrentDpus": max_concurrent_dpus,
        "DefaultExecutorDpuSize": default_executor_dpu_size,
    }
    if additional_configs:
        engine_configuration["AdditionalConfigs"] = additional_configs

    start_response = client_athena.start_session(
        WorkGroup=workgroup,
        EngineConfiguration=engine_configuration,
        SessionIdleTimeoutInMinutes=idle_timeout,
    )
    _logger.info("Session info:\n%s", start_response)
    session_id: str = start_response["SessionId"]

    # Block until the session reaches IDLE so it can accept calculations right away.
    _wait_session(
        session_id=session_id,
        boto3_session=boto3_session,
    )
    return session_id
| 150 | + |
| 151 | + |
def run_spark_calculation(
    code: str,
    workgroup: str,
    session_id: Optional[str] = None,
    coordinator_dpu_size: int = 1,
    max_concurrent_dpus: int = 5,
    default_executor_dpu_size: int = 1,
    additional_configs: Optional[Dict[str, Any]] = None,
    idle_timeout: int = 15,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Any]:
    """
    Execute Spark Calculation and wait for completion.

    Parameters
    ----------
    code : str
        A string that contains the code for the calculation.
    workgroup : str
        Athena workgroup name. Must be Spark-enabled.
    session_id : str, optional
        The session id. If not passed, a session will be started.
    coordinator_dpu_size : int, optional
        The number of DPUs to use for the coordinator. A coordinator is a special executor that orchestrates
        processing work and manages other executors in a notebook session. The default is 1.
    max_concurrent_dpus : int, optional
        The maximum number of DPUs that can run concurrently. The default is 5.
    default_executor_dpu_size: int, optional
        The default number of DPUs to use for executors. The default is 1.
    additional_configs : Dict[str, Any], optional
        Contains additional engine parameter mappings in the form of key-value pairs.
    idle_timeout : int, optional
        The idle timeout in minutes for the session. The default is 15.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, Any]
        Calculation response

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.run_spark_calculation(
    ...     code="print(spark)",
    ...     workgroup="...",
    ...     )

    """
    client_athena = _utils.client(service_name="athena", session=boto3_session)

    # Bootstrap a fresh session when the caller did not supply one.
    if not session_id:
        session_id = create_spark_session(
            workgroup=workgroup,
            coordinator_dpu_size=coordinator_dpu_size,
            max_concurrent_dpus=max_concurrent_dpus,
            default_executor_dpu_size=default_executor_dpu_size,
            additional_configs=additional_configs,
            idle_timeout=idle_timeout,
            boto3_session=boto3_session,
        )

    start_response = client_athena.start_calculation_execution(
        SessionId=session_id,
        CodeBlock=code,
    )
    _logger.info("Calculation execution info:\n%s", start_response)

    # Wait for the calculation to finish and hand back its full description.
    return _get_calculation_execution_results(
        calculation_execution_id=start_response["CalculationExecutionId"],
        boto3_session=boto3_session,
    )
0 commit comments