-from typing import TYPE_CHECKING, Union
-
 from pyspark.sql import SparkSession
 
-if TYPE_CHECKING:
-    from databricks.connect import DatabricksSession
-
-
-def get_spark() -> Union[SparkSession, "DatabricksSession"]:
+def get_spark() -> SparkSession:
     """
     Returns the SparkSession. In case databricks-connect is available, we use it
     for extended configuration mechanisms and notebook compatibility;
     otherwise we use classic pyspark.
     """
     try:
-        # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2)
-        # the remote session is instantiated using the databricks module
-        # If the databricks-connect module is installed, we use a remote session
-        from databricks.connect import DatabricksSession
-
         # We can't test this as there's no Databricks test env available
-        try:
-            spark = DatabricksSession.builder.getOrCreate()  # pragma: no cover
-        # this can't be narrowed down since databricks-connect throws error of Exception type
-        except Exception as e:
-            error_message = str(e)
-            if (
-                error_message
-                == "Cluster id or serverless are required but were not specified."
-            ):
-                raise type(e)(
-                    "DatabricksSession is expected to behave as singleton but it didn't. "
-                    "Either set up DATABRICKS_CONFIG_PROFILE or DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID "
-                    "env variables in your hooks prior to using the spark session. "
-                    "Read more about these variables here: "
-                    "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var"
-                ) from e
-            pass
+        spark = _create_databricks_session()  # pragma: no cover
 
     except ImportError:
         # For "normal" spark sessions that don't use databricks-connect
         # we get spark normally
         spark = SparkSession.builder.getOrCreate()
 
     return spark
+
+
+def _create_databricks_session() -> SparkSession:
+    # When using databricks-connect >= 13.0.0 (a.k.a. databricks-connect-v2),
+    # the remote session is instantiated via the databricks module.
+    # If the databricks-connect module is installed, we use a remote session.
+    from databricks.connect import DatabricksSession
+
+    try:
+        return DatabricksSession.builder.getOrCreate()
+    # this can't be narrowed down, since databricks-connect raises errors of the plain Exception type
+    except Exception as exception:
+        if str(exception) == "Cluster id or serverless are required but were not specified.":
+            raise type(exception)(
+                "DatabricksSession is expected to behave as a singleton but it didn't. "
+                "Either set up DATABRICKS_CONFIG_PROFILE or DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID "
+                "env variables in your hooks prior to using the spark session. "
+                "Read more about these variables here: "
+                "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var"
+            ) from exception
+        # anything else is unexpected, so re-raise it unchanged
+        raise
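
For context, a minimal usage sketch (not part of the commit): it assumes a hypothetical import path for get_spark and uses the env variables named in the error message above. Per the Databricks docs linked there, DATABRICKS_SERVERLESS_COMPUTE_ID can be set to "auto" to target serverless compute.

import os

# Hypothetical import path; adjust to wherever get_spark() is defined.
from project.utils.spark import get_spark

# Assumption: configure databricks-connect before the first session lookup,
# e.g. in a startup hook, since getOrCreate() reads these on first call.
os.environ.setdefault("DATABRICKS_CONFIG_PROFILE", "DEFAULT")
os.environ.setdefault("DATABRICKS_SERVERLESS_COMPUTE_ID", "auto")

# Remote DatabricksSession if databricks-connect is importable,
# otherwise a classic SparkSession via plain pyspark.
spark = get_spark()
print(spark.range(3).count())

Setting the variables before the first call matters because DatabricksSession.builder.getOrCreate() resolves its target compute once and then behaves as a singleton, which is exactly the failure mode the translated error message warns about.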