+from typing import TYPE_CHECKING, Union
+
 from pyspark.sql import SparkSession
 
+if TYPE_CHECKING:
+    from databricks.connect import DatabricksSession
+
 
-def get_spark() -> SparkSession:
+def get_spark() -> Union[SparkSession, "DatabricksSession"]:
     """
     Returns the SparkSession. If databricks-connect is available, we use it for
     its extended configuration mechanisms and notebook compatibility;
     otherwise we use classic pyspark.
     """
     try:
+        # When using databricks-connect >= 13.0.0 (a.k.a. databricks-connect-v2),
+        # the remote session is instantiated using the databricks module.
+        # If the databricks-connect module is installed, we use a remote session.
+        from databricks.connect import DatabricksSession
+
         # We can't test this as there's no Databricks test env available
-        spark = _create_databricks_session()  # pragma: no cover
+        try:
+            spark = DatabricksSession.builder.getOrCreate()  # pragma: no cover
+        # This can't be narrowed down, since databricks-connect raises errors of plain Exception type
+        except Exception as e:
+            error_message = str(e)
+            if (
+                error_message
+                == "Cluster id or serverless are required but were not specified."
+            ):
+                raise type(e)(
+                    "DatabricksSession is expected to behave as a singleton, but it didn't. "
+                    "Either set the DATABRICKS_CONFIG_PROFILE env variable, or the "
+                    "DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID env variables, "
+                    "in your hooks prior to using the Spark session. "
+                    "Read more about these variables here: "
+                    "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var"
+                ) from e
+            raise
 
     except ImportError:
         # For "normal" spark sessions that don't use databricks-connect
         # we get spark normally
         spark = SparkSession.builder.getOrCreate()
 
     return spark
-
-
-def _create_databricks_session() -> SparkSession:
-    # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2)
-    # the remote session is instantiated using the databricks module
-    # If the databricks-connect module is installed, we use a remote session
-    from databricks.connect import DatabricksSession
-
-    try:
-        return DatabricksSession.builder.getOrCreate()
-    # this can't be narrowed down since databricks-connect throws error of Exception type
-    except Exception as exception:
-        if str(exception) == "Cluster id or serverless are required but were not specified.":
-            raise type(exception)(
-                "DatabricksSession is expected to behave as singleton but it didn't. "
-                "Either set up DATABRICKS_CONFIG_PROFILE or DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID "
-                "env variables in your hooks prior to using the spark session. "
-                "Read more about these variables here: "
-                "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var"
-            ) from exception
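For reference, a minimal sketch of how a caller might exercise the databricks-connect path against serverless compute. The env variables are the ones named in the error message above; the profile name and the import path of `get_spark` are placeholders, not part of this diff:

```python
import os

# Placeholder profile name; "auto" delegates serverless compute selection
# to databricks-connect (per the Databricks docs linked in the error message).
os.environ["DATABRICKS_PROFILE"] = "my-profile"
os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto"

# Hypothetical import path -- adjust to wherever get_spark() lives.
from my_project.spark_session import get_spark

spark = get_spark()  # DatabricksSession if databricks-connect is installed
spark.range(5).show()
```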
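The new except block relies on `raise type(e)(better_message) from e`: it rebuilds an exception of the same class with a more actionable message while chaining the original as `__cause__`, and the final bare `raise` propagates anything it doesn't recognize instead of leaving `spark` unbound. A self-contained sketch of that pattern, with a stand-in for the failing builder call:

```python
MESSAGE = "Cluster id or serverless are required but were not specified."

def risky() -> None:
    # Stand-in for DatabricksSession.builder.getOrCreate() failing.
    raise ValueError(MESSAGE)

try:
    risky()
except Exception as e:
    if str(e) == MESSAGE:
        # Same exception class, clearer guidance; the original error
        # stays attached as __cause__ and shows up in the traceback.
        raise type(e)("Set your Databricks profile env variables first.") from e
    raise  # propagate unrecognized errors instead of swallowing them
```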