From 22293b46db9b49c90ae5adcd27764b0ae9e8efdb Mon Sep 17 00:00:00 2001
From: star-yar
Date: Fri, 14 Mar 2025 15:44:53 -0400
Subject: [PATCH 1/5] Extend error message

Signed-off-by: star-yar
---
 .../kedro_datasets/_utils/spark_utils.py      | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/kedro-datasets/kedro_datasets/_utils/spark_utils.py b/kedro-datasets/kedro_datasets/_utils/spark_utils.py
index e55012275..048a1a409 100644
--- a/kedro-datasets/kedro_datasets/_utils/spark_utils.py
+++ b/kedro-datasets/kedro_datasets/_utils/spark_utils.py
@@ -19,7 +19,23 @@ def get_spark() -> Union[SparkSession, "DatabricksSession"]:
         from databricks.connect import DatabricksSession
 
         # We can't test this as there's no Databricks test env available
-        spark = DatabricksSession.builder.getOrCreate()  # pragma: no cover
+        try:
+            spark = DatabricksSession.builder.getOrCreate()  # pragma: no cover
+        # This can't be narrowed down, since databricks-connect raises a plain Exception
+        except Exception as e:
+            error_message = str(e)
+            if (
+                error_message
+                == "Cluster id or serverless are required but were not specified."
+            ):
+                raise type(e)(
+                    "DatabricksSession is expected to behave as a singleton, but it didn't. "
+                    "Set either the DATABRICKS_CONFIG_PROFILE env variable, or both DATABRICKS_PROFILE "
+                    "and DATABRICKS_SERVERLESS_COMPUTE_ID, in your hooks prior to using the Spark session. "
+                    "Read more about these variables here: "
+                    "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var"
+                ) from e
+            pass
 
     except ImportError:
         # For "normal" spark sessions that don't use databricks-connect

From 25a48779217cbaf07276dcc5bca543f08f94b612 Mon Sep 17 00:00:00 2001
From: star-yar
Date: Fri, 14 Mar 2025 15:59:20 -0400
Subject: [PATCH 2/5] Update RELEASE.md

Signed-off-by: star-yar
---
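Note (illustrative, not applied by this patch): a minimal sketch of how a
project could export these variables from a Kedro hook before any dataset
touches Spark. The hook class, the profile values, and the choice of
after_context_created as the registration point are assumptions for the
example; only the environment variable names come from the error message
added in PATCH 1/5.

    # src/<package>/hooks.py -- illustrative sketch only
    import os

    from kedro.framework.hooks import hook_impl


    class DatabricksConnectHooks:
        """Point databricks-connect at a compute target before get_spark() runs."""

        @hook_impl
        def after_context_created(self, context) -> None:
            # Option 1: a single config profile that already pins a cluster id.
            os.environ.setdefault("DATABRICKS_CONFIG_PROFILE", "my-profile")
            # Option 2 (serverless; assumed values):
            # os.environ.setdefault("DATABRICKS_PROFILE", "my-profile")
            # os.environ.setdefault("DATABRICKS_SERVERLESS_COMPUTE_ID", "auto")

    # Registered in settings.py with: HOOKS = (DatabricksConnectHooks(),)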
""" try: - # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2) - # the remote session is instantiated using the databricks module - # If the databricks-connect module is installed, we use a remote session - from databricks.connect import DatabricksSession - # We can't test this as there's no Databricks test env available - try: - spark = DatabricksSession.builder.getOrCreate() # pragma: no cover - # this can't be narrowed down since databricks-connect throws error of Exception type - except Exception as e: - error_message = str(e) - if ( - error_message - == "Cluster id or serverless are required but were not specified." - ): - raise type(e)( - "DatabricksSession is expected to behave as singleton but it didn't. " - "Either set up DATABRICKS_CONFIG_PROFILE or DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID " - "env variables in your hooks prior to using the spark session. " - "Read more about these variables here: " - "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var" - ) from e - pass + spark = _create_databricks_session() # pragma: no cover except ImportError: # For "normal" spark sessions that don't use databricks-connect @@ -43,3 +17,23 @@ def get_spark() -> Union[SparkSession, "DatabricksSession"]: spark = SparkSession.builder.getOrCreate() return spark + + +def _create_databricks_session() -> SparkSession: + # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2) + # the remote session is instantiated using the databricks module + # If the databricks-connect module is installed, we use a remote session + from databricks.connect import DatabricksSession + + try: + return DatabricksSession.builder.getOrCreate() + # this can't be narrowed down since databricks-connect throws error of Exception type + except Exception as exception: + if str(exception) == "Cluster id or serverless are required but were not specified.": + raise type(exception)( + "DatabricksSession is expected to behave as singleton but it didn't. " + "Either set up DATABRICKS_CONFIG_PROFILE or DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID " + "env variables in your hooks prior to using the spark session. 
" + "Read more about these variables here: " + "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var" + ) from exception From b962af8b98f9b282881b5151637f782b427533f4 Mon Sep 17 00:00:00 2001 From: star-yar Date: Fri, 14 Mar 2025 18:14:08 -0400 Subject: [PATCH 4/5] Add missing re-raise Signed-off-by: star-yar --- kedro-datasets/kedro_datasets/_utils/spark_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/kedro_datasets/_utils/spark_utils.py b/kedro-datasets/kedro_datasets/_utils/spark_utils.py index f6b61f8ac..d4d6ee33d 100644 --- a/kedro-datasets/kedro_datasets/_utils/spark_utils.py +++ b/kedro-datasets/kedro_datasets/_utils/spark_utils.py @@ -37,3 +37,4 @@ def _create_databricks_session() -> SparkSession: "Read more about these variables here: " "https://docs.databricks.com/aws/en/dev-tools/databricks-connect/cluster-config#config-profile-env-var" ) from exception + raise exception From 1f6f18373194f5664dc920e8f11d2f0666be3e8e Mon Sep 17 00:00:00 2001 From: star-yar Date: Fri, 14 Mar 2025 18:20:54 -0400 Subject: [PATCH 5/5] Format Signed-off-by: star-yar --- kedro-datasets/kedro_datasets/_utils/spark_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/_utils/spark_utils.py b/kedro-datasets/kedro_datasets/_utils/spark_utils.py index d4d6ee33d..4ff079354 100644 --- a/kedro-datasets/kedro_datasets/_utils/spark_utils.py +++ b/kedro-datasets/kedro_datasets/_utils/spark_utils.py @@ -29,7 +29,10 @@ def _create_databricks_session() -> SparkSession: return DatabricksSession.builder.getOrCreate() # this can't be narrowed down since databricks-connect throws error of Exception type except Exception as exception: - if str(exception) == "Cluster id or serverless are required but were not specified.": + if ( + str(exception) + == "Cluster id or serverless are required but were not specified." + ): raise type(exception)( "DatabricksSession is expected to behave as singleton but it didn't. " "Either set up DATABRICKS_CONFIG_PROFILE or DATABRICKS_PROFILE and DATABRICKS_SERVERLESS_COMPUTE_ID "