
Commit d3d42c0

Make ucx pylsp plugin configurable (#2280)
## Changes

Make the LSP linter plugin configurable with cluster information. The configuration can be provided either in a file or by a client; its provisioning is handled by the pylsp infrastructure. The Spark Connect linter is now applied only to UC Shared clusters, since Single-User clusters run in Spark Classic mode.

### Tests

- [x] manually tested
- [x] added unit tests
- [ ] added integration tests
- [ ] verified on staging environment (screenshot attached)
1 parent 46f4239 commit d3d42c0
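For orientation, the plugin reads its settings through pylsp's configuration machinery. The sketch below shows what a client might send: the inner key names (`dataSecurityMode`, `dbrVersion`, `isServerless`, `migration_index`) and the migration-status fields come from the diffs below, while the nesting under `pylsp.plugins.pylsp_ucx` and the concrete values are illustrative assumptions:

```python
# Hypothetical client-side settings for the plugin; only the inner key names
# are taken from lsp_plugin.py below, everything else is illustrative.
plugin_settings = {
    "pylsp": {
        "plugins": {
            "pylsp_ucx": {
                "dataSecurityMode": "USER_ISOLATION",  # UC Shared cluster
                "dbrVersion": "14.3",
                "isServerless": False,
                "migration_index": [
                    {
                        "src_schema": "default",
                        "src_table": "orders",
                        "dst_catalog": "main",
                        "dst_schema": "default",
                        "dst_table": "orders",
                    },
                ],
            },
        },
    },
}
```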

15 files changed, +240 -35 lines


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ pylsp = [
 runtime = "databricks.labs.ucx.runtime:main"

 [project.entry-points.pylsp]
-plugin = "databricks.labs.ucx.source_code.lsp_plugin"
+pylsp_ucx = "databricks.labs.ucx.source_code.lsp_plugin"

 [project.urls]
 Issues = "https://github.com/databricks/ucx/issues"
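Since pylsp discovers plugins through the `pylsp` entry-point group declared here, one quick way to confirm the renamed entry point is visible is a sketch like this, assuming ucx is installed in the same environment as pylsp on Python 3.10+:

```python
# List the pylsp plugin entry points and check that 'pylsp_ucx' is among them.
from importlib.metadata import entry_points

names = [ep.name for ep in entry_points(group="pylsp")]
print(names)  # expect 'pylsp_ucx' in the list
```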

src/databricks/labs/ucx/hive_metastore/migration_status.py

Lines changed: 11 additions & 0 deletions
@@ -26,6 +26,17 @@ class MigrationStatus:
     def destination(self):
         return f"{self.dst_catalog}.{self.dst_schema}.{self.dst_table}".lower()

+    @classmethod
+    def from_json(cls, raw: dict[str, str]) -> "MigrationStatus":
+        return cls(
+            src_schema=raw['src_schema'],
+            src_table=raw['src_table'],
+            dst_catalog=raw.get('dst_catalog', None),
+            dst_schema=raw.get('dst_schema', None),
+            dst_table=raw.get('dst_table', None),
+            update_ts=raw.get('update_ts', None),
+        )
+

 @dataclass(frozen=True)
 class TableView:
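A short usage sketch for the new `from_json` helper; the payload mirrors one entry of the `migration_index` list the LSP plugin receives, with made-up table names:

```python
from databricks.labs.ucx.hive_metastore.migration_status import MigrationStatus

# Hypothetical migration-index entry: only src_schema and src_table are required,
# the dst_* and update_ts fields default to None when absent.
raw = {
    "src_schema": "default",
    "src_table": "orders",
    "dst_catalog": "main",
    "dst_schema": "default",
    "dst_table": "orders",
}
status = MigrationStatus.from_json(raw)
print(status.destination())  # main.default.orders
```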

src/databricks/labs/ucx/source_code/base.py

Lines changed: 9 additions & 1 deletion
@@ -187,11 +187,19 @@ def from_json(cls, json: dict) -> CurrentSessionState:
             catalog=json.get('catalog', DEFAULT_CATALOG),
             spark_conf=json.get('spark_conf', None),
             named_parameters=json.get('named_parameters', None),
-            data_security_mode=json.get('data_security_mode', None),
+            data_security_mode=cls.parse_security_mode(json.get('data_security_mode', None)),
             is_serverless=json.get('is_serverless', False),
             dbr_version=tuple(json['dbr_version']) if 'dbr_version' in json else None,
         )

+    @staticmethod
+    def parse_security_mode(mode_str: str | None) -> compute.DataSecurityMode | None:
+        try:
+            return compute.DataSecurityMode(mode_str) if mode_str else None
+        except ValueError:
+            logger.warning(f'Unknown data_security_mode {mode_str}')
+            return None
+

 class SequentialLinter(Linter):
     def __init__(self, linters: list[Linter]):
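The new `parse_security_mode` helper maps the raw string onto the SDK enum and degrades gracefully; a small sketch of the expected behaviour, assuming ucx and the Databricks SDK are installed:

```python
from databricks.sdk.service import compute
from databricks.labs.ucx.source_code.base import CurrentSessionState

# Known mode strings resolve to the enum member used by the linters.
assert CurrentSessionState.parse_security_mode("USER_ISOLATION") is compute.DataSecurityMode.USER_ISOLATION
# Missing or unrecognised values fall back to None (the latter logs a warning).
assert CurrentSessionState.parse_security_mode(None) is None
assert CurrentSessionState.parse_security_mode("NOT_A_MODE") is None
```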

src/databricks/labs/ucx/source_code/linters/spark_connect.py

Lines changed: 6 additions & 0 deletions
@@ -9,6 +9,8 @@
     PythonLinter,
     CurrentSessionState,
 )
+from databricks.sdk.service.compute import DataSecurityMode
+
 from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeHelper

@@ -238,6 +240,10 @@ def lint(self, node: NodeNG) -> Iterator[Advice]:

 class SparkConnectLinter(PythonLinter):
     def __init__(self, session_state: CurrentSessionState):
+        if session_state.data_security_mode != DataSecurityMode.USER_ISOLATION:
+            self._matchers = []
+            return
+
         self._matchers = [
             JvmAccessMatcher(session_state=session_state),
             RDDApiMatcher(session_state=session_state),
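With this gate, the Spark Connect matchers only run when the session reports a UC Shared (`USER_ISOLATION`) cluster; in other modes the linter yields nothing. A sketch of the difference, with imports assuming the ucx source tree:

```python
from databricks.sdk.service.compute import DataSecurityMode
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.linters.spark_connect import SparkConnectLinter

code = "spark._jspark._jvm.com.my.custom.Name()"

shared = SparkConnectLinter(CurrentSessionState(data_security_mode=DataSecurityMode.USER_ISOLATION))
single_user = SparkConnectLinter(CurrentSessionState(data_security_mode=DataSecurityMode.SINGLE_USER))

assert list(shared.lint(code))           # JVM access is flagged on UC Shared clusters
assert not list(single_user.lint(code))  # no matchers registered, so no advice
```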

src/databricks/labs/ucx/source_code/lsp_plugin.py

Lines changed: 32 additions & 4 deletions

@@ -1,16 +1,44 @@
+import logging
+from packaging import version
+
 from pylsp import hookimpl  # type: ignore
-from pylsp.workspace import Document, Workspace  # type: ignore
+from pylsp.config.config import Config  # type: ignore
+from pylsp.workspace import Document  # type: ignore
 from databricks.sdk.service.workspace import Language

+from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex, MigrationStatus
+from databricks.labs.ucx.source_code.base import CurrentSessionState
 from databricks.labs.ucx.source_code.linters.context import LinterContext
 from databricks.labs.ucx.source_code.lsp import Diagnostic


+logger = logging.getLogger(__name__)
+
+
 @hookimpl
-def pylsp_lint(workspace: Workspace, document: Document) -> list[dict]:  # pylint: disable=unused-argument
-    # TODO: initialize migration index and session state from config / env variables
-    languages = LinterContext(index=None, session_state=None)
+def pylsp_lint(config: Config, document: Document) -> list[dict]:
+    cfg = config.plugin_settings('pylsp_ucx', document_path=document.uri)
+
+    migration_index = MigrationIndex([MigrationStatus.from_json(st) for st in cfg.get('migration_index', [])])
+
+    session_state = CurrentSessionState(
+        data_security_mode=CurrentSessionState.parse_security_mode(cfg.get('dataSecurityMode', None)),
+        dbr_version=parse_dbr_version(cfg.get('dbrVersion', None)),
+        is_serverless=bool(cfg.get('isServerless', False)),
+    )
+    languages = LinterContext(index=migration_index, session_state=session_state)
     analyser = languages.linter(Language.PYTHON)
     code = document.source
     diagnostics = [Diagnostic.from_advice(_) for _ in analyser.lint(code)]
     return [d.as_dict() for d in diagnostics]
+
+
+def parse_dbr_version(version_str: str | None) -> tuple[int, int] | None:
+    if not version_str:
+        return None
+    try:
+        release_version = version.parse(version_str).release
+        return release_version[0], release_version[1]
+    except version.InvalidVersion:
+        logger.warning(f'Incorrect DBR version string: {version_str}')
+        return None

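The DBR version string from the client is parsed with `packaging`; a quick sketch of the expected `parse_dbr_version` behaviour, assuming ucx is installed:

```python
from databricks.labs.ucx.source_code.lsp_plugin import parse_dbr_version

assert parse_dbr_version("14.3") == (14, 3)         # (major, minor) tuple used as dbr_version
assert parse_dbr_version(None) is None              # no version supplied
assert parse_dbr_version("not a version") is None   # invalid strings log a warning and yield None
```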
tests/unit/source_code/linters/test_spark_connect.py

Lines changed: 32 additions & 19 deletions
@@ -1,12 +1,21 @@
 from itertools import chain

+import pytest
+
+
 from databricks.labs.ucx.source_code.base import Failure, CurrentSessionState
 from databricks.labs.ucx.source_code.linters.python_ast import Tree
 from databricks.labs.ucx.source_code.linters.spark_connect import LoggingMatcher, SparkConnectLinter
+from databricks.sdk.service.compute import DataSecurityMode
+
+
+@pytest.fixture
+def session_state() -> CurrentSessionState:
+    return CurrentSessionState(data_security_mode=DataSecurityMode.USER_ISOLATION)


-def test_jvm_access_match_shared():
-    linter = SparkConnectLinter(CurrentSessionState())
+def test_jvm_access_match_shared(session_state):
+    linter = SparkConnectLinter(session_state)
     code = """
 spark.range(10).collect()
 spark._jspark._jvm.com.my.custom.Name()
@@ -25,8 +34,9 @@ def test_jvm_access_match_shared():
     assert actual == expected


-def test_jvm_access_match_serverless():
-    linter = SparkConnectLinter(CurrentSessionState(is_serverless=True))
+def test_jvm_access_match_serverless(session_state):
+    session_state.is_serverless = True
+    linter = SparkConnectLinter(session_state)
     code = """
 spark.range(10).collect()
 spark._jspark._jvm.com.my.custom.Name()
@@ -46,8 +56,8 @@ def test_jvm_access_match_serverless():
     assert actual == expected


-def test_rdd_context_match_shared():
-    linter = SparkConnectLinter(CurrentSessionState())
+def test_rdd_context_match_shared(session_state):
+    linter = SparkConnectLinter(session_state)
     code = """
 rdd1 = sc.parallelize([1, 2, 3])
 rdd2 = spark.createDataFrame(sc.emptyRDD(), schema)
@@ -90,8 +100,9 @@ def test_rdd_context_match_shared():
     assert actual == expected


-def test_rdd_context_match_serverless():
-    linter = SparkConnectLinter(CurrentSessionState(is_serverless=True))
+def test_rdd_context_match_serverless(session_state):
+    session_state.is_serverless = True
+    linter = SparkConnectLinter(session_state)
     code = """
 rdd1 = sc.parallelize([1, 2, 3])
 rdd2 = spark.createDataFrame(sc.emptyRDD(), schema)
@@ -132,8 +143,8 @@ def test_rdd_context_match_serverless():
     ] == list(linter.lint(code))


-def test_rdd_map_partitions():
-    linter = SparkConnectLinter(CurrentSessionState())
+def test_rdd_map_partitions(session_state):
+    linter = SparkConnectLinter(session_state)
     code = """
 df = spark.createDataFrame([])
 df.rdd.mapPartitions(myUdf)
@@ -152,8 +163,8 @@ def test_rdd_map_partitions():
     assert actual == expected


-def test_conf_shared():
-    linter = SparkConnectLinter(CurrentSessionState())
+def test_conf_shared(session_state):
+    linter = SparkConnectLinter(session_state)
     code = """df.sparkContext.getConf().get('spark.my.conf')"""
     assert [
         Failure(
@@ -167,8 +178,9 @@ def test_conf_shared():
     ] == list(linter.lint(code))


-def test_conf_serverless():
-    linter = SparkConnectLinter(CurrentSessionState(is_serverless=True))
+def test_conf_serverless(session_state):
+    session_state.is_serverless = True
+    linter = SparkConnectLinter(session_state)
     code = """sc._conf().get('spark.my.conf')"""
     expected = [
         Failure(
@@ -184,8 +196,8 @@ def test_conf_serverless():
     assert actual == expected


-def test_logging_shared():
-    logging_matcher = LoggingMatcher(CurrentSessionState())
+def test_logging_shared(session_state):
+    logging_matcher = LoggingMatcher(session_state)
     code = """
 sc.setLogLevel("INFO")
 setLogLevel("WARN")
@@ -225,8 +237,9 @@ def test_logging_shared():
     ] == list(chain.from_iterable([logging_matcher.lint(node) for node in Tree.parse(code).walk()]))


-def test_logging_serverless():
-    logging_matcher = LoggingMatcher(CurrentSessionState(is_serverless=True))
+def test_logging_serverless(session_state):
+    session_state.is_serverless = True
+    logging_matcher = LoggingMatcher(session_state)
     code = """
 sc.setLogLevel("INFO")
 log4jLogger = sc._jvm.org.apache.log4j
@@ -255,7 +268,7 @@ def test_logging_serverless():


 def test_valid_code():
-    linter = SparkConnectLinter(CurrentSessionState())
+    linter = SparkConnectLinter(CurrentSessionState(data_security_mode=DataSecurityMode.USER_ISOLATION))
     code = """
 df = spark.range(10)
 df.collect()

tests/unit/source_code/samples/functional/spark-connect/catalog-api_13_3.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# ucx[session-state] {"dbr_version": [13, 3]}
+# ucx[session-state] {"dbr_version": [13, 3], "data_security_mode": "USER_ISOLATION"}
 # ucx[catalog-api-in-shared-clusters:+1:0:+1:13] spark.catalog functions require DBR 14.3 LTS or above on UC Shared Clusters
 spark.catalog.tableExists("table")
 # ucx[catalog-api-in-shared-clusters:+1:0:+1:13] spark.catalog functions require DBR 14.3 LTS or above on UC Shared Clusters

tests/unit/source_code/samples/functional/spark-connect/catalog-api_14_3.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# ucx[session-state] {"dbr_version": [14, 3]}
+# ucx[session-state] {"dbr_version": [14, 3], "data_security_mode": "USER_ISOLATION"}
 spark.catalog.tableExists("table")
 spark.catalog.listDatabases()

tests/unit/source_code/samples/functional/spark-connect/command-context.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+# ucx[session-state] {"data_security_mode": "USER_ISOLATION"}
 # ucx[to-json-in-shared-clusters:+1:6:+1:80] toJson() is not available on UC Shared Clusters. Use toSafeJson() on DBR 13.3 LTS or above to get a subset of command context information.
 print(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson())
 dbutils.notebook.entry_point.getDbutils().notebook().getContext().toSafeJson()

tests/unit/source_code/samples/functional/spark-connect/jvm-access.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+# ucx[session-state] {"data_security_mode": "USER_ISOLATION"}
 spark.range(10).collect()
 # ucx[jvm-access-in-shared-clusters:+1:0:+1:18] Cannot access Spark Driver JVM on UC Shared Clusters
 spark._jspark._jvm.com.my.custom.Name()
