
Commit 49dcb08

Deduplicate errors emitted by Spark Connect linter (#1824)
## Changes

This PR adds deduplication of errors emitted by the Spark Connect linter. It also adds more functional tests using the new framework.

### Functionality

- [ ] added relevant user documentation
- [ ] added new CLI command
- [ ] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [ ] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests

- [ ] manually tested
- [x] added unit tests
- [ ] added integration tests
- [ ] verified on staging environment (screenshot attached)
1 parent 893f01c commit 49dcb08
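
For context on why duplicates appeared: `ast.walk` visits every nested node, so a chained attribute access such as `spark._jspark._jvm` yields one `ast.Attribute` node per link in the chain, and a per-node matcher can report what a user sees as a single problem site more than once (the removed test expectations below, with identical start positions but different end columns, point to exactly this). A minimal illustrative sketch using only the standard library, not ucx code:

```python
import ast

# Parsing a chained attribute access yields one ast.Attribute node per link
# in the chain, so a matcher invoked once per walked node can fire repeatedly
# for what is, to the user, a single call site.
tree = ast.parse("spark._jspark._jvm.com.my.custom.Name()")
attribute_nodes = [n for n in ast.walk(tree) if isinstance(n, ast.Attribute)]
print(len(attribute_nodes))  # several nodes, all anchored in the same expression
```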

6 files changed: +55 −47 lines

src/databricks/labs/ucx/source_code/linters/spark_connect.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -22,6 +22,15 @@ def _cluster_type_str(self) -> str:
     def lint(self, node: ast.AST) -> Iterator[Advice]:
         pass
 
+    def lint_tree(self, tree: ast.AST) -> Iterator[Advice]:
+        reported_locations = set()
+        for node in ast.walk(tree):
+            for advice in self.lint(node):
+                loc = (advice.start_line, advice.start_col)
+                if loc not in reported_locations:
+                    reported_locations.add(loc)
+                    yield advice
+
 
 class JvmAccessMatcher(SharedClusterMatcher):
     _FIELDS = [
@@ -195,6 +204,5 @@ def __init__(self, is_serverless: bool = False):
 
     def lint(self, code: str) -> Iterator[Advice]:
         tree = ast.parse(code)
-        for node in ast.walk(tree):
-            for matcher in self._matchers:
-                yield from matcher.lint(node)
+        for matcher in self._matchers:
+            yield from matcher.lint_tree(tree)
```
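
In effect, each matcher now walks the tree itself and keeps a per-matcher set of `(start_line, start_col)` keys, yielding only the first advice seen at each start position. A standalone sketch of the same idiom, where `FakeAdvice` and `dedupe_by_start` are hypothetical stand-ins for ucx's `Advice` and the new `lint_tree` logic:

```python
from dataclasses import dataclass
from typing import Iterable, Iterator


@dataclass(frozen=True)
class FakeAdvice:  # hypothetical stand-in for ucx's Advice
    code: str
    start_line: int
    start_col: int


def dedupe_by_start(advices: Iterable[FakeAdvice]) -> Iterator[FakeAdvice]:
    seen: set[tuple[int, int]] = set()
    for advice in advices:
        loc = (advice.start_line, advice.start_col)
        if loc not in seen:  # first advice at a position wins; later ones are dropped
            seen.add(loc)
            yield advice


duplicated = [FakeAdvice("jvm-access-in-shared-clusters", 3, 0)] * 2
assert len(list(dedupe_by_start(duplicated))) == 1
```

Because `reported_locations` is local to each `lint_tree` call, deduplication is per matcher: two different matchers can still report distinct problems at the same position, which the new functional samples below rely on.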

tests/unit/source_code/linters/test_spark_connect.py

Lines changed: 16 additions & 40 deletions
```diff
@@ -21,14 +21,6 @@ def test_jvm_access_match_shared():
             end_line=3,
             end_col=18,
         ),
-        Failure(
-            code="jvm-access-in-shared-clusters",
-            message='Cannot access Spark Driver JVM on UC Shared Clusters',
-            start_line=3,
-            start_col=0,
-            end_line=3,
-            end_col=13,
-        ),
     ] == list(linter.lint(code))
 
 
@@ -48,14 +40,6 @@ def test_jvm_access_match_serverless():
             end_line=3,
             end_col=18,
         ),
-        Failure(
-            code="jvm-access-in-shared-clusters",
-            message='Cannot access Spark Driver JVM on Serverless Compute',
-            start_line=3,
-            start_col=0,
-            end_line=3,
-            end_col=13,
-        ),
     ] == list(linter.lint(code))
 
 
@@ -74,14 +58,6 @@ def test_rdd_context_match_shared():
             end_line=2,
             end_col=32,
         ),
-        Failure(
-            code='legacy-context-in-shared-clusters',
-            message='sc is not supported on UC Shared Clusters. Rewrite it using spark',
-            start_line=2,
-            start_col=7,
-            end_line=2,
-            end_col=21,
-        ),
         Failure(
             code="rdd-in-shared-clusters",
             message='RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API',
@@ -90,6 +66,14 @@ def test_rdd_context_match_shared():
             end_line=3,
             end_col=42,
         ),
+        Failure(
+            code='legacy-context-in-shared-clusters',
+            message='sc is not supported on UC Shared Clusters. Rewrite it using spark',
+            start_line=2,
+            start_col=7,
+            end_line=2,
+            end_col=21,
+        ),
         Failure(
             code="legacy-context-in-shared-clusters",
             message='sc is not supported on UC Shared Clusters. Rewrite it using spark',
@@ -116,14 +100,6 @@ def test_rdd_context_match_serverless():
             end_line=2,
             end_col=32,
         ),
-        Failure(
-            code='legacy-context-in-shared-clusters',
-            message='sc is not supported on Serverless Compute. Rewrite it using spark',
-            start_line=2,
-            start_col=7,
-            end_line=2,
-            end_col=21,
-        ),
         Failure(
             code="rdd-in-shared-clusters",
             message='RDD APIs are not supported on Serverless Compute. Rewrite it using DataFrame API',
@@ -132,6 +108,14 @@ def test_rdd_context_match_serverless():
             end_line=3,
             end_col=42,
         ),
+        Failure(
+            code='legacy-context-in-shared-clusters',
+            message='sc is not supported on Serverless Compute. Rewrite it using spark',
+            start_line=2,
+            start_col=7,
+            end_line=2,
+            end_col=21,
+        ),
         Failure(
             code="legacy-context-in-shared-clusters",
             message='sc is not supported on Serverless Compute. Rewrite it using spark',
@@ -158,14 +142,6 @@ def test_rdd_map_partitions():
             end_line=3,
             end_col=27,
         ),
-        Failure(
-            code="rdd-in-shared-clusters",
-            message='RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API',
-            start_line=3,
-            start_col=0,
-            end_line=3,
-            end_col=6,
-        ),
     ] == list(linter.lint(code))
 
 
```

Lines changed: 0 additions & 3 deletions
```diff
@@ -1,9 +1,6 @@
 spark.range(10).collect()
-# TODO: looks like a bug in linter, because we are hitting the same issue twice
-# ucx[jvm-access-in-shared-clusters:+2:0:+2:18] Cannot access Spark Driver JVM on UC Shared Clusters
 # ucx[jvm-access-in-shared-clusters:+1:0:+1:18] Cannot access Spark Driver JVM on UC Shared Clusters
 spark._jspark._jvm.com.my.custom.Name()
 
-# ucx[jvm-access-in-shared-clusters:+2:0:+2:18] Cannot access Spark Driver JVM on UC Shared Clusters
 # ucx[jvm-access-in-shared-clusters:+1:0:+1:18] Cannot access Spark Driver JVM on UC Shared Clusters
 spark._jspark._jvm.com.my.custom.Name()
```
Lines changed: 12 additions & 0 deletions
```diff
@@ -0,0 +1,12 @@
+df = spark.createDataFrame([])
+# ucx[rdd-in-shared-clusters:+1:0:+1:27] RDD APIs are not supported on UC Shared Clusters. Use mapInArrow() or Pandas UDFs instead
+df.rdd.mapPartitions(myUdf)
+
+# ucx[rdd-in-shared-clusters:+1:7:+1:32] RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API
+# ucx[legacy-context-in-shared-clusters:+1:7:+1:21] sc is not supported on UC Shared Clusters. Rewrite it using spark
+rdd1 = sc.parallelize([1, 2, 3])
+
+# ucx[rdd-in-shared-clusters:+1:29:+1:42] RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API
+# ucx[legacy-context-in-shared-clusters:+1:29:+1:40] sc is not supported on UC Shared Clusters. Rewrite it using spark
+rdd2 = spark.createDataFrame(sc.emptyRDD(), schema)
+
```
Lines changed: 15 additions & 0 deletions
```diff
@@ -0,0 +1,15 @@
+# ucx[legacy-context-in-shared-clusters:+1:0:+1:14] sc is not supported on UC Shared Clusters. Rewrite it using spark
+# ucx[spark-logging-in-shared-clusters:+1:0:+1:22] Cannot set Spark log level directly from code on UC Shared Clusters. Remove the call and set the cluster spark conf 'spark.log.level' instead
+sc.setLogLevel("INFO")
+setLogLevel("WARN")
+
+# ucx[jvm-access-in-shared-clusters:+1:14:+1:21] Cannot access Spark Driver JVM on UC Shared Clusters
+# ucx[legacy-context-in-shared-clusters:+1:14:+1:21] sc is not supported on UC Shared Clusters. Rewrite it using spark
+# ucx[spark-logging-in-shared-clusters:+1:14:+1:38] Cannot access Spark Driver JVM logger on UC Shared Clusters. Use logging.getLogger() instead
+log4jLogger = sc._jvm.org.apache.log4j
+LOGGER = log4jLogger.LogManager.getLogger(__name__)
+
+# ucx[jvm-access-in-shared-clusters:+1:0:+1:7] Cannot access Spark Driver JVM on UC Shared Clusters
+# ucx[legacy-context-in-shared-clusters:+1:0:+1:7] sc is not supported on UC Shared Clusters. Rewrite it using spark
+# ucx[spark-logging-in-shared-clusters:+1:12:+1:24] Cannot access Spark Driver JVM logger on UC Shared Clusters. Use logging.getLogger() instead
+sc._jvm.org.apache.log4j.LogManager.getLogger(__name__).info("test")
```
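
The samples above encode expected problems as comments of the form `# ucx[<code>:+<lines>:<start-col>:+<lines>:<end-col>] <message>`, where `+1` means the problem is expected one line below the comment. A rough sketch of how such a comment can be decomposed; the real parser lives in `tests/unit/source_code/test_functional.py` and may differ in detail:

```python
import re

# Hypothetical regex matching the expectation-comment shape seen above.
EXPECTATION = re.compile(
    r"# ucx\[(?P<code>[\w-]+):"
    r"\+(?P<start_line>\d+):(?P<start_col>\d+):"
    r"\+(?P<end_line>\d+):(?P<end_col>\d+)\] "
    r"(?P<message>.*)"
)

match = EXPECTATION.match(
    "# ucx[jvm-access-in-shared-clusters:+1:0:+1:18] "
    "Cannot access Spark Driver JVM on UC Shared Clusters"
)
assert match is not None
assert match["code"] == "jvm-access-in-shared-clusters"
assert (match["start_col"], match["end_col"]) == ("0", "18")
```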

tests/unit/source_code/test_functional.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -47,7 +47,7 @@ def __init__(self, path: Path):
 
     def verify(self):
         expected_problems = list(self._expected_problems())
-        actual_problems = list(self._lint())
+        actual_problems = sorted(list(self._lint()), key=lambda a: (a.start_line, a.start_col))
         high_level_expected = [f'{p.code}:{p.message}' for p in expected_problems]
         high_level_actual = [f'{p.code}:{p.message}' for p in actual_problems]
         assert high_level_expected == high_level_actual
```
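
Sorting the actual problems by `(start_line, start_col)` lets the expectation comments be written in source order even though the linter now iterates per matcher, which groups findings by matcher rather than by position. With illustrative values only:

```python
# Per-matcher iteration can yield a line-3 finding before a line-2 one;
# sorting by position restores the top-to-bottom order of the sample files.
findings = [(3, 0, "rdd-in-shared-clusters"), (2, 7, "legacy-context-in-shared-clusters")]
assert sorted(findings)[0][:2] == (2, 7)
```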
