
Commit c6cea73

jackywang-db authored and sryza committed
[SPARK-53591][SDP] Simplify Pipeline Spec Pattern Glob Matching
### What changes were proposed in this pull request?

In SDP, the recommended scaffolding is to put pipeline definition files in the `transformations` folder and any of its subfolders. Currently, if users have both SQL and Python pipeline definition files, they need to do something like the following to include all of them in the pipeline spec:

```yml
libraries:
  - glob:
      include: transformations/**/*.py
  - glob:
      include: transformations/**/*.sql
```

This is cumbersome and requires extra work from the user. `transformations` should only contain pipeline source files ending in `.py` or `.sql`, so ideally users shouldn't need to specify file extensions at all.

This PR adds support for the pattern below for source file matching, and throws an exception for extension-specific patterns like the ones above to discourage users from putting other file types in this directory:

```yml
libraries:
  - glob:
      include: transformations/**  # matches recursively
```

### Why are the changes needed?

To simplify the user experience by removing the need to manually supply globs with file extensions.

### Does this PR introduce _any_ user-facing change?

Yes, but SDP has not been released yet.

### How was this patch tested?

New and existing tests, plus running the CLI manually.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52348 from JiaqiWang18/SPARK-53591-restrict-sdp-glob-matching.

Authored-by: Jacky Wang <[email protected]>
Signed-off-by: Sandy Ryza <[email protected]>
1 parent a13187c commit c6cea73
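For illustration, here is a quick sketch of how the validator added below in `python/pyspark/pipelines/cli.py` treats a few patterns. The function name and error condition are taken from this diff; the example file paths are made up:

```python
from pyspark.pipelines.cli import validate_patch_glob_pattern

# Plain file paths (no wildcards) pass through unchanged.
print(validate_patch_glob_pattern("transformations/orders.sql"))  # transformations/orders.sql

# A folder path ending in /** is normalized so pathlib matches files recursively.
print(validate_patch_glob_pattern("transformations/**"))          # transformations/**/*

# Extension-specific patterns such as the old defaults now raise the
# PIPELINE_SPEC_INVALID_GLOB_PATTERN error condition:
# validate_patch_glob_pattern("transformations/**/*.py")
```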

6 files changed (+162 −46 lines)

python/pyspark/errors/error-conditions.json

Lines changed: 5 additions & 0 deletions
@@ -901,6 +901,11 @@
       "No pipeline.yaml or pipeline.yml file provided in arguments or found in directory `<dir_path>` or readable ancestor directories."
     ]
   },
+  "PIPELINE_SPEC_INVALID_GLOB_PATTERN": {
+    "message": [
+      "Invalid glob pattern `<glob_pattern>` in libraries. Only file paths, or folder paths ending with /** are allowed."
+    ]
+  },
   "PIPELINE_SPEC_MISSING_REQUIRED_FIELD": {
     "message": [
       "Pipeline spec missing required field `<field_name>`."

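For context, PySpark fills the `<glob_pattern>` placeholder from the message parameters supplied at raise time. A minimal sketch of what a caller sees, mirroring the assertions in the new tests further down (the pattern string is just an example):

```python
from pyspark.errors import PySparkException

try:
    raise PySparkException(
        errorClass="PIPELINE_SPEC_INVALID_GLOB_PATTERN",
        messageParameters={"glob_pattern": "transformations/**/*.py"},
    )
except PySparkException as e:
    print(e.getCondition())          # PIPELINE_SPEC_INVALID_GLOB_PATTERN
    print(e.getMessageParameters())  # {'glob_pattern': 'transformations/**/*.py'}
```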
python/pyspark/pipelines/cli.py

Lines changed: 42 additions & 1 deletion
@@ -23,6 +23,7 @@
 """
 from contextlib import contextmanager
 import argparse
+import glob
 import importlib.util
 import os
 import yaml
@@ -58,6 +59,32 @@ class LibrariesGlob:
     include: str
 
 
+def validate_patch_glob_pattern(glob_pattern: str) -> str:
+    """Validates that a glob pattern is allowed.
+
+    Only allows:
+    - File paths (paths without wildcards except for the filename)
+    - Folder paths ending with /** (recursive directory patterns)
+
+    Disallows complex glob patterns like transformations/**/*.py
+    """
+    # Check if it's a simple file path (no wildcards at all)
+    if not glob.has_magic(glob_pattern):
+        return glob_pattern
+
+    # Check if it's a folder path ending with /**
+    if glob_pattern.endswith("/**"):
+        prefix = glob_pattern[:-3]
+        if not glob.has_magic(prefix):
+            # append "/*" to match everything under the directory recursively
+            return glob_pattern + "/*"
+
+    raise PySparkException(
+        errorClass="PIPELINE_SPEC_INVALID_GLOB_PATTERN",
+        messageParameters={"glob_pattern": glob_pattern},
+    )
+
+
 @dataclass(frozen=True)
 class PipelineSpec:
     """Spec for a pipeline.
@@ -75,6 +102,16 @@ class PipelineSpec:
     configuration: Mapping[str, str]
     libraries: Sequence[LibrariesGlob]
 
+    def __post_init__(self) -> None:
+        """Validate libraries automatically after instantiation."""
+        validated = [
+            LibrariesGlob(validate_patch_glob_pattern(lib.include)) for lib in self.libraries
+        ]
+
+        # If normalization changed anything, patch into frozen dataclass
+        if tuple(validated) != tuple(self.libraries):
+            object.__setattr__(self, "libraries", tuple(validated))
+
 
 def find_pipeline_spec(current_dir: Path) -> Path:
     """Looks in the current directory and its ancestors for a pipeline spec file."""
@@ -180,7 +217,11 @@ def register_definitions(
     log_with_curr_timestamp(f"Loading definitions. Root directory: '{path}'.")
     for libraries_glob in spec.libraries:
         glob_expression = libraries_glob.include
-        matching_files = [p for p in path.glob(glob_expression) if p.is_file()]
+        matching_files = [
+            p
+            for p in path.glob(glob_expression)
+            if p.is_file() and "__pycache__" not in p.parts  # ignore generated python cache
+        ]
         log_with_curr_timestamp(
             f"Found {len(matching_files)} files matching glob '{glob_expression}'"
         )

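One note on why the validator appends "/*" to a trailing "**": with `pathlib`, a pattern that ends in "**" matches directories only on Python versions before 3.13, so the rewritten pattern is what lets `register_definitions` actually see the source files after the `is_file()` filter. A small sketch, using a made-up project layout:

```python
from pathlib import Path

root = Path("my_project")  # hypothetical layout for illustration
(root / "transformations" / "nested").mkdir(parents=True, exist_ok=True)
(root / "transformations" / "top.sql").write_text("-- sql")
(root / "transformations" / "nested" / "deep.py").write_text("# py")

# Before Python 3.13, a trailing "**" yields directories only...
print(sorted(str(p) for p in root.glob("transformations/**")))

# ...while the normalized pattern plus is_file() yields the definition files.
print(sorted(str(p) for p in root.glob("transformations/**/*") if p.is_file()))
```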
python/pyspark/pipelines/init_cli.py

Lines changed: 1 addition & 3 deletions
@@ -21,9 +21,7 @@
 name: {{ name }}
 libraries:
   - glob:
-      include: transformations/**/*.py
-  - glob:
-      include: transformations/**/*.sql
+      include: transformations/**
 """
 
 PYTHON_EXAMPLE = """from pyspark import pipelines as dp

python/pyspark/pipelines/tests/test_cli.py

Lines changed: 67 additions & 3 deletions
@@ -240,7 +240,7 @@ def test_register_definitions(self):
             catalog=None,
             database=None,
             configuration={},
-            libraries=[LibrariesGlob(include="subdir1/*")],
+            libraries=[LibrariesGlob(include="subdir1/**")],
         )
         with tempfile.TemporaryDirectory() as temp_dir:
             outer_dir = Path(temp_dir)
@@ -283,7 +283,7 @@ def test_register_definitions_file_raises_error(self):
             catalog=None,
             database=None,
             configuration={},
-            libraries=[LibrariesGlob(include="*")],
+            libraries=[LibrariesGlob(include="./**")],
         )
         with tempfile.TemporaryDirectory() as temp_dir:
             outer_dir = Path(temp_dir)
@@ -301,7 +301,7 @@ def test_register_definitions_unsupported_file_extension_matches_glob(self):
             catalog=None,
             database=None,
             configuration={},
-            libraries=[LibrariesGlob(include="*")],
+            libraries=[LibrariesGlob(include="./**")],
         )
         with tempfile.TemporaryDirectory() as temp_dir:
             outer_dir = Path(temp_dir)
@@ -451,6 +451,70 @@ def test_parse_table_list_with_spaces(self):
         result = parse_table_list("table1, table2 , table3")
         self.assertEqual(result, ["table1", "table2", "table3"])
 
+    def test_valid_glob_patterns(self):
+        """Test that valid glob patterns are accepted."""
+        from pyspark.pipelines.cli import validate_patch_glob_pattern
+
+        cases = {
+            # Simple file paths
+            "src/main.py": "src/main.py",
+            "data/file.sql": "data/file.sql",
+            # Folder paths ending with /** (normalized)
+            "src/**": "src/**/*",
+            "transformations/**": "transformations/**/*",
+            "notebooks/production/**": "notebooks/production/**/*",
+        }
+
+        for pattern, expected in cases.items():
+            with self.subTest(pattern=pattern):
+                self.assertEqual(validate_patch_glob_pattern(pattern), expected)
+
+    def test_invalid_glob_patterns(self):
+        """Test that invalid glob patterns are rejected."""
+        from pyspark.pipelines.cli import validate_patch_glob_pattern
+
+        invalid_patterns = [
+            "transformations/**/*.py",
+            "src/**/utils/*.py",
+            "*/main.py",
+            "src/*/test/*.py",
+            "**/*.py",
+            "data/*/file.sql",
+        ]
+
+        for pattern in invalid_patterns:
+            with self.subTest(pattern=pattern):
+                with self.assertRaises(PySparkException) as context:
+                    validate_patch_glob_pattern(pattern)
+                self.assertEqual(
+                    context.exception.getCondition(), "PIPELINE_SPEC_INVALID_GLOB_PATTERN"
+                )
+                self.assertEqual(
+                    context.exception.getMessageParameters(), {"glob_pattern": pattern}
+                )
+
+    def test_pipeline_spec_with_invalid_glob_pattern(self):
+        """Test that pipeline spec with invalid glob pattern is rejected."""
+        with tempfile.NamedTemporaryFile(mode="w") as tmpfile:
+            tmpfile.write(
+                """
+                {
+                    "name": "test_pipeline",
+                    "libraries": [
+                        {"glob": {"include": "transformations/**/*.py"}}
+                    ]
+                }
+                """
+            )
+            tmpfile.flush()
+            with self.assertRaises(PySparkException) as context:
+                load_pipeline_spec(Path(tmpfile.name))
+            self.assertEqual(context.exception.getCondition(), "PIPELINE_SPEC_INVALID_GLOB_PATTERN")
+            self.assertEqual(
+                context.exception.getMessageParameters(),
+                {"glob_pattern": "transformations/**/*.py"},
+            )
+
 
 if __name__ == "__main__":
     try:

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/EndToEndAPISuite.scala

Lines changed: 7 additions & 2 deletions
@@ -55,8 +55,13 @@ class EndToEndAPISuite extends PipelineTest with APITest with SparkConnectServer
     // Create each source file in the temporary directory
     sources.foreach { file =>
       val filePath = Paths.get(file.name)
-      val fileName = filePath.getFileName.toString
-      val tempFilePath = projectDir.resolve(fileName)
+      val tempFilePath = projectDir.resolve(filePath)
+
+      // Create any necessary parent directories
+      val parentDir = tempFilePath.getParent
+      if (parentDir != null) {
+        Files.createDirectories(parentDir)
+      }
 
       // Create the file with the specified contents
       Files.write(tempFilePath, file.contents.getBytes("UTF-8"))

0 commit comments
