Skip to content

Commit 1d0b1fb

Browse files
SNOW-2867055: Fix leading slash in PATTERN FILES path for INFER_SCHEMA (#4128)
1 parent 494aeb0 commit 1d0b1fb

File tree

3 files changed

+98
-2
lines changed

3 files changed

+98
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@
5252

5353
- Fixed a bug where `cloudpickle` was not automatically added to the package list when using `artifact_repository` with custom packages, causing `ModuleNotFoundError` at runtime.
5454
- Fixed a bug when reading xml with custom schema, result include element attributes when column is not `StructType` type.
55+
- Fixed a bug where `INFER_SCHEMA` with `PATTERN` silently fell back to unfiltered inference when the stage location had no trailing slash, causing metadata files (e.g., `_common_metadata`) to corrupt type inference for timestamp columns.
5556

5657
#### Improvements
5758

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1272,7 +1272,7 @@ def _infer_schema_for_file_format(
12721272
prefix = match[0]
12731273
for regex in regexes:
12741274
prefix = regex.sub("", prefix)
1275-
files.append(prefix)
1275+
files.append(prefix.lstrip("/"))
12761276
infer_schema_options["FILES"] = files
12771277

12781278
# Reconstruct path using just stage and any qualifiers

tests/integ/test_dataframe_reader_file.py

Lines changed: 96 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,17 @@
22
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
33
#
44

5+
import datetime
6+
import os
7+
import tempfile
8+
9+
import pyarrow as pa
10+
import pyarrow.parquet as pq
511
import pytest
12+
13+
from snowflake.snowpark._internal.utils import TempObjectType
614
from snowflake.snowpark.functions import col, fl_get_file_type
7-
from snowflake.snowpark.types import FileType
15+
from snowflake.snowpark.types import FileType, TimestampType
816
from tests.utils import Utils, TestFiles
917

1018

@@ -356,3 +364,90 @@ def test_file_pattern_escape_single_quotes(session, resources_path):
356364

357365
finally:
358366
Utils.drop_stage(session, test_stage)
367+
368+
369+
@pytest.mark.skipif(
370+
"config.getoption('local_testing_mode', default=False)",
371+
reason="FEAT: parquet not supported",
372+
)
373+
def test_parquet_pattern_infer_with_metadata_files(session):
374+
"""When a stage contains both .parquet data files and _common_metadata
375+
files with a mismatched schema, reading with PATTERN should correctly
376+
infer timestamp columns instead of falling back to VARIANT."""
377+
stage_name = Utils.random_name_for_temp_object(TempObjectType.STAGE)
378+
379+
ts1 = datetime.datetime(2024, 1, 15, 10, 30, 0, 123456)
380+
ts2 = datetime.datetime(2024, 6, 20, 14, 45, 30, 789012)
381+
382+
arrow_schema = pa.schema(
383+
[
384+
pa.field("id", pa.int64()),
385+
pa.field("name", pa.string()),
386+
pa.field("updated_time", pa.timestamp("us", tz="UTC")),
387+
]
388+
)
389+
390+
table = pa.table(
391+
{
392+
"id": [1, 2],
393+
"name": ["Alice", "Bob"],
394+
"updated_time": [ts1, ts2],
395+
},
396+
schema=arrow_schema,
397+
)
398+
399+
bad_arrow_schema = pa.schema(
400+
[
401+
pa.field("id", pa.int64()),
402+
pa.field("name", pa.string()),
403+
pa.field("updated_time", pa.timestamp("us")),
404+
]
405+
)
406+
407+
try:
408+
Utils.create_stage(session, stage_name, is_temporary=True)
409+
410+
with tempfile.TemporaryDirectory() as tmpdir:
411+
data_path = os.path.join(tmpdir, "data.parquet")
412+
pq.write_table(table, data_path)
413+
414+
meta_path = os.path.join(tmpdir, "_common_metadata")
415+
empty_table = pa.table(
416+
{
417+
name: pa.array([], type=bad_arrow_schema.field(name).type)
418+
for name in bad_arrow_schema.names
419+
},
420+
schema=bad_arrow_schema,
421+
)
422+
pq.write_table(empty_table, meta_path)
423+
424+
session.file.put(
425+
data_path,
426+
f"@{stage_name}/subdir",
427+
auto_compress=False,
428+
overwrite=True,
429+
)
430+
session.file.put(
431+
meta_path,
432+
f"@{stage_name}/subdir",
433+
auto_compress=False,
434+
overwrite=True,
435+
)
436+
437+
df = session.read.option("PATTERN", ".*[.]parquet").parquet(
438+
f"@{stage_name}/subdir"
439+
)
440+
441+
schema = df.schema
442+
ts_field = [
443+
f for f in schema.fields if f.name.strip('"').upper() == "UPDATED_TIME"
444+
][0]
445+
assert isinstance(
446+
ts_field.datatype, TimestampType
447+
), f"Expected TimestampType, got {ts_field.datatype}"
448+
449+
rows = df.collect()
450+
assert len(rows) == 2
451+
452+
finally:
453+
Utils.drop_stage(session, stage_name)

0 commit comments

Comments (0)