@@ -2218,7 +2218,8 @@ class Validate:
- BigQuery table (`"bigquery"`)*
- Parquet table (`"parquet"`)*
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
- - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` extension, or Spark-style partitioned dataset)
+ - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+ extension, or partitioned dataset)

The table types marked with an asterisk need to be prepared as Ibis tables (with type of
`ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
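The docstring hunk above enumerates the file-based inputs that `Validate` accepts. As a minimal sketch of those input forms, assuming the package imports as `pointblank` and that `Validate(data=...)` is the entry point this diff modifies (all file paths here are hypothetical):

```python
from pathlib import Path

import pointblank as pb  # assumed import name for the package in this diff

# Single Parquet file, as a string or a pathlib.Path
v1 = pb.Validate(data="data/sales.parquet")
v2 = pb.Validate(data=Path("data/sales.parquet"))

# Glob pattern expanding to several Parquet files
v3 = pb.Validate(data="data/sales_*.parquet")

# Directory of Parquet files, including a partitioned dataset
v4 = pb.Validate(data="data/sales_partitioned")
```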
@@ -2735,11 +2736,11 @@ def _process_parquet_input(self, data: FrameT | Any) -> FrameT | Any:
Process data parameter to handle Parquet file inputs.

Supports:
- - Single .parquet file (string or Path)
- - Glob patterns for multiple .parquet files (e.g., "data/*.parquet")
- - Directory containing .parquet files
- - Spark-style partitioned datasets with automatic partition column inference
- - List /sequence of .parquet file paths
+ - single .parquet file (string or Path)
+ - glob patterns for multiple .parquet files (e.g., "data/*.parquet")
+ - directory containing .parquet files
+ - partitioned Parquet datasets with automatic partition column inference
+ - list/sequence of .parquet file paths

Returns the original data if it's not a Parquet file input.
"""
@@ -2753,8 +2754,8 @@ def _process_parquet_input(self, data: FrameT | Any) -> FrameT | Any:
data_str = str(data)
path_obj = Path(data)

- # Check if it's a glob pattern containing .parquet first
- # Look for glob characters: *, ?, [, ]
+ # Check if it's a glob pattern containing .parquet first; look for glob
+ # characters: `*`, `?`, `[`, `]`
if ".parquet" in data_str.lower() and any(
char in data_str for char in ["*", "?", "[", "]"]
):
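The rewritten comment in this hunk states the detection rule: a string is treated as a glob pattern only when it mentions `.parquet` and contains a glob metacharacter. A standalone sketch of that rule, with a hypothetical helper name and pattern:

```python
import glob

def looks_like_parquet_glob(data_str: str) -> bool:
    # Same rule as the hunk above: the string must mention `.parquet`
    # and contain at least one glob metacharacter (`*`, `?`, `[`, `]`).
    return ".parquet" in data_str.lower() and any(
        char in data_str for char in ["*", "?", "[", "]"]
    )

pattern = "data/2024-*.parquet"  # hypothetical pattern
if looks_like_parquet_glob(pattern):
    matched = sorted(glob.glob(pattern))  # expand to concrete file paths
```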
@@ -2773,9 +2774,8 @@ def _process_parquet_input(self, data: FrameT | Any) -> FrameT | Any:

# Check if it's a directory
elif path_obj.is_dir():
- # First, try to read as a partitioned parquet dataset; This handles
- # Spark-style partitioned datasets where parquet files are in subdirectories
- # with partition columns encoded in paths
+ # First, try to read as a partitioned parquet dataset; this handles datasets where
+ # Parquet files are in subdirectories with partition columns encoded in paths
try:
# Both Polars and Pandas can handle partitioned datasets natively
if _is_lib_present(lib_name="polars"):
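The comment kept in this hunk says both Polars and Pandas can handle partitioned datasets natively. A hedged sketch of that native handling, under an assumed hive-style directory layout; `hive_partitioning` is a `scan_parquet` parameter in recent Polars versions, and Pandas delegates directory reads to pyarrow:

```python
import pandas as pd
import polars as pl

# Hypothetical layout: data/sales/year=2024/month=01/part-0.parquet, ...

# Pandas (through pyarrow) reads the whole directory and infers the
# `year` and `month` partition columns from the subdirectory names.
pdf = pd.read_parquet("data/sales")

# Recent Polars versions can do the same via scan_parquet with hive
# partitioning enabled on a recursive glob.
pldf = pl.scan_parquet("data/sales/**/*.parquet", hive_partitioning=True).collect()
```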
@@ -2826,8 +2826,7 @@ def _process_parquet_input(self, data: FrameT | Any) -> FrameT | Any:
if not parquet_paths:
return data

- # Read the parquet file(s) using available libraries
- # Prefer Polars, fallback to Pandas
+ # Read the parquet file(s) using available libraries; prefer Polars, fall back to Pandas
if _is_lib_present(lib_name="polars"):
try:
import polars as pl
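The comment consolidated in this hunk captures the library-preference policy: use Polars when it is installed, otherwise fall back to Pandas. An illustrative reduction of that policy, not the module's exact code (the module checks availability with its own `_is_lib_present` helper; this sketch substitutes a plain `ImportError` check):

```python
def read_parquet_any(path: str):
    # Prefer Polars when available; otherwise fall back to Pandas.
    try:
        import polars as pl
        return pl.read_parquet(path)
    except ImportError:
        import pandas as pd
        return pd.read_parquet(path)
```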
@@ -2836,7 +2835,7 @@ def _process_parquet_input(self, data: FrameT | Any) -> FrameT | Any:
# Single file
return pl.read_parquet(parquet_paths[0])
else:
- # Multiple files - concatenate them
+ # Multiple files: concatenate them
dfs = [pl.read_parquet(path) for path in parquet_paths]
return pl.concat(dfs, how="vertical_relaxed")
except Exception as e:
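`how="vertical_relaxed"` in the hunk above stacks the frames vertically and, unlike plain `"vertical"`, casts columns to a common supertype when the files' schemas differ slightly; the Pandas branches in the next hunks instead renumber rows with `ignore_index=True`. A small illustration with made-up frames:

```python
import polars as pl

# Two frames whose `x` columns have different but compatible dtypes,
# mimicking slightly divergent schemas across Parquet files.
a = pl.DataFrame({"x": [1, 2]})      # Int64
b = pl.DataFrame({"x": [3.5, 4.5]})  # Float64

# `vertical_relaxed` casts to the common supertype (Float64 here)
# rather than raising a schema-mismatch error, as plain "vertical" would.
out = pl.concat([a, b], how="vertical_relaxed")
print(out.schema)  # x: Float64
```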
@@ -2847,7 +2846,7 @@ def _process_parquet_input(self, data: FrameT | Any) -> FrameT | Any:
if len(parquet_paths) == 1:
return pd.read_parquet(parquet_paths[0])
else:
- # Multiple files - concatenate them
+ # Multiple files: concatenate them
dfs = [pd.read_parquet(path) for path in parquet_paths]
return pd.concat(dfs, ignore_index=True)
else:
@@ -2862,7 +2861,7 @@ def _process_parquet_input(self, data: FrameT | Any) -> FrameT | Any:
if len(parquet_paths) == 1:
return pd.read_parquet(parquet_paths[0])
else:
- # Multiple files - concatenate them
+ # Multiple files: concatenate them
dfs = [pd.read_parquet(path) for path in parquet_paths]
return pd.concat(dfs, ignore_index=True)
except Exception as e: