@@ -60,6 +60,7 @@ def _read_parquet_metadata_file(
6060 s3_additional_kwargs : Optional [Dict [str , str ]],
6161 use_threads : Union [bool , int ],
6262 version_id : Optional [str ] = None ,
63+ ignore_null : bool = False ,
6364 pyarrow_additional_kwargs : Optional [Dict [str , Any ]] = None ,
6465) -> Optional [Dict [str , str ]]:
6566 pyarrow_args = _set_default_pyarrow_additional_kwargs (pyarrow_additional_kwargs )
@@ -77,7 +78,9 @@ def _read_parquet_metadata_file(
7778 )
7879 if pq_file is None :
7980 return None
80- return _data_types .athena_types_from_pyarrow_schema (schema = pq_file .schema .to_arrow_schema (), partitions = None )[0 ]
81+ return _data_types .athena_types_from_pyarrow_schema (
82+ schema = pq_file .schema .to_arrow_schema (), partitions = None , ignore_null = ignore_null
83+ )[0 ]
8184
8285
8386def _read_schemas_from_files (
@@ -87,6 +90,7 @@ def _read_schemas_from_files(
8790 boto3_session : boto3 .Session ,
8891 s3_additional_kwargs : Optional [Dict [str , str ]],
8992 version_ids : Optional [Dict [str , str ]] = None ,
93+ ignore_null : bool = False ,
9094 pyarrow_additional_kwargs : Optional [Dict [str , Any ]] = None ,
9195) -> Tuple [Dict [str , str ], ...]:
9296
@@ -102,6 +106,7 @@ def _read_schemas_from_files(
102106 s3_additional_kwargs = s3_additional_kwargs ,
103107 use_threads = use_threads ,
104108 version_id = version_ids .get (p ) if isinstance (version_ids , dict ) else None ,
109+ ignore_null = ignore_null ,
105110 pyarrow_additional_kwargs = pyarrow_additional_kwargs ,
106111 )
107112 for p in paths
@@ -117,6 +122,7 @@ def _read_schemas_from_files(
117122 itertools .repeat (s3_additional_kwargs ),
118123 itertools .repeat (use_threads ),
119124 versions ,
125+ itertools .repeat (ignore_null ),
120126 itertools .repeat (pyarrow_additional_kwargs ),
121127 )
122128 )
@@ -175,6 +181,7 @@ def _read_parquet_metadata(
175181 path_suffix : Optional [str ],
176182 path_ignore_suffix : Optional [str ],
177183 ignore_empty : bool ,
184+ ignore_null : bool ,
178185 dtype : Optional [Dict [str , str ]],
179186 sampling : float ,
180187 dataset : bool ,
@@ -207,6 +214,7 @@ def _read_parquet_metadata(
207214 else {paths [0 ]: version_id }
208215 if isinstance (version_id , str )
209216 else None ,
217+ ignore_null = ignore_null ,
210218 pyarrow_additional_kwargs = pyarrow_additional_kwargs ,
211219 )
212220 columns_types : Dict [str , str ] = _merge_schemas (schemas = schemas )
@@ -990,6 +998,7 @@ def read_parquet_metadata(
990998 path_suffix : Optional [str ] = None ,
991999 path_ignore_suffix : Optional [str ] = None ,
9921000 ignore_empty : bool = True ,
1001+ ignore_null : bool = False ,
9931002 dtype : Optional [Dict [str , str ]] = None ,
9941003 sampling : float = 1.0 ,
9951004 dataset : bool = False ,
@@ -1030,6 +1039,8 @@ def read_parquet_metadata(
10301039 If None, will try to read all files. (default)
10311040 ignore_empty: bool
10321041 Ignore files with 0 bytes.
1042+ ignore_null: bool
1043+ Ignore columns with null type.
10331044 dtype : Dict[str, str], optional
10341045 Dictionary of columns names and Athena/Glue types to be casted.
10351046 Useful when you have columns with undetermined data types as partitions columns.
@@ -1083,6 +1094,7 @@ def read_parquet_metadata(
10831094 path_suffix = path_suffix ,
10841095 path_ignore_suffix = path_ignore_suffix ,
10851096 ignore_empty = ignore_empty ,
1097+ ignore_null = ignore_null ,
10861098 dtype = dtype ,
10871099 sampling = sampling ,
10881100 dataset = dataset ,
# NOTE: "0 commit comments" is GitHub page footer residue from scraping, not part of the patch.