 import logging
 import pprint
 import warnings
-from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union, cast

 import boto3
 import pandas as pd
@@ -801,7 +801,7 @@ def read_parquet_table(
     ----
     ``Batching`` (`chunked` argument) (Memory Friendly):

-    Will anable the function to return a Iterable of DataFrames instead of a regular DataFrame.
+    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

     There are two batching strategies on Wrangler:

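A minimal sketch of the memory-friendly mode documented above, assuming a hypothetical Glue database `my_db` and table `my_table` (the names are illustrative only):

import awswrangler as wr

# With chunked=True, read_parquet_table returns an iterable of DataFrames
# instead of a single DataFrame; the number of rows per chunk is non-deterministic.
for df in wr.s3.read_parquet_table(database="my_db", table="my_table", chunked=True):
    print(df.shape)  # process each chunk without loading the whole table into memory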
@@ -836,8 +836,8 @@ def read_parquet_table(
         If none is provided, the AWS account ID is used by default.
     partition_filter: Optional[Callable[[Dict[str, str]], bool]]
         Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
-        This function MUST receive a single argument (Dict[str, str]) where keys are partitions
-        names and values are partitions values. Partitions values will be always strings extracted from S3.
+        This function MUST receive a single argument (Dict[str, str]) where keys are partition
+        names and values are partition values. Partition values will be always strings extracted from S3.
         This function MUST return a bool, True to read the partition or False to ignore it.
         Ignored if `dataset=False`.
         E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
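The push-down filter described above can be illustrated with a short usage sketch; the database and table names are hypothetical, and the lambda mirrors the docstring's own example:

import awswrangler as wr

# The callback receives one dict per partition, e.g. {"year": "2020", "month": "1"},
# with values always passed as strings extracted from S3; partitions for which it
# returns False are skipped before any Parquet object is read.
df = wr.s3.read_parquet_table(
    database="my_db",
    table="my_table",
    partition_filter=lambda x: x["year"] == "2020" and x["month"] == "1",
)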
@@ -861,7 +861,7 @@ def read_parquet_table(
         used to override the default pandas type for conversion of built-in
         pyarrow types or in absence of pandas_metadata in the Table schema.
     chunked : bool
-        If True will break the data in smaller DataFrames (Non deterministic number of lines).
+        If True will break the data in smaller DataFrames (Non-deterministic number of lines).
         Otherwise return a single DataFrame with the whole data.
     use_threads : Union[bool, int]
         True to enable concurrent requests, False to disable multiple threads.
@@ -931,7 +931,7 @@ def read_parquet_table(
             catalog_id=catalog_id,
             boto3_session=boto3_session,
         )
-        available_partitions = list(available_partitions_dict.keys())
+        available_partitions = list(_ensure_locations_are_valid(available_partitions_dict.keys()))
         if available_partitions:
             paths = []
             path_root = path
@@ -973,6 +973,16 @@ def read_parquet_table(
     return map(partial_cast_function, df)


+def _ensure_locations_are_valid(paths: Iterable[str]) -> Iterator[str]:
+    for path in paths:
+        suffix: str = path.rpartition("/")[2]
+        # If the suffix looks like a partition,
+        if (suffix != "") and (suffix.count("=") == 1):
+            # the path should end in a '/' character.
+            path = f"{path}/"
+        yield path
+
+
 @apply_configs
 def read_parquet_metadata(
     path: Union[str, List[str]],
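The new helper added above normalizes the Glue partition locations before they are filtered: a location whose last path segment looks like a Hive partition (exactly one '=') gets a trailing '/'. A standalone sketch of that behavior, using made-up bucket and table names:

from typing import Iterable, Iterator


def _ensure_locations_are_valid(paths: Iterable[str]) -> Iterator[str]:
    # Same logic as the helper introduced in this commit.
    for path in paths:
        suffix: str = path.rpartition("/")[2]
        # If the suffix looks like a partition (e.g. "year=2020"),
        if (suffix != "") and (suffix.count("=") == 1):
            # the path should end in a '/' character.
            path = f"{path}/"
        yield path


print(list(_ensure_locations_are_valid([
    "s3://bucket/table/year=2020",   # gains a trailing slash
    "s3://bucket/table/year=2020/",  # already ends in '/', unchanged
    "s3://bucket/table",             # no '=' in the last segment, unchanged
])))
# ['s3://bucket/table/year=2020/', 's3://bucket/table/year=2020/', 's3://bucket/table']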