@@ -530,6 +530,11 @@ def to_parquet( # pylint: disable=too-many-arguments
530530    The concept of Dataset goes beyond the simple idea of files and enables more
531531 complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
532532
533+ Note
534+ ----
535+    The table name and all column names will be automatically sanitized using
536+ `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
537+
533538 Note
534539 ----
535540    In case of `use_threads=True` the number of processes that will be spawned will be obtained from os.cpu_count().
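The Note added above points at `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. A minimal sketch of how a caller could inspect that sanitization up front; the names are made up for illustration and the exact rules (lower-casing, replacing forbidden characters) depend on the installed awswrangler version:

```python
import awswrangler as wr

# Hypothetical names, used only to illustrate the sanitization mentioned in the Note.
# The exact output depends on the library's sanitization rules in the installed version.
print(wr.catalog.sanitize_table_name("My-Table Name"))    # e.g. "my_table_name"
print(wr.catalog.sanitize_column_name("Camel Case Col"))  # e.g. "camel_case_col"
```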
@@ -833,7 +838,7 @@ def _to_parquet_file(
833838    fs: s3fs.S3FileSystem,
834839    dtype: Dict[str, str],
835840) -> str:
836-    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=False)
841+    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True)
837842    for col_name, col_type in dtype.items():
838843        if col_name in table.column_names:
839844            col_index = table.column_names.index(col_name)
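The hunk above switches `pyarrow.Table.from_pandas` from `safe=False` to `safe=True`, so lossy pandas-to-Arrow casts are rejected instead of silently truncated. A small standalone sketch of what that flag changes, with a made-up DataFrame and schema (not code from this module):

```python
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"x": [1.0, 2.5]})             # 2.5 cannot be stored exactly as int64
schema = pa.schema([pa.field("x", pa.int64())])

# With safe=False the fractional value would be truncated silently;
# with safe=True pyarrow is expected to raise instead.
try:
    pa.Table.from_pandas(df=df, schema=schema, preserve_index=False, safe=True)
except pa.ArrowInvalid as err:
    print(f"unsafe cast rejected: {err}")
```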
@@ -1190,6 +1195,7 @@ def _read_text_full(
11901195def _read_parquet_init(
11911196    path: Union[str, List[str]],
11921197    filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
1198+    categories: List[str] = None,
11931199    dataset: bool = False,
11941200    use_threads: bool = True,
11951201    boto3_session: Optional[boto3.Session] = None,
@@ -1206,7 +1212,7 @@ def _read_parquet_init(
12061212    fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs)
12071213    cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
12081214    data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
1209-        path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters
1215+        path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters, read_dictionary=categories
12101216 )
12111217 return data
12121218
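`_read_parquet_init` now forwards `categories` to `read_dictionary`, which asks pyarrow to read those columns as dictionary-encoded arrays (the Arrow counterpart of pandas Categoricals). A hedged, self-contained sketch of the underlying pyarrow behaviour; the local file and column name are placeholders:

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Placeholder local file, only to illustrate read_dictionary.
pq.write_table(pa.table({"city": ["rio", "sp", "rio", "sp"]}), "example.parquet")

table = pq.read_table("example.parquet", read_dictionary=["city"])
print(table.schema.field("city").type)   # dictionary-encoded string column
print(table.to_pandas()["city"].dtype)   # category
```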
@@ -1217,6 +1223,7 @@ def read_parquet(
12171223    columns: Optional[List[str]] = None,
12181224    chunked: bool = False,
12191225    dataset: bool = False,
1226+    categories: List[str] = None,
12201227    use_threads: bool = True,
12211228    boto3_session: Optional[boto3.Session] = None,
12221229    s3_additional_kwargs: Optional[Dict[str, str]] = None,
@@ -1243,6 +1250,9 @@ def read_parquet(
12431250 Otherwise return a single DataFrame with the whole data.
12441251 dataset: bool
12451252        If True, read a parquet dataset instead of simple file(s), loading all the related partitions as columns.
1253+ categories: List[str], optional
1254+        List of column names that should be returned as pandas.Categorical.
1255+        Recommended for memory-restricted environments.
12461256 use_threads : bool
12471257 True to enable concurrent requests, False to disable multiple threads.
12481258        If enabled, os.cpu_count() will be used as the maximum number of threads.
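The docstring above introduces the new `categories` argument of `wr.s3.read_parquet`. A usage sketch; the bucket, prefix, and column names are placeholders, not values from this change:

```python
import awswrangler as wr

# Placeholder S3 path and column names, for illustration only.
df = wr.s3.read_parquet(
    path="s3://my-bucket/my-dataset/",
    dataset=True,
    categories=["country", "product"],  # returned as pandas.Categorical to save memory
)
print(df.dtypes)
```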
@@ -1292,66 +1302,59 @@ def read_parquet(
12921302        path=path,
12931303        filters=filters,
12941304        dataset=dataset,
1305+        categories=categories,
12951306        use_threads=use_threads,
12961307        boto3_session=boto3_session,
12971308        s3_additional_kwargs=s3_additional_kwargs,
12981309    )
1299-    common_metadata = data.common_metadata
1300-    common_metadata = None if common_metadata is None else common_metadata.metadata.get(b"pandas", None)
13011310    if chunked is False:
1302-        return _read_parquet(data=data, columns=columns, use_threads=use_threads, common_metadata=common_metadata)
1303-    return _read_parquet_chunked(data=data, columns=columns, use_threads=use_threads, common_metadata=common_metadata)
1311+        return _read_parquet(data=data, columns=columns, categories=categories, use_threads=use_threads)
1312+    return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads)
13041313
13051314
13061315def _read_parquet(
13071316    data: pyarrow.parquet.ParquetDataset,
13081317    columns: Optional[List[str]] = None,
1318+    categories: List[str] = None,
13091319    use_threads: bool = True,
1310-    common_metadata: Any = None,
13111320) -> pd.DataFrame:
1312-    # Data
13131321    tables: List[pa.Table] = []
13141322    for piece in data.pieces:
13151323        table: pa.Table = piece.read(
1316-            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=True
1324+            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
13171325        )
13181326        tables.append(table)
13191327    table = pa.lib.concat_tables(tables)
1320-
1321-    # Metadata
1322-    current_metadata = table.schema.metadata or {}
1323-    if common_metadata and b"pandas" not in current_metadata:  # pragma: no cover
1324-        table = table.replace_schema_metadata({b"pandas": common_metadata})
1325-
13261328    return table.to_pandas(
13271329        use_threads=use_threads,
13281330        split_blocks=True,
13291331        self_destruct=True,
13301332        integer_object_nulls=False,
13311333        date_as_object=True,
1334+        ignore_metadata=True,
1335+        categories=categories,
13321336        types_mapper=_data_types.pyarrow2pandas_extension,
13331337    )
13341338
13351339
13361340def _read_parquet_chunked(
13371341    data: pyarrow.parquet.ParquetDataset,
13381342    columns: Optional[List[str]] = None,
1343+    categories: List[str] = None,
13391344    use_threads: bool = True,
1340-    common_metadata: Any = None,
13411345) -> Iterator[pd.DataFrame]:
13421346    for piece in data.pieces:
13431347        table: pa.Table = piece.read(
1344-            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=True
1348+            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
13451349        )
1346-        current_metadata = table.schema.metadata or {}
1347-        if common_metadata and b"pandas" not in current_metadata:  # pragma: no cover
1348-            table = table.replace_schema_metadata({b"pandas": common_metadata})
13491350        yield table.to_pandas(
13501351            use_threads=use_threads,
13511352            split_blocks=True,
13521353            self_destruct=True,
13531354            integer_object_nulls=False,
13541355            date_as_object=True,
1356+            ignore_metadata=True,
1357+            categories=categories,
13551358            types_mapper=_data_types.pyarrow2pandas_extension,
13561359        )
13571360
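With `chunked=True` the helpers above yield one DataFrame per dataset piece instead of concatenating everything into memory. A sketch of how a caller might stream over the chunks; the path and column name are placeholders:

```python
import awswrangler as wr

# Placeholder path; chunked=True returns an iterator with one DataFrame per
# Parquet piece (a non-deterministic number of rows in each chunk).
for chunk in wr.s3.read_parquet(
    path="s3://my-bucket/my-dataset/",
    dataset=True,
    chunked=True,
    categories=["country"],
):
    print(len(chunk.index))
```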
@@ -1670,6 +1673,7 @@ def read_parquet_table(
16701673    database: str,
16711674    filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
16721675    columns: Optional[List[str]] = None,
1676+    categories: List[str] = None,
16731677    chunked: bool = False,
16741678    use_threads: bool = True,
16751679    boto3_session: Optional[boto3.Session] = None,
@@ -1690,7 +1694,10 @@ def read_parquet_table(
16901694 filters: Union[List[Tuple], List[List[Tuple]]], optional
16911695 List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
16921696 columns : List[str], optional
1693- Names of columns to read from the file(s)
1697+ Names of columns to read from the file(s).
1698+ categories: List[str], optional
1699+        List of column names that should be returned as pandas.Categorical.
1700+        Recommended for memory-restricted environments.
16941701 chunked : bool
16951702        If True, will break the data into smaller DataFrames (non-deterministic number of lines).
16961703 Otherwise return a single DataFrame with the whole data.
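The same `categories` option is exposed on `wr.s3.read_parquet_table`, which resolves the dataset location from the Glue Catalog. A hedged example; the database and table names are placeholders:

```python
import awswrangler as wr

# Placeholder Glue database/table names, for illustration only.
df = wr.s3.read_parquet_table(
    table="my_table",
    database="my_database",
    categories=["country", "product"],
)
df.info(memory_usage="deep")
```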
@@ -1740,6 +1747,7 @@ def read_parquet_table(
17401747        path=path,
17411748        filters=filters,
17421749        columns=columns,
1750+        categories=categories,
17431751        chunked=chunked,
17441752        dataset=True,
17451753        use_threads=use_threads,