@@ -833,7 +833,7 @@ def _to_parquet_file(
     fs: s3fs.S3FileSystem,
     dtype: Dict[str, str],
 ) -> str:
-    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=False)
+    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True)
     for col_name, col_type in dtype.items():
         if col_name in table.column_names:
             col_index = table.column_names.index(col_name)
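The `safe=False` to `safe=True` flip above makes `pyarrow.Table.from_pandas` raise on lossy casts instead of truncating silently. A minimal standalone sketch of the difference (not awswrangler code; the DataFrame and schema are made up for illustration):

```python
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"x": [1.5, 2.7]})
schema = pa.schema([pa.field("x", pa.int64())])

# safe=True raises because casting 1.5 -> int64 would lose the fraction;
# the old safe=False behavior would have truncated to 1 and 2 silently.
try:
    pa.Table.from_pandas(df, schema=schema, preserve_index=False, safe=True)
except pa.ArrowInvalid as err:
    print(err)
```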
@@ -1190,6 +1190,7 @@ def _read_text_full(
 def _read_parquet_init(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
+    categories: List[str] = None,
     dataset: bool = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
@@ -1206,7 +1207,7 @@ def _read_parquet_init(
     fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs)
     cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
     data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
-        path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters
+        path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters, read_dictionary=categories
     )
     return data
 
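Forwarding `categories` as `read_dictionary` asks Arrow to keep those columns dictionary-encoded while reading, rather than materializing one string per row. A standalone sketch of the same pyarrow option, here via `pq.read_table` against a hypothetical local file ("events.parquet" and "status" are assumptions for illustration):

```python
import pyarrow.parquet as pq

# Dictionary-encoded reads keep low-cardinality string columns compact;
# to_pandas() can later surface them as pandas.Categorical.
table = pq.read_table("events.parquet", read_dictionary=["status"])
print(table.schema.field("status").type)  # dictionary<values=string, ...>
```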
@@ -1217,6 +1218,7 @@ def read_parquet(
     columns: Optional[List[str]] = None,
     chunked: bool = False,
     dataset: bool = False,
+    categories: List[str] = None,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
@@ -1243,6 +1245,9 @@ def read_parquet(
         Otherwise return a single DataFrame with the whole data.
     dataset : bool
         If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns.
+    categories : List[str], optional
+        List of column names that should be returned as pandas.Categorical.
+        Recommended for memory-restricted environments.
     use_threads : bool
         True to enable concurrent requests, False to disable multiple threads.
         If enabled os.cpu_count() will be used as the max number of threads.
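With the new argument in place, a call along these lines (bucket and column names are hypothetical) returns the listed columns as `pandas.Categorical`:

```python
import awswrangler as wr

df = wr.s3.read_parquet(
    path="s3://my-bucket/dataset/",
    dataset=True,
    categories=["region", "status"],  # hypothetical low-cardinality columns
)
print(df.dtypes)  # the listed columns show dtype "category"
```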
@@ -1292,66 +1297,59 @@ def read_parquet(
         path=path,
         filters=filters,
         dataset=dataset,
+        categories=categories,
         use_threads=use_threads,
         boto3_session=boto3_session,
         s3_additional_kwargs=s3_additional_kwargs,
     )
-    common_metadata = data.common_metadata
-    common_metadata = None if common_metadata is None else common_metadata.metadata.get(b"pandas", None)
     if chunked is False:
-        return _read_parquet(data=data, columns=columns, use_threads=use_threads, common_metadata=common_metadata)
-    return _read_parquet_chunked(data=data, columns=columns, use_threads=use_threads, common_metadata=common_metadata)
+        return _read_parquet(data=data, columns=columns, categories=categories, use_threads=use_threads)
+    return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads)
 
 
 def _read_parquet(
     data: pyarrow.parquet.ParquetDataset,
     columns: Optional[List[str]] = None,
+    categories: List[str] = None,
     use_threads: bool = True,
-    common_metadata: Any = None,
 ) -> pd.DataFrame:
-    # Data
     tables: List[pa.Table] = []
     for piece in data.pieces:
         table: pa.Table = piece.read(
-            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=True
+            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
         )
         tables.append(table)
     table = pa.lib.concat_tables(tables)
-
-    # Metadata
-    current_metadata = table.schema.metadata or {}
-    if common_metadata and b"pandas" not in current_metadata:  # pragma: no cover
-        table = table.replace_schema_metadata({b"pandas": common_metadata})
-
     return table.to_pandas(
         use_threads=use_threads,
         split_blocks=True,
         self_destruct=True,
         integer_object_nulls=False,
         date_as_object=True,
+        ignore_metadata=True,
+        categories=categories,
         types_mapper=_data_types.pyarrow2pandas_extension,
     )
 
 
 def _read_parquet_chunked(
     data: pyarrow.parquet.ParquetDataset,
     columns: Optional[List[str]] = None,
+    categories: List[str] = None,
     use_threads: bool = True,
-    common_metadata: Any = None,
 ) -> Iterator[pd.DataFrame]:
     for piece in data.pieces:
         table: pa.Table = piece.read(
-            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=True
+            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
        )
-        current_metadata = table.schema.metadata or {}
-        if common_metadata and b"pandas" not in current_metadata:  # pragma: no cover
-            table = table.replace_schema_metadata({b"pandas": common_metadata})
         yield table.to_pandas(
             use_threads=use_threads,
             split_blocks=True,
             self_destruct=True,
             integer_object_nulls=False,
             date_as_object=True,
+            ignore_metadata=True,
+            categories=categories,
             types_mapper=_data_types.pyarrow2pandas_extension,
         )
 
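The `to_pandas` changes are the second half of the feature: `ignore_metadata=True` skips the embedded pandas schema (which would otherwise steer the conversion) and `categories` forces the requested columns to Categorical. A minimal standalone illustration of those two pyarrow arguments:

```python
import pyarrow as pa

table = pa.table({"status": ["ok", "ok", "error", "ok"]})

# Mirrors the to_pandas() call above: `categories` yields pandas.Categorical,
# and ignore_metadata=True lets the arguments, not stored metadata, decide.
df = table.to_pandas(categories=["status"], ignore_metadata=True)
print(df["status"].dtype)  # category
```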
@@ -1670,6 +1668,7 @@ def read_parquet_table(
     database: str,
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     columns: Optional[List[str]] = None,
+    categories: List[str] = None,
     chunked: bool = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
@@ -1690,7 +1689,10 @@ def read_parquet_table(
     filters : Union[List[Tuple], List[List[Tuple]]], optional
         List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
     columns : List[str], optional
-        Names of columns to read from the file(s)
+        Names of columns to read from the file(s).
+    categories : List[str], optional
+        List of column names that should be returned as pandas.Categorical.
+        Recommended for memory-restricted environments.
     chunked : bool
         If True will break the data in smaller DataFrames (Non deterministic number of lines).
         Otherwise return a single DataFrame with the whole data.
@@ -1740,6 +1742,7 @@ def read_parquet_table(
         path=path,
         filters=filters,
         columns=columns,
+        categories=categories,
         chunked=chunked,
         dataset=True,
         use_threads=use_threads,
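Since `read_parquet_table` simply forwards `categories` to `read_parquet` as shown above, the Glue-catalog path gets the same behavior (database and table names below are hypothetical):

```python
import awswrangler as wr

# Resolves the table's S3 location from the Glue Catalog, then reads it
# with the listed columns returned as pandas.Categorical.
df = wr.s3.read_parquet_table(
    table="my_table",
    database="my_database",
    categories=["country"],  # hypothetical column
)
```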