
Commit 541d5fb

Merge branch 'master' of github.com:awslabs/aws-data-wrangler
2 parents: 65f865a + cf0690b

8 files changed (+442 / -83 lines)


awswrangler/athena.py

Lines changed: 16 additions & 7 deletions
@@ -259,7 +259,7 @@ def _extract_ctas_manifest_paths(path: str, boto3_session: Optional[boto3.Sessio
 
 
 def _get_query_metadata(
-    query_execution_id: str, boto3_session: Optional[boto3.Session] = None
+    query_execution_id: str, categories: List[str] = None, boto3_session: Optional[boto3.Session] = None
 ) -> Tuple[Dict[str, str], List[str], List[str], Dict[str, Any], List[str]]:
     """Get query metadata."""
     cols_types: Dict[str, str] = get_query_columns_types(
@@ -285,7 +285,9 @@ def _get_query_metadata(
                 "Please use ctas_approach=True for Struct columns."
             )
         pandas_type: str = _data_types.athena2pandas(dtype=col_type)
-        if pandas_type in ["datetime64", "date"]:
+        if (categories is not None) and (col_name in categories):
+            dtype[col_name] = "category"
+        elif pandas_type in ["datetime64", "date"]:
             parse_timestamps.append(col_name)
             if pandas_type == "date":
                 parse_dates.append(col_name)
@@ -326,6 +328,7 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals
     sql: str,
     database: str,
     ctas_approach: bool = True,
+    categories: List[str] = None,
     chunksize: Optional[int] = None,
     s3_output: Optional[str] = None,
     workgroup: Optional[str] = None,
@@ -377,6 +380,9 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals
     ctas_approach: bool
         Wraps the query using a CTAS, and read the resulted parquet data on S3.
         If false, read the regular CSV on S3.
+    categories: List[str], optional
+        List of columns names that should be returned as pandas.Categorical.
+        Recommended for memory restricted environments.
     chunksize: int, optional
         If specified, return an generator where chunksize is the number of rows to include in each chunk.
     s3_output : str, optional
@@ -457,10 +463,12 @@ def read_sql_query( # pylint: disable=too-many-branches,too-many-locals
                 dfs = _utils.empty_generator()
         else:
             s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
-            dfs = s3.read_parquet(path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked)
+            dfs = s3.read_parquet(
+                path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
+            )
         return dfs
     dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
-        query_execution_id=query_id, boto3_session=session
+        query_execution_id=query_id, categories=categories, boto3_session=session
     )
     path = f"{_s3_output}{query_id}.csv"
     s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
@@ -539,12 +547,13 @@ def get_work_group(workgroup: str, boto3_session: Optional[boto3.Session] = None
 def _ensure_workgroup(
     session: boto3.Session, workgroup: Optional[str] = None
 ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
-    if workgroup:
+    if workgroup is not None:
        res: Dict[str, Any] = get_work_group(workgroup=workgroup, boto3_session=session)
        config: Dict[str, Any] = res["WorkGroup"]["Configuration"]["ResultConfiguration"]
        wg_s3_output: Optional[str] = config.get("OutputLocation")
-        wg_encryption: Optional[str] = config["EncryptionConfiguration"].get("EncryptionOption")
-        wg_kms_key: Optional[str] = config["EncryptionConfiguration"].get("KmsKey")
+        encrypt_config: Optional[Dict[str, str]] = config.get("EncryptionConfiguration")
+        wg_encryption: Optional[str] = None if encrypt_config is None else encrypt_config.get("EncryptionOption")
+        wg_kms_key: Optional[str] = None if encrypt_config is None else encrypt_config.get("KmsKey")
     else:
         wg_s3_output, wg_encryption, wg_kms_key = None, None, None
     return wg_s3_output, wg_encryption, wg_kms_key
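
Taken together, these hunks thread the new `categories` argument from `read_sql_query` down to both result readers: with `ctas_approach=True` the list is forwarded to `s3.read_parquet`, and with `ctas_approach=False` the matching columns are mapped to the pandas "category" dtype in `_get_query_metadata` before the CSV is parsed. A minimal usage sketch (bucket, database, and column names below are hypothetical):

import awswrangler as wr

# Hypothetical table and columns; low-cardinality string columns benefit most
# from being returned as pandas.Categorical.
df = wr.athena.read_sql_query(
    sql="SELECT order_id, status, country FROM orders",
    database="my_database",
    ctas_approach=True,                # read the CTAS Parquet result from S3
    categories=["status", "country"],  # returned with dtype "category"
)
print(df.dtypes)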

awswrangler/catalog.py

Lines changed: 18 additions & 13 deletions
@@ -730,26 +730,21 @@ def table(
 
 
 def _sanitize_name(name: str) -> str:
-    name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn")
-    name = name.replace("{", "_")
-    name = name.replace("}", "_")
-    name = name.replace("]", "_")
-    name = name.replace("[", "_")
-    name = name.replace(")", "_")
-    name = name.replace("(", "_")
-    name = name.replace(" ", "_")
-    name = name.replace("-", "_")
-    name = name.replace(".", "_")
-    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
-    name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name)
-    return name.lower()
+    name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn")  # strip accents
+    name = re.sub("[^A-Za-z0-9_]+", "_", name)  # Removing non alphanumeric characters
+    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()  # Converting CamelCase to snake_case
 
 
 def sanitize_column_name(column: str) -> str:
     """Convert the column name to be compatible with Amazon Athena.
 
     https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
 
+    Possible transformations:
+    - Strip accents
+    - Remove non alphanumeric characters
+    - Convert CamelCase to snake_case
+
     Parameters
     ----------
     column : str
@@ -775,6 +770,11 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame:
 
     https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
 
+    Possible transformations:
+    - Strip accents
+    - Remove non alphanumeric characters
+    - Convert CamelCase to snake_case
+
     Parameters
     ----------
     df : pandas.DataFrame
@@ -800,6 +800,11 @@ def sanitize_table_name(table: str) -> str:
 
     https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
 
+    Possible transformations:
+    - Strip accents
+    - Remove non alphanumeric characters
+    - Convert CamelCase to snake_case
+
     Parameters
     ----------
     table : str
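
For reference, the consolidated `_sanitize_name` can be exercised in isolation; the sample inputs and expected outputs below are illustrative, not taken from the repository's test suite:

import re
import unicodedata


def _sanitize_name(name: str) -> str:  # body copied from the diff above
    name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn")  # strip accents
    name = re.sub("[^A-Za-z0-9_]+", "_", name)  # Removing non alphanumeric characters
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()  # Converting CamelCase to snake_case


print(_sanitize_name("CamelCaseColumn"))  # camel_case_column
print(_sanitize_name("preço (R$)"))       # preco_r_

Note that the single character-class substitution also covers characters the old replace chain missed (for example "$" or "/"), which is the main behavioral difference besides brevity.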

awswrangler/db.py

Lines changed: 8 additions & 0 deletions
@@ -887,6 +887,7 @@ def unload_redshift(
     path: str,
     con: sqlalchemy.engine.Engine,
     iam_role: str,
+    categories: List[str] = None,
     chunked: bool = False,
     keep_files: bool = False,
     use_threads: bool = True,
@@ -920,6 +921,9 @@ def unload_redshift(
         wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
     iam_role : str
         AWS IAM role with the related permissions.
+    categories: List[str], optional
+        List of columns names that should be returned as pandas.Categorical.
+        Recommended for memory restricted environments.
     keep_files : bool
         Should keep the stage files?
     chunked : bool
@@ -960,6 +964,7 @@ def unload_redshift(
             return pd.DataFrame()
         df: pd.DataFrame = s3.read_parquet(
             path=paths,
+            categories=categories,
             chunked=chunked,
             dataset=False,
             use_threads=use_threads,
@@ -973,6 +978,7 @@ def unload_redshift(
         return _utils.empty_generator()
     return _read_parquet_iterator(
         paths=paths,
+        categories=categories,
         use_threads=use_threads,
         boto3_session=session,
         s3_additional_kwargs=s3_additional_kwargs,
@@ -984,11 +990,13 @@ def _read_parquet_iterator(
     paths: List[str],
     keep_files: bool,
     use_threads: bool,
+    categories: List[str] = None,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
 ) -> Iterator[pd.DataFrame]:
     dfs: Iterator[pd.DataFrame] = s3.read_parquet(
         path=paths,
+        categories=categories,
         chunked=True,
         dataset=False,
         use_threads=use_threads,
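
A usage sketch for the new `unload_redshift` argument; the Glue connection name, staging path, and IAM role ARN are placeholders:

import awswrangler as wr

# Placeholders: adjust the connection name, staging path, and role ARN.
engine = wr.catalog.get_engine(connection="my-redshift-connection")
df = wr.db.unload_redshift(
    sql="SELECT user_id, plan, country FROM public.users",
    path="s3://my-bucket/stage/",
    con=engine,
    iam_role="arn:aws:iam::111111111111:role/my-redshift-role",
    categories=["plan", "country"],  # returned as pandas.Categorical
    keep_files=False,
)

Because UNLOAD writes Parquet to the staging path, the `categories` list is simply forwarded to `s3.read_parquet` (or to `_read_parquet_iterator` when `chunked=True`), as shown in the hunks above.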

awswrangler/s3.py

Lines changed: 29 additions & 21 deletions
@@ -530,6 +530,11 @@ def to_parquet( # pylint: disable=too-many-arguments
     The concept of Dataset goes beyond the simple idea of files and enable more
     complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
 
+    Note
+    ----
+    The table name and all column names will be automatically sanitize using
+    `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
+
     Note
     ----
     In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count().
@@ -833,7 +838,7 @@ def _to_parquet_file(
     fs: s3fs.S3FileSystem,
     dtype: Dict[str, str],
 ) -> str:
-    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=False)
+    table: pa.Table = pyarrow.Table.from_pandas(df=df, schema=schema, nthreads=cpus, preserve_index=index, safe=True)
     for col_name, col_type in dtype.items():
         if col_name in table.column_names:
             col_index = table.column_names.index(col_name)
@@ -1190,6 +1195,7 @@ def _read_text_full(
 def _read_parquet_init(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
+    categories: List[str] = None,
     dataset: bool = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
@@ -1206,7 +1212,7 @@ def _read_parquet_init(
     fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs)
     cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
     data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
-        path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters
+        path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters, read_dictionary=categories
     )
     return data
 
@@ -1217,6 +1223,7 @@ def read_parquet(
     columns: Optional[List[str]] = None,
     chunked: bool = False,
     dataset: bool = False,
+    categories: List[str] = None,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
@@ -1243,6 +1250,9 @@ def read_parquet(
         Otherwise return a single DataFrame with the whole data.
     dataset: bool
         If True read a parquet dataset instead of simple file(s) loading all the related partitions as columns.
+    categories: List[str], optional
+        List of columns names that should be returned as pandas.Categorical.
+        Recommended for memory restricted environments.
     use_threads : bool
         True to enable concurrent requests, False to disable multiple threads.
         If enabled os.cpu_count() will be used as the max number of threads.
@@ -1292,66 +1302,59 @@ def read_parquet(
         path=path,
         filters=filters,
         dataset=dataset,
+        categories=categories,
         use_threads=use_threads,
         boto3_session=boto3_session,
         s3_additional_kwargs=s3_additional_kwargs,
     )
-    common_metadata = data.common_metadata
-    common_metadata = None if common_metadata is None else common_metadata.metadata.get(b"pandas", None)
     if chunked is False:
-        return _read_parquet(data=data, columns=columns, use_threads=use_threads, common_metadata=common_metadata)
-    return _read_parquet_chunked(data=data, columns=columns, use_threads=use_threads, common_metadata=common_metadata)
+        return _read_parquet(data=data, columns=columns, categories=categories, use_threads=use_threads)
+    return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads)
 
 
 def _read_parquet(
     data: pyarrow.parquet.ParquetDataset,
     columns: Optional[List[str]] = None,
+    categories: List[str] = None,
     use_threads: bool = True,
-    common_metadata: Any = None,
 ) -> pd.DataFrame:
-    # Data
     tables: List[pa.Table] = []
     for piece in data.pieces:
         table: pa.Table = piece.read(
-            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=True
+            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
         )
         tables.append(table)
     table = pa.lib.concat_tables(tables)
-
-    # Metadata
-    current_metadata = table.schema.metadata or {}
-    if common_metadata and b"pandas" not in current_metadata:  # pragma: no cover
-        table = table.replace_schema_metadata({b"pandas": common_metadata})
-
     return table.to_pandas(
         use_threads=use_threads,
         split_blocks=True,
         self_destruct=True,
         integer_object_nulls=False,
         date_as_object=True,
+        ignore_metadata=True,
+        categories=categories,
         types_mapper=_data_types.pyarrow2pandas_extension,
     )
 
 
 def _read_parquet_chunked(
     data: pyarrow.parquet.ParquetDataset,
     columns: Optional[List[str]] = None,
+    categories: List[str] = None,
     use_threads: bool = True,
-    common_metadata: Any = None,
 ) -> Iterator[pd.DataFrame]:
     for piece in data.pieces:
         table: pa.Table = piece.read(
-            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=True
+            columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
         )
-        current_metadata = table.schema.metadata or {}
-        if common_metadata and b"pandas" not in current_metadata:  # pragma: no cover
-            table = table.replace_schema_metadata({b"pandas": common_metadata})
        yield table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
+            ignore_metadata=True,
+            categories=categories,
            types_mapper=_data_types.pyarrow2pandas_extension,
        )
 
@@ -1670,6 +1673,7 @@ def read_parquet_table(
     database: str,
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     columns: Optional[List[str]] = None,
+    categories: List[str] = None,
     chunked: bool = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
@@ -1690,7 +1694,10 @@ def read_parquet_table(
     filters: Union[List[Tuple], List[List[Tuple]]], optional
         List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
     columns : List[str], optional
-        Names of columns to read from the file(s)
+        Names of columns to read from the file(s).
+    categories: List[str], optional
+        List of columns names that should be returned as pandas.Categorical.
+        Recommended for memory restricted environments.
     chunked : bool
         If True will break the data in smaller DataFrames (Non deterministic number of lines).
         Otherwise return a single DataFrame with the whole data.
@@ -1740,6 +1747,7 @@ def read_parquet_table(
         path=path,
         filters=filters,
         columns=columns,
+        categories=categories,
         chunked=chunked,
         dataset=True,
         use_threads=use_threads,
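
The `categories` plumbing in this file does the actual work: `_read_parquet_init` passes the list to `pyarrow.parquet.ParquetDataset(read_dictionary=...)` so those columns are read as dictionary-encoded arrays, and `_read_parquet` / `_read_parquet_chunked` pass it to `Table.to_pandas(categories=...)` so they come back as `pandas.Categorical`. The diff also switches to `use_pandas_metadata=False` and `ignore_metadata=True`, so the pandas schema stored in the files no longer influences the resulting dtypes. A short sketch of the public entry point (path and column names are hypothetical):

import awswrangler as wr

# Hypothetical partitioned dataset written by wr.s3.to_parquet(..., dataset=True).
df = wr.s3.read_parquet(
    path="s3://my-bucket/dataset/",
    dataset=True,
    categories=["country", "status"],  # kept dictionary-encoded end to end
)
print(df["country"].dtype)  # category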

testing/test_awswrangler/_utils.py

Lines changed: 32 additions & 0 deletions
@@ -94,6 +94,25 @@ def get_df_cast():
     return df
 
 
+def get_df_category():
+    df = pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "string_object": ["foo", None, "boo"],
+            "string": ["foo", None, "boo"],
+            "binary": [b"1", None, b"2"],
+            "float": [1.0, None, 2.0],
+            "int": [1, None, 2],
+            "par0": [1, 1, 2],
+            "par1": ["a", "b", "b"],
+        }
+    )
+    df["string"] = df["string"].astype("string")
+    df["int"] = df["int"].astype("Int64")
+    df["par1"] = df["par1"].astype("string")
+    return df
+
+
 def get_query_long():
     return """
 SELECT
@@ -324,3 +343,16 @@ def ensure_data_types(df, has_list=False):
     if has_list is True:
         assert str(type(row["list"][0]).__name__) == "int64"
         assert str(type(row["list_list"][0][0]).__name__) == "int64"
+
+
+def ensure_data_types_category(df):
+    assert len(df.columns) in (7, 8)
+    assert str(df["id"].dtype) in ("category", "Int64")
+    assert str(df["string_object"].dtype) == "category"
+    assert str(df["string"].dtype) == "category"
+    if "binary" in df.columns:
+        assert str(df["binary"].dtype) == "category"
+    assert str(df["float"].dtype) == "category"
+    assert str(df["int"].dtype) in ("category", "Int64")
+    assert str(df["par0"].dtype) in ("category", "Int64")
+    assert str(df["par1"].dtype) == "category"
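
A sketch of how these two helpers could be combined in a test; the bucket fixture, S3 path, relative import, and exact call arguments are assumptions about the surrounding test module, and the tests added alongside this commit may differ:

import awswrangler as wr

from ._utils import ensure_data_types_category, get_df_category  # helpers added above


def test_parquet_category_roundtrip(bucket):  # "bucket" assumed to be a pytest fixture
    path = f"s3://{bucket}/test_parquet_category_roundtrip/"
    paths = wr.s3.to_parquet(df=get_df_category(), path=path, dataset=True, mode="overwrite")["paths"]
    wr.s3.wait_objects_exist(paths=paths)
    cols = ["id", "string_object", "string", "binary", "float", "int", "par0", "par1"]
    df = wr.s3.read_parquet(path=path, dataset=True, categories=cols)
    ensure_data_types_category(df)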
