Commit 3a6c537

Allow sanitize_columns=False when dataset=True and database=None. #380
1 parent f324483 commit 3a6c537

4 files changed (+52, -21 lines)
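This change lets dataset writes skip column-name sanitization when the dataset is not being registered in the Glue Catalog. A minimal usage sketch of the new behaviour (bucket/prefix and DataFrame contents are hypothetical):

import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"camelCase": [1, 2], "Par": ["a", "b"]})

# dataset=True without database/table: column names such as "camelCase" are now kept as-is.
wr.s3.to_parquet(
    df,
    path="s3://my-bucket/my-prefix/",
    dataset=True,
    sanitize_columns=False,
)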

awswrangler/s3/_write.py

Lines changed: 3 additions & 3 deletions
@@ -52,7 +52,7 @@ def _validate_args(
     if dataset is False:
         if path.endswith("/"):
             raise exceptions.InvalidArgumentValue(
-                "If <dataset=False>, the argument <path> should be a object path, not a directory."
+                "If <dataset=False>, the argument <path> should be a file path, not a directory."
             )
         if partition_cols:
             raise exceptions.InvalidArgumentCombination("Please, pass dataset=True to be able to use partition_cols.")
@@ -66,8 +66,8 @@ def _validate_args(
             )
     elif (database is None) != (table is None):
         raise exceptions.InvalidArgumentCombination(
-            "Arguments database and table must be passed together. If you want to store your dataset in the Glue "
-            "Catalog, please ensure you are passing both."
+            "Arguments database and table must be passed together. If you want to store your dataset metadata in "
+            "the Glue Catalog, please ensure you are passing both."
         )
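The validation above still requires database and table to be supplied as a pair when cataloging; a short sketch of that failure mode, assuming the standard awswrangler exceptions module (path and database name are hypothetical):

import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"c0": [0, 1]})

try:
    # database without table (or vice versa) is rejected by _validate_args.
    wr.s3.to_parquet(df, path="s3://my-bucket/prefix/", dataset=True, database="my_db")
except wr.exceptions.InvalidArgumentCombination as ex:
    print(ex)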
awswrangler/s3/_write_parquet.py

Lines changed: 13 additions & 10 deletions
@@ -225,14 +225,14 @@ def to_parquet(  # pylint: disable=too-many-arguments,too-many-locals
 ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
     """Write Parquet file or dataset on Amazon S3.
 
-    The concept of Dataset goes beyond the simple idea of files and enable more
-    complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
+    The concept of Dataset goes beyond the simple idea of ordinary files and enable more
+    complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog).
 
     Note
     ----
-    If `dataset=True` The table name and all column names will be automatically sanitized using
-    `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
-    Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`.
+    If `database` and `table` arguments are passed, the table name and all column names
+    will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
+    Please, pass `sanitize_columns=True` to enforce this behaviour always.
 
     Note
     ----
@@ -267,12 +267,15 @@ def to_parquet(  # pylint: disable=too-many-arguments,too-many-locals
         "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
         e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMY_KEY_ARN'}
     sanitize_columns : bool
-        True to sanitize columns names or False to keep it as is.
-        True value is forced if `dataset=True`.
+        True to sanitize columns names (using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`)
+        or False to keep it as is.
+        True value behaviour is enforced if `database` and `table` arguments are passed.
     dataset : bool
-        If True store a parquet dataset instead of a single file.
+        If True store a parquet dataset instead of a ordinary file(s)
         If True, enable all follow arguments:
-        partition_cols, mode, database, table, description, parameters, columns_comments, .
+        partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning,
+        catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values,
+        projection_intervals, projection_digits, catalog_id, schema_evolution.
     partition_cols: List[str], optional
         List of column names that will be used to create partitions. Only takes effect if dataset=True.
     concurrent_partitioning: bool
@@ -470,7 +473,7 @@ def to_parquet(  # pylint: disable=too-many-arguments,too-many-locals
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
 
     # Sanitize table to respect Athena's standards
-    if (sanitize_columns is True) or (dataset is True):
+    if (sanitize_columns is True) or (database is not None and table is not None):
         df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols)
 
     # Evaluating dtype
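The sanitization named in the docstring is what the new test below exercises end to end; a small sketch of the helpers it references, with expected outputs taken from the assertions in tests/test_s3_parquet.py:

import awswrangler as wr

# Per the test expectations: "C0" -> "c0", "camelCase" -> "camel_case", "c**--2" -> "c_2"
print(wr.catalog.sanitize_column_name("camelCase"))
print(wr.catalog.sanitize_column_name("c**--2"))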

awswrangler/s3/_write_text.py

Lines changed: 10 additions & 8 deletions
@@ -88,14 +88,14 @@ def to_csv(  # pylint: disable=too-many-arguments,too-many-locals
 ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
     """Write CSV file or dataset on Amazon S3.
 
-    The concept of Dataset goes beyond the simple idea of files and enable more
-    complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog).
+    The concept of Dataset goes beyond the simple idea of ordinary files and enable more
+    complex features like partitioning and catalog integration (Amazon Athena/AWS Glue Catalog).
 
     Note
     ----
-    If `dataset=True` The table name and all column names will be automatically sanitized using
-    `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
-    Please, pass `sanitize_columns=True` to force the same behaviour for `dataset=False`.
+    If `database` and `table` arguments are passed, the table name and all column names
+    will be automatically sanitized using `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`.
+    Please, pass `sanitize_columns=True` to enforce this behaviour always.
 
     Note
     ----
@@ -142,9 +142,11 @@ def to_csv(  # pylint: disable=too-many-arguments,too-many-locals
         True to sanitize columns names or False to keep it as is.
         True value is forced if `dataset=True`.
     dataset : bool
-        If True store a parquet dataset instead of a single file.
+        If True store a parquet dataset instead of a ordinary file(s)
         If True, enable all follow arguments:
-        partition_cols, mode, database, table, description, parameters, columns_comments, .
+        partition_cols, mode, database, table, description, parameters, columns_comments, concurrent_partitioning,
+        catalog_versioning, projection_enabled, projection_types, projection_ranges, projection_values,
+        projection_intervals, projection_digits, catalog_id, schema_evolution.
     partition_cols: List[str], optional
         List of column names that will be used to create partitions. Only takes effect if dataset=True.
     concurrent_partitioning: bool
@@ -358,7 +360,7 @@ def to_csv(  # pylint: disable=too-many-arguments,too-many-locals
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
 
     # Sanitize table to respect Athena's standards
-    if (sanitize_columns is True) or (dataset is True):
+    if (sanitize_columns is True) or (database is not None and table is not None):
         df, dtype, partition_cols = _sanitize(df=df, dtype=dtype, partition_cols=partition_cols)
 
     # Evaluating dtype
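Both to_parquet and to_csv now gate sanitization on the same condition. A standalone restatement of the changed if-statement (illustration only, not the library's code):

from typing import Optional

def should_sanitize(sanitize_columns: bool, database: Optional[str], table: Optional[str]) -> bool:
    # Sanitize when explicitly requested, or when the dataset is also being
    # registered in the Glue Catalog (both database and table supplied).
    return sanitize_columns or (database is not None and table is not None)

# dataset=True alone no longer forces sanitization:
assert should_sanitize(False, None, None) is False
assert should_sanitize(False, "my_db", "my_table") is True
assert should_sanitize(True, None, None) is True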

tests/test_s3_parquet.py

Lines changed: 26 additions & 0 deletions
@@ -323,3 +323,29 @@ def test_multi_index_recovery_nameless(path, use_threads):
     wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads)
     df2 = wr.s3.read_parquet(f"{path}*.parquet", use_threads=use_threads)
     assert df.reset_index().equals(df2.reset_index())
+
+
+def test_to_parquet_dataset_sanitize(path):
+    df = pd.DataFrame({"C0": [0, 1], "camelCase": [2, 3], "c**--2": [4, 5], "Par": ["a", "b"]})
+
+    paths = wr.s3.to_parquet(df, path, dataset=True, partition_cols=["Par"], sanitize_columns=False)["paths"]
+    wr.s3.wait_objects_exist(paths)
+    df2 = wr.s3.read_parquet(path, dataset=True)
+    assert df.shape == df2.shape
+    assert list(df2.columns) == ["C0", "camelCase", "c**--2", "Par"]
+    assert df2.C0.sum() == 1
+    assert df2.camelCase.sum() == 5
+    assert df2["c**--2"].sum() == 9
+    assert df2.Par.to_list() == ["a", "b"]
+
+    paths = wr.s3.to_parquet(df, path, dataset=True, partition_cols=["par"], sanitize_columns=True, mode="overwrite")[
+        "paths"
+    ]
+    wr.s3.wait_objects_exist(paths)
+    df2 = wr.s3.read_parquet(path, dataset=True)
+    assert df.shape == df2.shape
+    assert list(df2.columns) == ["c0", "camel_case", "c_2", "par"]
+    assert df2.c0.sum() == 1
+    assert df2.camel_case.sum() == 5
+    assert df2.c_2.sum() == 9
+    assert df2.par.to_list() == ["a", "b"]
