Add skip_header_line_count arg to create_csv_table(). #338

igorborgest · igorborgest · commit 69c73788ab3b · 2020-07-30T10:16:57.000-03:00
diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py
@@ -1087,6 +1087,7 @@ def create_csv_table(
     mode: str = "overwrite",
     catalog_versioning: bool = False,
     sep: str = ",",
+    skip_header_line_count: Optional[int] = None,
     boto3_session: Optional[boto3.Session] = None,
     projection_enabled: bool = False,
     projection_types: Optional[Dict[str, str]] = None,
@@ -1125,6 +1126,8 @@ def create_csv_table(
         If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it.
     sep : str
         String of length 1. Field delimiter for the output file.
+    skip_header_line_count : Optional[int]
+        Number of lines to skip at the top of each file (i.e. the header rows).
     projection_enabled : bool
         Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html)
     projection_types : Optional[Dict[str, str]]
@@ -1181,6 +1184,7 @@ def create_csv_table(
         partitions_types=partitions_types,
         compression=compression,
         sep=sep,
+        skip_header_line_count=skip_header_line_count,
     )
     session: boto3.Session = _utils.ensure_session(session=boto3_session)
     _create_table(
@@ -1338,20 +1342,24 @@ def _csv_table_definition(
     partitions_types: Dict[str, str],
     compression: Optional[str],
     sep: str,
+    skip_header_line_count: Optional[int]
 ) -> Dict[str, Any]:
     compressed: bool = compression is not None
+    parameters: Dict[str, str] = {
+        "classification": "csv",
+        "compressionType": str(compression).lower(),
+        "typeOfData": "file",
+        "delimiter": sep,
+        "columnsOrdered": "true",
+        "areColumnsQuoted": "false",
+    }
+    if skip_header_line_count is not None:
+        parameters["skip.header.line.count"] = str(skip_header_line_count)  # Glue parameter values are strings
     return {
         "Name": table,
         "PartitionKeys": [{"Name": cname, "Type": dtype} for cname, dtype in partitions_types.items()],
         "TableType": "EXTERNAL_TABLE",
-        "Parameters": {
-            "classification": "csv",
-            "compressionType": str(compression).lower(),
-            "typeOfData": "file",
-            "delimiter": sep,
-            "columnsOrdered": "true",
-            "areColumnsQuoted": "false",
-        },
+        "Parameters": parameters,
         "StorageDescriptor": {
             "Columns": [{"Name": cname, "Type": dtype} for cname, dtype in columns_types.items()],
             "Location": path,
diff --git a/setup.py b/setup.py
@@ -23,7 +23,7 @@
     packages=["awswrangler", "awswrangler.s3", "awswrangler.quicksight", "awswrangler.athena"],
     include_package_data=True,
     python_requires=">=3.6, <3.9",
-    install_requires=[open("requirements.txt").read().strip().split("\n")],
+    install_requires=open("requirements.txt").read().strip().split("\n"),
     classifiers=[
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
diff --git a/tests/test_athena_csv.py b/tests/test_athena_csv.py
@@ -319,3 +319,29 @@ def test_athena_csv_types(path, glue_database, glue_table):
     ensure_data_types_csv(df2)
     wr.s3.delete_objects(path=paths)
     assert wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table) is True
+
+
+@pytest.mark.parametrize("use_threads", [True, False])
+@pytest.mark.parametrize("ctas_approach", [True, False])
+def test_skip_header(path, glue_database, glue_table, use_threads, ctas_approach):
+    df = pd.DataFrame({"c0": [1, 2], "c1": [3.3, 4.4], "c2": ["foo", "boo"]})
+    df["c0"] = df["c0"].astype("Int64")
+    df["c2"] = df["c2"].astype("string")
+    paths = wr.s3.to_csv(
+        df=df,
+        path=f"{path}0.csv",
+        sep=",",
+        index=False,
+        header=True,
+        use_threads=use_threads
+    )["paths"]
+    wr.s3.wait_objects_exist(paths=paths, use_threads=use_threads)
+    wr.catalog.create_csv_table(
+        database=glue_database,
+        table=glue_table,
+        path=path,
+        columns_types={"c0": "bigint", "c1": "double", "c2": "string"},
+        skip_header_line_count=1
+    )
+    df2 = wr.athena.read_sql_table(glue_table, glue_database, use_threads=use_threads, ctas_approach=ctas_approach)
+    assert df.equals(df2)