Fix s3.read_parquet for timezone-aware columns. #382 #383

igorborgest · igorborgest · commit 5499619eaf9a · 2020-09-07T13:19:27.000-03:00
diff --git a/awswrangler/s3/_read_parquet.py b/awswrangler/s3/_read_parquet.py
@@ -191,9 +191,10 @@ def _apply_index(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
 
 def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
     for c in metadata["columns"]:
-        if c["pandas_type"] == "datetimetz":
+        if c["field_name"] in df and c["pandas_type"] == "datetimetz":
             _logger.debug("applying timezone (%s) on column %s", c["metadata"]["timezone"], c["field_name"])
-            df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
+            if isinstance(df[c["field_name"]].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype) is False:
+                df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
             df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=c["metadata"]["timezone"])
     return df
 
diff --git a/pytest.ini b/pytest.ini
@@ -3,6 +3,6 @@ log_cli=False
 filterwarnings =
     ignore::DeprecationWarning
 addopts =
-    --log-cli-format "[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s][%(thread)d] %(message)s"
+    --log-cli-format "[%(asctime)s][%(name)s][%(funcName)s][%(thread)d] %(message)s"
     --verbose
     --capture=sys
diff --git a/tests/test_s3_parquet.py b/tests/test_s3_parquet.py
@@ -349,3 +349,25 @@ def test_to_parquet_dataset_sanitize(path):
     assert df2.camel_case.sum() == 5
     assert df2.c_2.sum() == 9
     assert df2.par.to_list() == ["a", "b"]
+
+
+@pytest.mark.parametrize("use_threads", [False, True])
+def test_timezone_file(path, use_threads):
+    file_path = f"{path}0.parquet"
+    df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()]})
+    df["c0"] = pd.DatetimeIndex(df.c0).tz_localize(tz="US/Eastern")
+    df.to_parquet(file_path)
+    wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
+    df2 = wr.s3.read_parquet(path, use_threads=use_threads)
+    assert df.equals(df2)
+
+
+@pytest.mark.parametrize("use_threads", [False])
+def test_timezone_file_columns(path, use_threads):
+    file_path = f"{path}0.parquet"
+    df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()], "c1": [1.1, 2.2]})
+    df["c0"] = pd.DatetimeIndex(df.c0).tz_localize(tz="US/Eastern")
+    df.to_parquet(file_path)
+    wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
+    df2 = wr.s3.read_parquet(path, columns=["c1"], use_threads=use_threads)
+    assert df[["c1"]].equals(df2)