Skip to content

Commit 5499619

Browse files
committed
Fix s3.read_parquet for timezone aware columns. #382 #383
1 parent 58daf40 commit 5499619

File tree

3 files changed

+26
-3
lines changed

3 files changed

+26
-3
lines changed

awswrangler/s3/_read_parquet.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -191,9 +191,10 @@ def _apply_index(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
191191

192192
def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
193193
for c in metadata["columns"]:
194-
if c["pandas_type"] == "datetimetz":
194+
if c["field_name"] in df and c["pandas_type"] == "datetimetz":
195195
_logger.debug("applying timezone (%s) on column %s", c["metadata"]["timezone"], c["field_name"])
196-
df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
196+
if isinstance(df[c["field_name"]].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype) is False:
197+
df[c["field_name"]] = df[c["field_name"]].dt.tz_localize(tz="UTC")
197198
df[c["field_name"]] = df[c["field_name"]].dt.tz_convert(tz=c["metadata"]["timezone"])
198199
return df
199200

pytest.ini

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,6 @@ log_cli=False
33
filterwarnings =
44
ignore::DeprecationWarning
55
addopts =
6-
--log-cli-format "[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s][%(thread)d] %(message)s"
6+
--log-cli-format "[%(asctime)s][%(name)s][%(funcName)s][%(thread)d] %(message)s"
77
--verbose
88
--capture=sys

tests/test_s3_parquet.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -349,3 +349,25 @@ def test_to_parquet_dataset_sanitize(path):
349349
assert df2.camel_case.sum() == 5
350350
assert df2.c_2.sum() == 9
351351
assert df2.par.to_list() == ["a", "b"]
352+
353+
354+
@pytest.mark.parametrize("use_threads", [False, True])
355+
def test_timezone_file(path, use_threads):
356+
file_path = f"{path}0.parquet"
357+
df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()]})
358+
df["c0"] = pd.DatetimeIndex(df.c0).tz_localize(tz="US/Eastern")
359+
df.to_parquet(file_path)
360+
wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
361+
df2 = wr.s3.read_parquet(path, use_threads=use_threads)
362+
assert df.equals(df2)
363+
364+
365+
@pytest.mark.parametrize("use_threads", [False])
366+
def test_timezone_file_columns(path, use_threads):
367+
file_path = f"{path}0.parquet"
368+
df = pd.DataFrame({"c0": [datetime.utcnow(), datetime.utcnow()], "c1": [1.1, 2.2]})
369+
df["c0"] = pd.DatetimeIndex(df.c0).tz_localize(tz="US/Eastern")
370+
df.to_parquet(file_path)
371+
wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads)
372+
df2 = wr.s3.read_parquet(path, columns=["c1"], use_threads=use_threads)
373+
assert df[["c1"]].equals(df2)

0 commit comments

Comments
 (0)