Skip to content

Commit 8b2b64a

Browse files
committed
fix
1 parent ce6b5a6 commit 8b2b64a

File tree

4 files changed

+220
-0
lines changed

4 files changed

+220
-0
lines changed

dev/sparktestsupport/modules.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,8 @@ def __hash__(self):
489489
"pyspark.tests.test_util",
490490
"pyspark.tests.test_worker",
491491
"pyspark.tests.test_stage_sched",
492+
# unittests for upstream projects
493+
"pyspark.tests.upstream.pyarrow.test_pyarrow_ignore_timezone",
492494
],
493495
)
494496

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
import os
19+
import datetime
20+
from zoneinfo import ZoneInfo
21+
import unittest
22+
23+
from pyspark.testing.utils import (
24+
have_pandas,
25+
have_pyarrow,
26+
pandas_requirement_message,
27+
pyarrow_requirement_message,
28+
)
29+
30+
31+
# Test PYARROW_IGNORE_TIMEZONE introduced in PyArrow 2.0,
32+
# https://arrow.apache.org/blog/2020/10/22/2.0.0-release/
33+
# Conversion of timezone aware datetimes to and/from pyarrow arrays including pandas
34+
# now round-trip preserving timezone. To use the old behavior (e.g. for spark) set
35+
# the environment variable PYARROW_IGNORE_TIMEZONE to a truthy
36+
# value (i.e. PYARROW_IGNORE_TIMEZONE=1)
37+
38+
# Summary:
39+
# 1, pa.array and pa.Array.from_pandas
40+
# a, when PYARROW_IGNORE_TIMEZONE=1, and input is list[datetime.datetime|pd.Timestamp]
41+
# the tzinfo is used to infer the pyarrow datatype,
42+
# but not used in computing the underlying value (or treated as UTC time);
43+
# b, PYARROW_IGNORE_TIMEZONE has no effect when the input is a Pandas Series;
44+
# 2, In pa.scalar, PYARROW_IGNORE_TIMEZONE has no effect.
45+
46+
47+
@unittest.skipIf(not have_pyarrow, pyarrow_requirement_message)
class PyArrowIgnoreTimeZoneTests(unittest.TestCase):
    """Pin down PyArrow's PYARROW_IGNORE_TIMEZONE behavior.

    Behavior exercised by these tests:
    1. pa.array and pa.Array.from_pandas:
       a. when PYARROW_IGNORE_TIMEZONE=1 and the input is a
          list[datetime.datetime | pd.Timestamp], the tzinfo is used to infer
          the pyarrow datatype, but not in computing the underlying value
          (the wall-clock time is treated as UTC);
       b. PYARROW_IGNORE_TIMEZONE has no effect when the input is a pandas
          Series (numpy- or pyarrow-backed).
    2. pa.scalar is unaffected by PYARROW_IGNORE_TIMEZONE.
    """

    # Environment variable under test; PyArrow reads it when converting values.
    _ENV_KEY = "PYARROW_IGNORE_TIMEZONE"

    def _snapshot_env(self):
        """Register a cleanup restoring PYARROW_IGNORE_TIMEZONE to its current
        state, so a failing assertion cannot leak the mutated variable into
        other tests, and a pre-existing value is not clobbered.
        """
        original = os.environ.get(self._ENV_KEY)

        def _restore():
            if original is None:
                os.environ.pop(self._ENV_KEY, None)
            else:
                os.environ[self._ENV_KEY] = original

        self.addCleanup(_restore)

    def test_timezone_with_python(self):
        import pyarrow as pa

        tz = "Asia/Singapore"
        # Same wall-clock time tagged with two different zones: a conversion
        # that ignores the timezone turns ts1 into ts2.
        ts1 = datetime.datetime(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo(tz))
        ts2 = datetime.datetime(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo("UTC"))
        pa_type = pa.timestamp("us", tz=tz)

        self._snapshot_env()
        os.environ[self._ENV_KEY] = "1"

        # pa.scalar ignores PYARROW_IGNORE_TIMEZONE: values round-trip intact.
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        # When PYARROW_IGNORE_TIMEZONE=1 and input is list[datetime.datetime],
        # tzinfo is used to infer the datatype, but not in computing the
        # underlying value.
        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertNotEqual(v, ts1)
                self.assertEqual(v, ts2)

        # Pre-wrapping the value in pa.scalar bypasses the env var.
        for a in [
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertEqual(v, ts1)

        # With the variable unset, every conversion preserves the timezone.
        os.environ.pop(self._ENV_KEY, None)
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertEqual(v, ts1)

    @unittest.skipIf(not have_pandas, pandas_requirement_message)
    def test_timezone_with_pandas(self):
        import pyarrow as pa
        import pandas as pd

        tz = "Asia/Singapore"
        ts1 = pd.Timestamp(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo(tz))
        ts2 = pd.Timestamp(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo("UTC"))
        pa_type = pa.timestamp("us", tz=tz)

        # numpy-backed series (unit "us" assumes pandas >= 2.0 — the skipIf
        # guard above only checks availability, not version)
        ser1 = pd.Series([ts1], dtype=pd.DatetimeTZDtype("us", tz=tz))
        self.assertEqual(ser1.dtype.unit, "us")
        self.assertEqual(ser1.dtype.tz.zone, tz)

        # pyarrow-backed series
        ser2 = pd.Series([ts1], dtype=pd.ArrowDtype(pa_type))
        self.assertEqual(ser2.dtype.pyarrow_dtype, pa_type)

        self._snapshot_env()
        os.environ[self._ENV_KEY] = "1"

        # pa.scalar ignores PYARROW_IGNORE_TIMEZONE: values round-trip intact.
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        # When PYARROW_IGNORE_TIMEZONE=1 and input is list[pd.Timestamp],
        # tzinfo is used to infer the datatype, but not in computing the
        # underlying value.
        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertNotEqual(v, ts1)
                self.assertEqual(v, ts2)

        # Series inputs (either backend) and pre-wrapped scalars are
        # unaffected by the env var.
        for a in [
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
            pa.array(ser1),
            pa.array(ser1, type=pa_type),
            pa.Array.from_pandas(ser1),
            pa.Array.from_pandas(ser1, type=pa_type),
            pa.array(ser2),
            pa.array(ser2, type=pa_type),
            pa.Array.from_pandas(ser2),
            pa.Array.from_pandas(ser2, type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0], a.to_pandas()[0]]:
                self.assertEqual(v, ts1)

        # With the variable unset, every conversion preserves the timezone.
        os.environ.pop(self._ENV_KEY, None)
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
            pa.array(ser1),
            pa.array(ser1, type=pa_type),
            pa.Array.from_pandas(ser1),
            pa.Array.from_pandas(ser1, type=pa_type),
            pa.array(ser2),
            pa.array(ser2, type=pa_type),
            pa.Array.from_pandas(ser2),
            pa.Array.from_pandas(ser2, type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0], a.to_pandas()[0]]:
                self.assertEqual(v, ts1)
181+
182+
183+
if __name__ == "__main__":
    # Delegate to PySpark's shared unittest entry point.
    from pyspark.testing import main
    main()

0 commit comments

Comments
 (0)