|
| 1 | +# |
| 2 | +# Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | +# contributor license agreements. See the NOTICE file distributed with |
| 4 | +# this work for additional information regarding copyright ownership. |
| 5 | +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | +# (the "License"); you may not use this file except in compliance with |
| 7 | +# the License. You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | +# |
| 17 | + |
| 18 | +import os |
| 19 | +import datetime |
| 20 | +from zoneinfo import ZoneInfo |
| 21 | +import unittest |
| 22 | + |
| 23 | +from pyspark.testing.utils import ( |
| 24 | + have_pandas, |
| 25 | + have_pyarrow, |
| 26 | + pandas_requirement_message, |
| 27 | + pyarrow_requirement_message, |
| 28 | +) |
| 29 | + |
| 30 | + |
# Test PYARROW_IGNORE_TIMEZONE, introduced in PyArrow 2.0:
# https://arrow.apache.org/blog/2020/10/22/2.0.0-release/
# Conversion of timezone-aware datetimes to and from pyarrow arrays (including pandas)
# now round-trips preserving the timezone. To use the old behavior (e.g. for Spark),
# set the environment variable PYARROW_IGNORE_TIMEZONE to a truthy
# value (i.e. PYARROW_IGNORE_TIMEZONE=1).
| 37 | + |
# Summary:
# 1. pa.array and pa.Array.from_pandas:
#    a. when PYARROW_IGNORE_TIMEZONE=1 and the input is a list[datetime.datetime | pd.Timestamp],
#       the tzinfo is used to infer the pyarrow datatype,
#       but not used in computing the underlying value (i.e. the wall-clock time is treated as UTC);
#    b. PYARROW_IGNORE_TIMEZONE has no effect when the input is a Pandas Series;
# 2. PYARROW_IGNORE_TIMEZONE has no effect on pa.scalar.
| 45 | + |
| 46 | + |
@unittest.skipIf(not have_pyarrow, pyarrow_requirement_message)
class PyArrowIgnoreTimeZoneTests(unittest.TestCase):
    """Verify how PYARROW_IGNORE_TIMEZONE affects pa.scalar / pa.array /
    pa.Array.from_pandas for timezone-aware timestamps.

    Each test toggles the PYARROW_IGNORE_TIMEZONE environment variable, so
    setUp/tearDown save and restore any pre-existing value: without this,
    a failed assertion would leak the variable into later tests, and the
    unconditional ``del`` would clobber a value set by the environment
    (Spark itself sets PYARROW_IGNORE_TIMEZONE=1 for Python workers).
    """

    _ENV_KEY = "PYARROW_IGNORE_TIMEZONE"

    def setUp(self):
        # Remember the caller's setting (None if unset) so tearDown can restore it.
        self._saved_ignore_tz = os.environ.get(self._ENV_KEY)

    def tearDown(self):
        # Restore the environment exactly as we found it, even if the test failed.
        if self._saved_ignore_tz is None:
            os.environ.pop(self._ENV_KEY, None)
        else:
            os.environ[self._ENV_KEY] = self._saved_ignore_tz

    def test_timezone_with_python(self):
        import pyarrow as pa

        tz = "Asia/Singapore"
        # ts1 and ts2 share the same wall-clock time but denote different instants.
        ts1 = datetime.datetime(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo(tz))
        ts2 = datetime.datetime(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo("UTC"))
        pa_type = pa.timestamp("us", tz=tz)

        os.environ[self._ENV_KEY] = "1"
        # pa.scalar is unaffected by PYARROW_IGNORE_TIMEZONE.
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        # When PYARROW_IGNORE_TIMEZONE=1 and the input is list[datetime.datetime],
        # tzinfo is used to infer the datatype but not in computing the underlying
        # value: the wall-clock time is stored as if it were UTC.
        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertNotEqual(v, ts1)
                self.assertEqual(v, ts2)

        # Pre-built pa.scalar inputs keep their instant, since pa.scalar ignored
        # the env variable when the scalar was created.
        for a in [
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertEqual(v, ts1)

        # With the variable unset, all conversions round-trip the instant.
        os.environ.pop(self._ENV_KEY, None)
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertEqual(v, ts1)

    @unittest.skipIf(not have_pandas, pandas_requirement_message)
    def test_timezone_with_pandas(self):
        import pyarrow as pa
        import pandas as pd

        tz = "Asia/Singapore"
        # ts1 and ts2 share the same wall-clock time but denote different instants.
        ts1 = pd.Timestamp(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo(tz))
        ts2 = pd.Timestamp(2022, 1, 5, 15, 0, 1, tzinfo=ZoneInfo("UTC"))
        pa_type = pa.timestamp("us", tz=tz)

        # numpy-backed series
        ser1 = pd.Series([ts1], dtype=pd.DatetimeTZDtype("us", tz=tz))
        self.assertEqual(ser1.dtype.unit, "us")
        self.assertEqual(ser1.dtype.tz.zone, tz)

        # pyarrow-backed series
        ser2 = pd.Series([ts1], dtype=pd.ArrowDtype(pa_type))
        self.assertEqual(ser2.dtype.pyarrow_dtype, pa_type)

        os.environ[self._ENV_KEY] = "1"
        # pa.scalar is unaffected by PYARROW_IGNORE_TIMEZONE.
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        # When PYARROW_IGNORE_TIMEZONE=1 and the input is list[pd.Timestamp],
        # tzinfo is used to infer the datatype but not in computing the underlying
        # value: the wall-clock time is stored as if it were UTC.
        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0]]:
                self.assertNotEqual(v, ts1)
                self.assertEqual(v, ts2)

        # PYARROW_IGNORE_TIMEZONE has no effect on pandas Series inputs
        # or on pre-built pa.scalar inputs.
        for a in [
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
            pa.array(ser1),
            pa.array(ser1, type=pa_type),
            pa.Array.from_pandas(ser1),
            pa.Array.from_pandas(ser1, type=pa_type),
            pa.array(ser2),
            pa.array(ser2, type=pa_type),
            pa.Array.from_pandas(ser2),
            pa.Array.from_pandas(ser2, type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0], a.to_pandas()[0]]:
                self.assertEqual(v, ts1)

        # With the variable unset, all conversions round-trip the instant.
        os.environ.pop(self._ENV_KEY, None)
        for s in [
            pa.scalar(ts1),
            pa.scalar(ts1, type=pa_type),
        ]:
            self.assertEqual(s.type, pa_type)
            self.assertEqual(s.as_py(), ts1)

        for a in [
            pa.array([ts1]),
            pa.array([ts1], type=pa_type),
            pa.array([pa.scalar(ts1)]),
            pa.array([pa.scalar(ts1)], type=pa_type),
            pa.array(ser1),
            pa.array(ser1, type=pa_type),
            pa.Array.from_pandas(ser1),
            pa.Array.from_pandas(ser1, type=pa_type),
            pa.array(ser2),
            pa.array(ser2, type=pa_type),
            pa.Array.from_pandas(ser2),
            pa.Array.from_pandas(ser2, type=pa_type),
        ]:
            self.assertEqual(a.type, pa_type)
            for v in [a[0].as_py(), a.to_pylist()[0], a.to_pandas()[0]]:
                self.assertEqual(v, ts1)
| 181 | + |
| 182 | + |
| 183 | +if __name__ == "__main__": |
| 184 | + from pyspark.testing import main |
| 185 | + |
| 186 | + main() |
0 commit comments