
Commit 186a186

sfc-gh-aalamdvorst and Dennis Van de Vorst authored
Test pr 1707 (#1720)
* Add support for use_logical_type in write_pandas. use_logical_type is a new Snowflake file format option: a Boolean that specifies whether Snowflake interprets Parquet logical types during data loading. The default behavior of write_pandas is unchanged. When users write a dataframe that contains datetimes with timezones and do not pass use_logical_type=True as an argument, a warning is raised (see #1687). Providing this option also fixes issue #1687.

* FIX: removed the pandas import and used descriptive naming over concise naming for is_datetime64tz_dtype. STYLE: converted an if statement to idiomatic form. STYLE: broke the copy_into_sql command into multiple lines, with each file_format argument on a separate line.

* STYLE: rearranged imports in test_pandas_tools.py.

* REFAC: used the f-string equal-sign specifier to improve the use_logical_type warning.

* Changelog updates.

---------

Co-authored-by: Dennis Van de Vorst <[email protected]>
1 parent 5b61af7 commit 186a186
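
A minimal usage sketch of the new keyword described above. This is not part of the commit; the connection parameters and table name are hypothetical placeholders, and auto_create_table=True is used only so the sketch does not assume a pre-existing table.

    import pandas as pd
    import snowflake.connector
    from snowflake.connector.pandas_tools import write_pandas

    # DataFrame with a timezone-aware datetime column -- the case from issue #1687.
    df = pd.DataFrame({"DT": [pd.Timestamp("2020-01-02 03:04:05.000006", tz="UTC")]})

    # Hypothetical connection parameters -- replace with real account settings.
    connection_parameters = {
        "account": "<account>",
        "user": "<user>",
        "password": "<password>",
        "database": "<database>",
        "schema": "<schema>",
    }

    with snowflake.connector.connect(**connection_parameters) as conn:
        # use_logical_type=True adds USE_LOGICAL_TYPE = TRUE to the Parquet file
        # format, so Snowflake interprets the timezone information during loading.
        success, num_chunks, num_rows, _ = write_pandas(
            conn,
            df,
            table_name="MY_TABLE",  # hypothetical table name
            auto_create_table=True,
            use_logical_type=True,
        )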

File tree

3 files changed: +104 -7 lines


DESCRIPTION.md

Lines changed: 4 additions & 0 deletions
@@ -8,6 +8,10 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
 
 # Release Notes
 
+- v3.3.2(TBD)
+
+  - Added support for `use_logical_type` in `write_pandas`.
+
 - v3.3.1(October 16,2023)
 
   - Added for non-Windows platforms command suggestions (chown/chmod) for insufficient file permissions of config files.

src/snowflake/connector/pandas_tools.py

Lines changed: 51 additions & 6 deletions
@@ -118,12 +118,15 @@ def _create_temp_stage(
 
 
 def _do_create_temp_file_format(
-    cursor: SnowflakeCursor, file_format_location: str, compression: str
+    cursor: SnowflakeCursor,
+    file_format_location: str,
+    compression: str,
+    sql_use_logical_type: str,
 ) -> None:
     file_format_sql = (
         f"CREATE TEMP FILE FORMAT {file_format_location} "
         f"/* Python:snowflake.connector.pandas_tools.write_pandas() */ "
-        f"TYPE=PARQUET COMPRESSION={compression}"
+        f"TYPE=PARQUET COMPRESSION={compression}{sql_use_logical_type}"
     )
     logger.debug(f"creating file format with '{file_format_sql}'")
     cursor.execute(file_format_sql, _is_internal=True)
@@ -135,6 +138,7 @@ def _create_temp_file_format(
     schema: str | None,
     quote_identifiers: bool,
     compression: str,
+    sql_use_logical_type: str,
 ) -> str:
     file_format_name = random_string()
     file_format_location = build_location_helper(
@@ -144,15 +148,19 @@ def _create_temp_file_format(
         quote_identifiers=quote_identifiers,
     )
     try:
-        _do_create_temp_file_format(cursor, file_format_location, compression)
+        _do_create_temp_file_format(
+            cursor, file_format_location, compression, sql_use_logical_type
+        )
     except ProgrammingError as e:
         # User may not have the privilege to create file format on the target schema, so fall back to use current schema
         # as the old behavior.
         logger.debug(
             f"creating stage {file_format_location} failed. Exception {str(e)}. Fall back to use current schema"
         )
         file_format_location = file_format_name
-        _do_create_temp_file_format(cursor, file_format_location, compression)
+        _do_create_temp_file_format(
+            cursor, file_format_location, compression, sql_use_logical_type
+        )
 
     return file_format_location
 
@@ -172,6 +180,7 @@ def write_pandas(
     create_temp_table: bool = False,
     overwrite: bool = False,
     table_type: Literal["", "temp", "temporary", "transient"] = "",
+    use_logical_type: bool | None = None,
     **kwargs: Any,
 ) -> tuple[
     bool,
@@ -232,6 +241,11 @@ def write_pandas(
             Pandas DataFrame.
         table_type: The table type of to-be-created table. The supported table types include ``temp``/``temporary``
             and ``transient``. Empty means permanent table as per SQL convention.
+        use_logical_type: Boolean that specifies whether to use Parquet logical types. With this file format option,
+            Snowflake can interpret Parquet logical types during data loading. To enable Parquet logical types,
+            set use_logical_type as True. Set to None to use Snowflake's default. For more information, see:
+            https://docs.snowflake.com/en/sql-reference/sql/create-file-format
+
 
     Returns:
         Returns the COPY INTO command's results to verify ingestion in the form of a tuple of whether all chunks were
@@ -280,6 +294,27 @@ def write_pandas(
             stacklevel=2,
         )
 
+    # use_logical_type should be True when dataframe contains datetimes with timezone.
+    # https://github.com/snowflakedb/snowflake-connector-python/issues/1687
+    if not use_logical_type and any(
+        [pandas.api.types.is_datetime64tz_dtype(df[c]) for c in df.columns]
+    ):
+        warnings.warn(
+            "Dataframe contains a datetime with timezone column, but "
+            f"'{use_logical_type=}'. This can result in datetimes "
+            "being incorrectly written to Snowflake. Consider setting "
+            "'use_logical_type = True'",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    if use_logical_type is None:
+        sql_use_logical_type = ""
+    elif use_logical_type:
+        sql_use_logical_type = " USE_LOGICAL_TYPE = TRUE"
+    else:
+        sql_use_logical_type = " USE_LOGICAL_TYPE = FALSE"
+
     cursor = conn.cursor()
     stage_location = _create_temp_stage(
         cursor,
@@ -329,7 +364,12 @@ def drop_object(name: str, object_type: str) -> None:
 
     if auto_create_table or overwrite:
         file_format_location = _create_temp_file_format(
-            cursor, database, schema, quote_identifiers, compression_map[compression]
+            cursor,
+            database,
+            schema,
+            quote_identifiers,
+            compression_map[compression],
+            sql_use_logical_type,
         )
         infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@{stage_location}', file_format=>'{file_format_location}'))"
         logger.debug(f"inferring schema with '{infer_schema_sql}'")
@@ -381,7 +421,12 @@ def drop_object(name: str, object_type: str) -> None:
         f"COPY INTO {target_table_location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
         f"({columns}) "
         f"FROM (SELECT {parquet_columns} FROM @{stage_location}) "
-        f"FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression_map[compression]}{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}) "
+        f"FILE_FORMAT=("
+        f"TYPE=PARQUET "
+        f"COMPRESSION={compression_map[compression]}"
+        f"{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}"
+        f"{sql_use_logical_type}"
+        f") "
         f"PURGE=TRUE ON_ERROR={on_error}"
     )
     logger.debug(f"copying into with '{copy_into_sql}'")

test/integ/pandas/test_pandas_tools.py

Lines changed: 49 additions & 1 deletion
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import math
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, Callable, Generator
 from unittest import mock
 
@@ -417,6 +417,54 @@ def test_write_pandas_create_temp_table_deprecation_warning(
             cnx.execute_string(drop_sql)
 
 
+@pytest.mark.parametrize("use_logical_type", [None, True, False])
+def test_write_pandas_use_logical_type(
+    conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
+    use_logical_type: bool | None,
+):
+    table_name = random_string(5, "USE_LOCAL_TYPE_").upper()
+    col_name = "DT"
+    create_sql = f"CREATE OR REPLACE TABLE {table_name} ({col_name} TIMESTAMP_TZ)"
+    select_sql = f"SELECT * FROM {table_name}"
+    drop_sql = f"DROP TABLE IF EXISTS {table_name}"
+    timestamp = datetime(
+        year=2020,
+        month=1,
+        day=2,
+        hour=3,
+        minute=4,
+        second=5,
+        microsecond=6,
+        tzinfo=timezone(timedelta(hours=2)),
+    )
+    df_write = pandas.DataFrame({col_name: [timestamp]})
+
+    with conn_cnx() as cnx:  # type: SnowflakeConnection
+        cnx.cursor().execute(create_sql).fetchall()
+
+        write_pandas_kwargs = dict(
+            conn=cnx,
+            df=df_write,
+            use_logical_type=use_logical_type,
+            auto_create_table=False,
+            table_name=table_name,
+        )
+
+        try:
+            # When use_logical_type = True, datetimes with timestamps should be
+            # correctly written to Snowflake.
+            if use_logical_type:
+                write_pandas(**write_pandas_kwargs)
+                df_read = cnx.cursor().execute(select_sql).fetch_pandas_all()
+                assert all(df_write == df_read)
+            # For other use_logical_type values, a UserWarning should be displayed.
+            else:
+                with pytest.warns(UserWarning, match="Dataframe contains a datetime.*"):
+                    write_pandas(**write_pandas_kwargs)
+        finally:
+            cnx.execute_string(drop_sql)
+
+
 def test_invalid_table_type_write_pandas(
     conn_cnx: Callable[..., Generator[SnowflakeConnection, None, None]],
 ):
