Skip to content

Commit 56e524e

Browse files
Snow 2117128 Fix arrow timestamp conversion (#2415)
1 parent c422eb3 commit 56e524e

File tree

3 files changed

+133
-65
lines changed

3 files changed

+133
-65
lines changed

DESCRIPTION.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
1010
- v3.16.1(TBD)
1111
- Added in-band OCSP exception telemetry.
1212
- Added in-band HTTP exception telemetry.
13+
- Fixed a bug where timezoned timestamps fetched as pandas.DataFrame or pyarrow.Table would overflow for the sake of unnecessary precision. In the case where an overflow cannot be prevented a clear error will be raised now.
1314

1415
- v3.16.0(July 04,2025)
1516
- Bumped numpy dependency from <2.1.0 to <=2.2.4.

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowTableIterator.cpp

Lines changed: 86 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,45 @@ void CArrowTableIterator::convertTimeColumn_nanoarrow(
600600
ArrowArrayMove(newArray, columnArray->array);
601601
}
602602

603+
/**
604+
* Helper function to detect nanosecond timestamp overflow and determine if
605+
* downscaling to microseconds is needed.
606+
* @param columnArray The Arrow array containing the timestamp data
607+
* @param epochArray The Arrow array containing epoch values
608+
* @param fractionArray The Arrow array containing fraction values
609+
* @return true if overflow was detected and downscaling to microseconds is
610+
* safe, false otherwise
611+
* @throws std::overflow_error if overflow is detected but downscaling would
612+
* lose precision
613+
*/
614+
static bool _checkNanosecondTimestampOverflowAndDownscale(
615+
ArrowArrayView* columnArray, ArrowArrayView* epochArray,
616+
ArrowArrayView* fractionArray) {
617+
int powTenSB4 = sf::internal::powTenSB4[9];
618+
for (int64_t rowIdx = 0; rowIdx < columnArray->array->length; rowIdx++) {
619+
if (!ArrowArrayViewIsNull(columnArray, rowIdx)) {
620+
int64_t epoch = ArrowArrayViewGetIntUnsafe(epochArray, rowIdx);
621+
int64_t fraction = ArrowArrayViewGetIntUnsafe(fractionArray, rowIdx);
622+
if (epoch > (INT64_MAX / powTenSB4) || epoch < (INT64_MIN / powTenSB4)) {
623+
if (fraction % 1000 != 0) {
624+
std::string errorInfo = Logger::formatString(
625+
"The total number of nanoseconds %d%d overflows int64 range. "
626+
"If you use a timestamp with "
627+
"the nanosecond part over 6-digits in the Snowflake database, "
628+
"the timestamp must be "
629+
"between '1677-09-21 00:12:43.145224192' and '2262-04-11 "
630+
"23:47:16.854775807' to not overflow.",
631+
epoch, fraction);
632+
throw std::overflow_error(errorInfo.c_str());
633+
} else {
634+
return true; // Safe to downscale
635+
}
636+
}
637+
}
638+
}
639+
return false;
640+
}
641+
603642
void CArrowTableIterator::convertTimestampColumn_nanoarrow(
604643
ArrowSchemaView* field, ArrowArrayView* columnArray, const int scale,
605644
const std::string timezone) {
@@ -614,11 +653,11 @@ void CArrowTableIterator::convertTimestampColumn_nanoarrow(
614653
newSchema->flags &=
615654
(field->schema->flags & ARROW_FLAG_NULLABLE); // map to nullable()
616655

617-
// calculate has_overflow_to_downscale
656+
// Find epoch and fraction arrays for overflow detection
657+
ArrowArrayView* epochArray = nullptr;
658+
ArrowArrayView* fractionArray = nullptr;
618659
bool has_overflow_to_downscale = false;
619660
if (scale > 6 && field->type == NANOARROW_TYPE_STRUCT) {
620-
ArrowArrayView* epochArray;
621-
ArrowArrayView* fractionArray;
622661
for (int64_t i = 0; i < field->schema->n_children; i++) {
623662
ArrowSchema* c_schema = field->schema->children[i];
624663
if (std::strcmp(c_schema->name, internal::FIELD_NAME_EPOCH.c_str()) ==
@@ -631,30 +670,8 @@ void CArrowTableIterator::convertTimestampColumn_nanoarrow(
631670
// do nothing
632671
}
633672
}
634-
635-
int powTenSB4 = sf::internal::powTenSB4[9];
636-
for (int64_t rowIdx = 0; rowIdx < columnArray->array->length; rowIdx++) {
637-
if (!ArrowArrayViewIsNull(columnArray, rowIdx)) {
638-
int64_t epoch = ArrowArrayViewGetIntUnsafe(epochArray, rowIdx);
639-
int64_t fraction = ArrowArrayViewGetIntUnsafe(fractionArray, rowIdx);
640-
if (epoch > (INT64_MAX / powTenSB4) ||
641-
epoch < (INT64_MIN / powTenSB4)) {
642-
if (fraction % 1000 != 0) {
643-
std::string errorInfo = Logger::formatString(
644-
"The total number of nanoseconds %d%d overflows int64 range. "
645-
"If you use a timestamp with "
646-
"the nanosecond part over 6-digits in the Snowflake database, "
647-
"the timestamp must be "
648-
"between '1677-09-21 00:12:43.145224192' and '2262-04-11 "
649-
"23:47:16.854775807' to not overflow.",
650-
epoch, fraction);
651-
throw std::overflow_error(errorInfo.c_str());
652-
} else {
653-
has_overflow_to_downscale = true;
654-
}
655-
}
656-
}
657-
}
673+
has_overflow_to_downscale = _checkNanosecondTimestampOverflowAndDownscale(
674+
columnArray, epochArray, fractionArray);
658675
}
659676

660677
if (scale <= 6) {
@@ -855,6 +872,29 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
855872
ArrowSchemaInit(newSchema);
856873
newSchema->flags &=
857874
(field->schema->flags & ARROW_FLAG_NULLABLE); // map to nullable()
875+
876+
// Find epoch and fraction arrays
877+
ArrowArrayView* epochArray = nullptr;
878+
ArrowArrayView* fractionArray = nullptr;
879+
for (int64_t i = 0; i < field->schema->n_children; i++) {
880+
ArrowSchema* c_schema = field->schema->children[i];
881+
if (std::strcmp(c_schema->name, internal::FIELD_NAME_EPOCH.c_str()) == 0) {
882+
epochArray = columnArray->children[i];
883+
} else if (std::strcmp(c_schema->name,
884+
internal::FIELD_NAME_FRACTION.c_str()) == 0) {
885+
fractionArray = columnArray->children[i];
886+
} else {
887+
// do nothing
888+
}
889+
}
890+
891+
// Check for timestamp overflow and determine if downscaling is needed
892+
bool has_overflow_to_downscale = false;
893+
if (scale > 6 && byteLength == 16) {
894+
has_overflow_to_downscale = _checkNanosecondTimestampOverflowAndDownscale(
895+
columnArray, epochArray, fractionArray);
896+
}
897+
858898
auto timeunit = NANOARROW_TIME_UNIT_SECOND;
859899
if (scale == 0) {
860900
timeunit = NANOARROW_TIME_UNIT_SECOND;
@@ -863,7 +903,9 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
863903
} else if (scale <= 6) {
864904
timeunit = NANOARROW_TIME_UNIT_MICRO;
865905
} else {
866-
timeunit = NANOARROW_TIME_UNIT_NANO;
906+
// Use microsecond precision if we detected overflow, otherwise nanosecond
907+
timeunit = has_overflow_to_downscale ? NANOARROW_TIME_UNIT_MICRO
908+
: NANOARROW_TIME_UNIT_NANO;
867909
}
868910

869911
if (!timezone.empty()) {
@@ -893,20 +935,6 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
893935
"from schema : %s, error code: %d",
894936
ArrowErrorMessage(&error), returnCode);
895937

896-
ArrowArrayView* epochArray;
897-
ArrowArrayView* fractionArray;
898-
for (int64_t i = 0; i < field->schema->n_children; i++) {
899-
ArrowSchema* c_schema = field->schema->children[i];
900-
if (std::strcmp(c_schema->name, internal::FIELD_NAME_EPOCH.c_str()) == 0) {
901-
epochArray = columnArray->children[i];
902-
} else if (std::strcmp(c_schema->name,
903-
internal::FIELD_NAME_FRACTION.c_str()) == 0) {
904-
fractionArray = columnArray->children[i];
905-
} else {
906-
// do nothing
907-
}
908-
}
909-
910938
for (int64_t rowIdx = 0; rowIdx < columnArray->array->length; rowIdx++) {
911939
if (!ArrowArrayViewIsNull(columnArray, rowIdx)) {
912940
if (byteLength == 8) {
@@ -920,8 +948,14 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
920948
returnCode = ArrowArrayAppendInt(
921949
newArray, epoch * sf::internal::powTenSB4[6 - scale]);
922950
} else {
923-
returnCode = ArrowArrayAppendInt(
924-
newArray, epoch * sf::internal::powTenSB4[9 - scale]);
951+
// Handle overflow by falling back to microsecond precision
952+
if (has_overflow_to_downscale) {
953+
returnCode = ArrowArrayAppendInt(
954+
newArray, epoch * sf::internal::powTenSB4[6]);
955+
} else {
956+
returnCode = ArrowArrayAppendInt(
957+
newArray, epoch * sf::internal::powTenSB4[9 - scale]);
958+
}
925959
}
926960
SF_CHECK_ARROW_RC(returnCode,
927961
"[Snowflake Exception] error appending int to "
@@ -941,8 +975,14 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
941975
newArray, epoch * sf::internal::powTenSB4[6] +
942976
fraction / sf::internal::powTenSB4[3]);
943977
} else {
944-
returnCode = ArrowArrayAppendInt(
945-
newArray, epoch * sf::internal::powTenSB4[9] + fraction);
978+
// Handle overflow by falling back to microsecond precision
979+
if (has_overflow_to_downscale) {
980+
returnCode = ArrowArrayAppendInt(
981+
newArray, epoch * sf::internal::powTenSB4[6] + fraction / 1000);
982+
} else {
983+
returnCode = ArrowArrayAppendInt(
984+
newArray, epoch * sf::internal::powTenSB4[9] + fraction);
985+
}
946986
}
947987
SF_CHECK_ARROW_RC(returnCode,
948988
"[Snowflake Exception] error appending int to "

test/integ/pandas_it/test_arrow_pandas.py

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -438,40 +438,67 @@ def test_timestampntz(conn_cnx, scale):
438438
[
439439
"'1400-01-01 01:02:03.123456789'::timestamp as low_ts",
440440
"'9999-01-01 01:02:03.123456789789'::timestamp as high_ts",
441+
"convert_timezone('UTC', '1400-01-01 01:02:03.123456789') as low_ts",
442+
"convert_timezone('UTC', '9999-01-01 01:02:03.123456789789') as high_ts",
441443
],
442444
)
443-
def test_timestampntz_raises_overflow(conn_cnx, timestamp_str):
445+
def test_timestamp_raises_overflow(conn_cnx, timestamp_str):
444446
with conn_cnx() as conn:
445447
r = conn.cursor().execute(f"select {timestamp_str}")
446448
with pytest.raises(OverflowError, match="overflows int64 range."):
447449
r.fetch_arrow_all()
448450

449451

450-
def test_timestampntz_down_scale(conn_cnx):
452+
def test_timestamp_down_scale(conn_cnx):
451453
with conn_cnx() as conn:
452454
r = conn.cursor().execute(
453-
"select '1400-01-01 01:02:03.123456'::timestamp as low_ts, '9999-01-01 01:02:03.123456'::timestamp as high_ts"
455+
"""select '1400-01-01 01:02:03.123456'::timestamp as low_ntz,
456+
'9999-01-01 01:02:03.123456'::timestamp as high_ntz,
457+
convert_timezone('UTC', '1400-01-01 01:02:03.123456') as low_tz,
458+
convert_timezone('UTC', '9999-01-01 01:02:03.123456') as high_tz
459+
"""
454460
)
455461
table = r.fetch_arrow_all()
456-
lower_dt = table[0][0].as_py() # type: datetime
462+
lower_ntz = table[0][0].as_py() # type: datetime
457463
assert (
458-
lower_dt.year,
459-
lower_dt.month,
460-
lower_dt.day,
461-
lower_dt.hour,
462-
lower_dt.minute,
463-
lower_dt.second,
464-
lower_dt.microsecond,
464+
lower_ntz.year,
465+
lower_ntz.month,
466+
lower_ntz.day,
467+
lower_ntz.hour,
468+
lower_ntz.minute,
469+
lower_ntz.second,
470+
lower_ntz.microsecond,
465471
) == (1400, 1, 1, 1, 2, 3, 123456)
466-
higher_dt = table[1][0].as_py()
472+
higher_ntz = table[1][0].as_py() # type: datetime
467473
assert (
468-
higher_dt.year,
469-
higher_dt.month,
470-
higher_dt.day,
471-
higher_dt.hour,
472-
higher_dt.minute,
473-
higher_dt.second,
474-
higher_dt.microsecond,
474+
higher_ntz.year,
475+
higher_ntz.month,
476+
higher_ntz.day,
477+
higher_ntz.hour,
478+
higher_ntz.minute,
479+
higher_ntz.second,
480+
higher_ntz.microsecond,
481+
) == (9999, 1, 1, 1, 2, 3, 123456)
482+
483+
lower_tz = table[2][0].as_py() # type: datetime
484+
assert (
485+
lower_tz.year,
486+
lower_tz.month,
487+
lower_tz.day,
488+
lower_tz.hour,
489+
lower_tz.minute,
490+
lower_tz.second,
491+
lower_tz.microsecond,
492+
) == (1400, 1, 1, 1, 2, 3, 123456)
493+
higher_tz = table[3][0].as_py() # type: datetime
494+
assert (
495+
higher_tz.year,
496+
higher_tz.month,
497+
higher_tz.day,
498+
higher_tz.hour,
499+
higher_tz.minute,
500+
higher_tz.second,
501+
higher_tz.microsecond,
475502
) == (9999, 1, 1, 1, 2, 3, 123456)
476503

477504

0 commit comments

Comments
 (0)