Snow 2117128 Fix arrow timestamp conversion (#2415)

sfc-gh-pczajka · web-flow · commit 56e524e05d07 · 2025-07-22T09:15:27.000Z
diff --git a/DESCRIPTION.md b/DESCRIPTION.md
@@ -10,6 +10,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
 - v3.16.1(TBD)
   - Added in-band OCSP exception telemetry.
   - Added in-band HTTP exception telemetry.
+  - Fixed a bug where timestamps with a time zone fetched as pandas.DataFrame or pyarrow.Table overflowed because they were always converted with nanosecond precision, even when microsecond precision was sufficient. When an overflow cannot be avoided, a clear error is now raised.
 
 - v3.16.0(July 04,2025)
   - Bumped numpy dependency from <2.1.0 to <=2.2.4.
diff --git a/src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowTableIterator.cpp b/src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowTableIterator.cpp
@@ -600,6 +600,45 @@ void CArrowTableIterator::convertTimeColumn_nanoarrow(
   ArrowArrayMove(newArray, columnArray->array);
 }
 
+/**
+ * Helper function to detect nanosecond timestamp overflow and determine if
+ * downscaling to microseconds is needed. All non-null rows are scanned.
+ * @param columnArray The Arrow array containing the timestamp data
+ * @param epochArray The Arrow array containing epoch values
+ * @param fractionArray The Arrow array containing fraction values
+ * @return true if overflow was detected and downscaling to microseconds is
+ * safe for every overflowing row, false if no row overflows
+ * @throws std::overflow_error if any overflowing row has a fraction that is
+ * not a multiple of 1000 (downscaling would lose precision)
+ */
+static bool _checkNanosecondTimestampOverflowAndDownscale(
+    ArrowArrayView* columnArray, ArrowArrayView* epochArray,
+    ArrowArrayView* fractionArray) {
+  const int64_t nsPerSec = sf::internal::powTenSB4[9];
+  bool needsDownscale = false;
+  for (int64_t rowIdx = 0; rowIdx < columnArray->array->length; rowIdx++) {
+    if (ArrowArrayViewIsNull(columnArray, rowIdx)) continue;
+    const int64_t epoch = ArrowArrayViewGetIntUnsafe(epochArray, rowIdx);
+    const int64_t fraction = ArrowArrayViewGetIntUnsafe(fractionArray, rowIdx);
+    if (epoch <= INT64_MAX / nsPerSec && epoch >= INT64_MIN / nsPerSec) continue;
+    if (fraction % 1000 != 0) {
+      // %lld with long long casts: %d would misread the int64 arguments.
+      std::string errorInfo = Logger::formatString(
+          "The total number of nanoseconds %lld%09lld overflows int64 range. "
+          "If you use a timestamp with "
+          "the nanosecond part over 6-digits in the Snowflake database, "
+          "the timestamp must be "
+          "between '1677-09-21 00:12:43.145224192' and '2262-04-11 "
+          "23:47:16.854775807' to not overflow.",
+          static_cast<long long>(epoch), static_cast<long long>(fraction));
+      throw std::overflow_error(errorInfo.c_str());
+    }
+    // Keep scanning instead of returning: a later row may still have to throw.
+    needsDownscale = true;
+  }
+  return needsDownscale;
+}
+
 void CArrowTableIterator::convertTimestampColumn_nanoarrow(
     ArrowSchemaView* field, ArrowArrayView* columnArray, const int scale,
     const std::string timezone) {
@@ -614,11 +653,11 @@ void CArrowTableIterator::convertTimestampColumn_nanoarrow(
   newSchema->flags &=
       (field->schema->flags & ARROW_FLAG_NULLABLE);  // map to nullable()
 
-  // calculate has_overflow_to_downscale
+  // Find epoch and fraction arrays for overflow detection
+  ArrowArrayView* epochArray = nullptr;
+  ArrowArrayView* fractionArray = nullptr;
   bool has_overflow_to_downscale = false;
   if (scale > 6 && field->type == NANOARROW_TYPE_STRUCT) {
-    ArrowArrayView* epochArray;
-    ArrowArrayView* fractionArray;
     for (int64_t i = 0; i < field->schema->n_children; i++) {
       ArrowSchema* c_schema = field->schema->children[i];
       if (std::strcmp(c_schema->name, internal::FIELD_NAME_EPOCH.c_str()) ==
@@ -631,30 +670,8 @@ void CArrowTableIterator::convertTimestampColumn_nanoarrow(
         // do nothing
       }
     }
-
-    int powTenSB4 = sf::internal::powTenSB4[9];
-    for (int64_t rowIdx = 0; rowIdx < columnArray->array->length; rowIdx++) {
-      if (!ArrowArrayViewIsNull(columnArray, rowIdx)) {
-        int64_t epoch = ArrowArrayViewGetIntUnsafe(epochArray, rowIdx);
-        int64_t fraction = ArrowArrayViewGetIntUnsafe(fractionArray, rowIdx);
-        if (epoch > (INT64_MAX / powTenSB4) ||
-            epoch < (INT64_MIN / powTenSB4)) {
-          if (fraction % 1000 != 0) {
-            std::string errorInfo = Logger::formatString(
-                "The total number of nanoseconds %d%d overflows int64 range. "
-                "If you use a timestamp with "
-                "the nanosecond part over 6-digits in the Snowflake database, "
-                "the timestamp must be "
-                "between '1677-09-21 00:12:43.145224192' and '2262-04-11 "
-                "23:47:16.854775807' to not overflow.",
-                epoch, fraction);
-            throw std::overflow_error(errorInfo.c_str());
-          } else {
-            has_overflow_to_downscale = true;
-          }
-        }
-      }
-    }
+    has_overflow_to_downscale = _checkNanosecondTimestampOverflowAndDownscale(
+        columnArray, epochArray, fractionArray);
   }
 
   if (scale <= 6) {
@@ -855,6 +872,29 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
   ArrowSchemaInit(newSchema);
   newSchema->flags &=
       (field->schema->flags & ARROW_FLAG_NULLABLE);  // map to nullable()
+
+  // Find epoch and fraction arrays
+  ArrowArrayView* epochArray = nullptr;
+  ArrowArrayView* fractionArray = nullptr;
+  for (int64_t i = 0; i < field->schema->n_children; i++) {
+    ArrowSchema* c_schema = field->schema->children[i];
+    if (std::strcmp(c_schema->name, internal::FIELD_NAME_EPOCH.c_str()) == 0) {
+      epochArray = columnArray->children[i];
+    } else if (std::strcmp(c_schema->name,
+                           internal::FIELD_NAME_FRACTION.c_str()) == 0) {
+      fractionArray = columnArray->children[i];
+    } else {
+      // do nothing
+    }
+  }
+
+  // Check for timestamp overflow and determine if downscaling is needed
+  bool has_overflow_to_downscale = false;
+  if (scale > 6 && byteLength == 16) {
+    has_overflow_to_downscale = _checkNanosecondTimestampOverflowAndDownscale(
+        columnArray, epochArray, fractionArray);
+  }
+
   auto timeunit = NANOARROW_TIME_UNIT_SECOND;
   if (scale == 0) {
     timeunit = NANOARROW_TIME_UNIT_SECOND;
@@ -863,7 +903,9 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
   } else if (scale <= 6) {
     timeunit = NANOARROW_TIME_UNIT_MICRO;
   } else {
-    timeunit = NANOARROW_TIME_UNIT_NANO;
+    // Use microsecond precision if we detected overflow, otherwise nanosecond
+    timeunit = has_overflow_to_downscale ? NANOARROW_TIME_UNIT_MICRO
+                                         : NANOARROW_TIME_UNIT_NANO;
   }
 
   if (!timezone.empty()) {
@@ -893,20 +935,6 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
                     "from schema : %s, error code: %d",
                     ArrowErrorMessage(&error), returnCode);
 
-  ArrowArrayView* epochArray;
-  ArrowArrayView* fractionArray;
-  for (int64_t i = 0; i < field->schema->n_children; i++) {
-    ArrowSchema* c_schema = field->schema->children[i];
-    if (std::strcmp(c_schema->name, internal::FIELD_NAME_EPOCH.c_str()) == 0) {
-      epochArray = columnArray->children[i];
-    } else if (std::strcmp(c_schema->name,
-                           internal::FIELD_NAME_FRACTION.c_str()) == 0) {
-      fractionArray = columnArray->children[i];
-    } else {
-      // do nothing
-    }
-  }
-
   for (int64_t rowIdx = 0; rowIdx < columnArray->array->length; rowIdx++) {
     if (!ArrowArrayViewIsNull(columnArray, rowIdx)) {
       if (byteLength == 8) {
@@ -920,8 +948,14 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
           returnCode = ArrowArrayAppendInt(
               newArray, epoch * sf::internal::powTenSB4[6 - scale]);
         } else {
-          returnCode = ArrowArrayAppendInt(
-              newArray, epoch * sf::internal::powTenSB4[9 - scale]);
+          // Handle overflow by falling back to microsecond precision
+          if (has_overflow_to_downscale) {
+            returnCode = ArrowArrayAppendInt(
+                newArray, epoch * sf::internal::powTenSB4[6]);
+          } else {
+            returnCode = ArrowArrayAppendInt(
+                newArray, epoch * sf::internal::powTenSB4[9 - scale]);
+          }
         }
         SF_CHECK_ARROW_RC(returnCode,
                           "[Snowflake Exception] error appending int to "
@@ -941,8 +975,14 @@ void CArrowTableIterator::convertTimestampTZColumn_nanoarrow(
               newArray, epoch * sf::internal::powTenSB4[6] +
                             fraction / sf::internal::powTenSB4[3]);
         } else {
-          returnCode = ArrowArrayAppendInt(
-              newArray, epoch * sf::internal::powTenSB4[9] + fraction);
+          // Handle overflow by falling back to microsecond precision
+          if (has_overflow_to_downscale) {
+            returnCode = ArrowArrayAppendInt(
+                newArray, epoch * sf::internal::powTenSB4[6] + fraction / 1000);
+          } else {
+            returnCode = ArrowArrayAppendInt(
+                newArray, epoch * sf::internal::powTenSB4[9] + fraction);
+          }
         }
         SF_CHECK_ARROW_RC(returnCode,
                           "[Snowflake Exception] error appending int to "
diff --git a/test/integ/pandas_it/test_arrow_pandas.py b/test/integ/pandas_it/test_arrow_pandas.py
@@ -438,40 +438,67 @@ def test_timestampntz(conn_cnx, scale):
     [
         "'1400-01-01 01:02:03.123456789'::timestamp as low_ts",
         "'9999-01-01 01:02:03.123456789789'::timestamp as high_ts",
+        "convert_timezone('UTC', '1400-01-01 01:02:03.123456789') as low_ts",
+        "convert_timezone('UTC', '9999-01-01 01:02:03.123456789789') as high_ts",
     ],
 )
-def test_timestampntz_raises_overflow(conn_cnx, timestamp_str):
+def test_timestamp_raises_overflow(conn_cnx, timestamp_str):
     with conn_cnx() as conn:
         r = conn.cursor().execute(f"select {timestamp_str}")
         with pytest.raises(OverflowError, match="overflows int64 range."):
             r.fetch_arrow_all()
 
 
-def test_timestampntz_down_scale(conn_cnx):
+def test_timestamp_down_scale(conn_cnx):
     with conn_cnx() as conn:
         r = conn.cursor().execute(
-            "select '1400-01-01 01:02:03.123456'::timestamp as low_ts, '9999-01-01 01:02:03.123456'::timestamp as high_ts"
+            """select '1400-01-01 01:02:03.123456'::timestamp as low_ntz,
+            '9999-01-01 01:02:03.123456'::timestamp as high_ntz,
+            convert_timezone('UTC', '1400-01-01 01:02:03.123456') as low_tz,
+            convert_timezone('UTC', '9999-01-01 01:02:03.123456') as high_tz
+            """
         )
         table = r.fetch_arrow_all()
-        lower_dt = table[0][0].as_py()  # type: datetime
+        lower_ntz = table[0][0].as_py()  # type: datetime
         assert (
-            lower_dt.year,
-            lower_dt.month,
-            lower_dt.day,
-            lower_dt.hour,
-            lower_dt.minute,
-            lower_dt.second,
-            lower_dt.microsecond,
+            lower_ntz.year,
+            lower_ntz.month,
+            lower_ntz.day,
+            lower_ntz.hour,
+            lower_ntz.minute,
+            lower_ntz.second,
+            lower_ntz.microsecond,
         ) == (1400, 1, 1, 1, 2, 3, 123456)
-        higher_dt = table[1][0].as_py()
+        higher_ntz = table[1][0].as_py()  # type: datetime
         assert (
-            higher_dt.year,
-            higher_dt.month,
-            higher_dt.day,
-            higher_dt.hour,
-            higher_dt.minute,
-            higher_dt.second,
-            higher_dt.microsecond,
+            higher_ntz.year,
+            higher_ntz.month,
+            higher_ntz.day,
+            higher_ntz.hour,
+            higher_ntz.minute,
+            higher_ntz.second,
+            higher_ntz.microsecond,
+        ) == (9999, 1, 1, 1, 2, 3, 123456)
+
+        lower_tz = table[2][0].as_py()  # type: datetime
+        assert (
+            lower_tz.year,
+            lower_tz.month,
+            lower_tz.day,
+            lower_tz.hour,
+            lower_tz.minute,
+            lower_tz.second,
+            lower_tz.microsecond,
+        ) == (1400, 1, 1, 1, 2, 3, 123456)
+        higher_tz = table[3][0].as_py()  # type: datetime
+        assert (
+            higher_tz.year,
+            higher_tz.month,
+            higher_tz.day,
+            higher_tz.hour,
+            higher_tz.minute,
+            higher_tz.second,
+            higher_tz.microsecond,
         ) == (9999, 1, 1, 1, 2, 3, 123456)