Improve uint64 handling in tfds.

The TensorFlow Datasets Authors · The TensorFlow Datasets Authors · commit 05f55cdaa15e · 2024-05-15T11:59:28.000-07:00
1) For encoding (raw), bitcast to int64 before writing. This preserves the value for inputs <= kint64max.
2) For decoding, read as int64, bitcast to uint64.

All other (uint) dtypes are handled by tf.io.decode_raw. This change could also be achieved by making tf.io.decode_raw support uint64, but the parts that handle values over kint64max in tf.Example.int64_values would still be necessary.

PiperOrigin-RevId: 634029422
diff --git a/tensorflow_datasets/core/example_serializer.py b/tensorflow_datasets/core/example_serializer.py
@@ -187,7 +187,9 @@ def _item_to_tf_feature(
   # Convert boolean to integer (tf.train.Example does not support bool)
   if v.dtype == np.bool_:
     v = v.astype(int)
-
+  if v.dtype == np.uint64:
+    # We cannot store uint64 in tf.Example, so we bitcast to int64.
+    v = v.view(np.int64)
   vals = v.flat  # Convert v into a 1-d array (without extra copy)
   if dtype_utils.is_integer(v.dtype):
     return tf_feature_pb2.Feature(
diff --git a/tensorflow_datasets/core/example_serializer_test.py b/tensorflow_datasets/core/example_serializer_test.py
@@ -262,6 +262,12 @@ def test_add_ragged_fields_single_level_sequence(self, dtype):
     )
     self.assertEqual(out[1], tensor_info)
 
+  def test_uint_to_tf_feature_overflow(self):
+    tensor_info = feature_lib.TensorInfo(shape=(), dtype=np.uint64)
+    bigint = np.array((1 << 63) + 10, dtype=np.uint64)
+    # Does not raise value error.
+    example_serializer._item_to_tf_feature(bigint, tensor_info)
+
   @parameterized.parameters((np.int64), (tf.int64))
   def test_item_to_tf_feature_incorrect_shape(self, dtype):
     # Test shape check in _item_to_tf_feature raises ValueError.
diff --git a/tensorflow_datasets/core/features/tensor_feature.py b/tensorflow_datasets/core/features/tensor_feature.py
@@ -268,15 +268,27 @@ def _get_value_and_shape(self, example_data):
     else:
       value = example_data
       shape = np_utils.to_np_shape(self._shape)
+    if (
+        self._dtype == np.uint64
+        and not self._encoded_to_bytes
+        and isinstance(value, np.ndarray)
+    ):
+      # We can only store int64 inside tf.Example, so if we had a uint64, we
+      # bitcasted it to int64 at encoding time. Thus, when decoding, we need to
+      # bitcast it back to uint64.
+      value = value.view(np.uint64)
     return value, shape
 
   def decode_example(self, tfexample_data):
     """See base class for details."""
     value, shape = self._get_value_and_shape(tfexample_data)
+    decode_dtype = self.tf_dtype if self.tf_dtype != tf.uint64 else tf.int64
     if self._encoded_to_bytes:
       if self._encoding == Encoding.ZLIB:
         value = tf.io.decode_compressed(value, compression_type='ZLIB')
-      value = tf.io.decode_raw(value, self.tf_dtype)
+      value = tf.io.decode_raw(value, decode_dtype)
+      if self.dtype == tf.uint64:
+        value = tf.bitcast(value, tf.uint64)
       value = tf.reshape(value, shape)
 
     return value
diff --git a/tensorflow_datasets/core/features/tensor_feature_test.py b/tensorflow_datasets/core/features/tensor_feature_test.py
@@ -101,6 +101,30 @@ def test_shape_static(
         },
     )
 
+  @parameterized.parameters([
+      features_lib.Encoding.BYTES,
+      features_lib.Encoding.ZLIB,
+  ])
+  def test_uint64_encoded_roundtrip(self, encoding: features_lib.Encoding):
+    bigint = np.array((1 << 63) + 10, dtype=np.uint64)
+    feature = features_lib.Tensor(shape=(), dtype=np.uint64, encoding=encoding)
+    self.assertEqual(
+        feature.decode_example(feature.encode_example(bigint)),
+        bigint,
+    )
+    self.assertEqual(
+        feature.decode_example_np(feature.encode_example(bigint)),
+        bigint,
+    )
+
+  def test_uint64_roundtrip(self):
+    feature = features_lib.Tensor(shape=(), dtype=np.uint64)
+    bigint = np.array((1 << 63) + 10, dtype=np.uint64)
+    # since we are using tf.Example int64 to hold this result, we start with
+    # the manually encoded (bitcasted) version of the value.
+    self.assertEqual(feature.decode_example(bigint.view(np.int64)), bigint)
+    self.assertEqual(feature.decode_example_np(bigint.view(np.int64)), bigint)
+
   @parameterized.parameters([
       (np.int32, features_lib.Encoding.NONE),
       (tf.int32, features_lib.Encoding.NONE),