Commit d0c8e24

fp8: fix failing tests
1 parent: 8e7e581

File tree

3 files changed: +32 -4 lines changed

candle-core/tests/tensor_tests.rs
candle-kernels/src/compatibility.cuh
candle-kernels/src/indexing.cu

candle-core/tests/tensor_tests.rs

Lines changed: 15 additions & 0 deletions
@@ -126,6 +126,21 @@ fn arange(device: &Device) -> Result<()> {
         Tensor::arange_step(5i64, 0i64, -1, device)?.to_vec1::<i64>()?,
         [5, 4, 3, 2, 1],
     );
+
+    assert_eq!(
+        Tensor::arange_step(
+            F8E4M3::from_f32(0.),
+            F8E4M3::from_f32(5.),
+            F8E4M3::from_f32(2.),
+            device
+        )?
+        .to_vec1::<F8E4M3>()?,
+        [
+            F8E4M3::from_f32(0.),
+            F8E4M3::from_f32(2.),
+            F8E4M3::from_f32(4.),
+        ],
+    );
     Ok(())
 }
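The added assertion extends the existing arange test to the fp8 F8E4M3 type: Tensor::arange_step from 0 to 5 with step 2 should read back as [0, 2, 4] through to_vec1::<F8E4M3>(), mirroring the integer arange_step check just above it.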

candle-kernels/src/compatibility.cuh

Lines changed: 2 additions & 3 deletions
@@ -35,12 +35,11 @@ __device__ double atomicAdd(double* address, double val) {
 }
 #endif
 
-
 #if __CUDA_ARCH__ < 700
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
 // The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
 // Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119
-__device__ __half atomicAdd(__half *address, __half val) {
+//__device__ __half atomicAdd(__half *address, __half val) {
 // unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
 // unsigned int old = *address_as_ui;
 // unsigned int assumed;
@@ -56,7 +55,7 @@ __device__ __half atomicAdd(__half *address, __half val) {
 
 // } while (assumed != old);
 // return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff));
-}
+//}
 #endif
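With this change the pre-compute-capability-7.0 atomicAdd(__half*, __half) fallback in compatibility.cuh is fully commented out: its body was already disabled, and now the signature and closing brace are commented as well, so the whole block is inert. For reference only, a CAS-based fallback of the kind the original block was adapted from (the cutorch link above) looks roughly like the sketch below; this is an illustrative reconstruction, not candle's code, and atomic_add_half_fallback is a made-up name.

// Illustrative sketch only (following the cutorch pattern linked above), not candle's code:
// emulate a 16-bit atomic add on pre-sm_70 devices with a compare-and-swap loop over the
// 32-bit word that contains the target __half.
#include <cuda_fp16.h>

__device__ __half atomic_add_half_fallback(__half *address, __half val) {
    // Round the pointer down to the enclosing 4-byte word; atomicCAS works on 32-bit words.
    unsigned int *address_as_ui =
        (unsigned int *)((char *)address - ((size_t)address & 2));
    bool high_half = ((size_t)address & 2) != 0;
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    do {
        assumed = old;
        // Extract the current 16-bit payload, add in float precision, pack it back in.
        unsigned short cur = high_half ? (unsigned short)(old >> 16)
                                       : (unsigned short)(old & 0xffff);
        float sum = __half2float(__ushort_as_half(cur)) + __half2float(val);
        unsigned short upd = __half_as_ushort(__float2half(sum));
        unsigned int word = high_half ? (old & 0x0000ffffu) | ((unsigned int)upd << 16)
                                      : (old & 0xffff0000u) | upd;
        old = atomicCAS(address_as_ui, assumed, word);
    } while (assumed != old);
    return __ushort_as_half(high_half ? (unsigned short)(old >> 16)
                                      : (unsigned short)(old & 0xffff));
}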
candle-kernels/src/indexing.cu

Lines changed: 15 additions & 1 deletion
@@ -25,6 +25,18 @@ constexpr uint8_t max_value<uint8_t>() {
     return 0xFFu;
 }
 
+template <>
+__host__ __device__
+constexpr int32_t max_value<int32_t>() {
+    return 0x7FFFFFFF;
+}
+
+template <>
+__host__ __device__
+constexpr int16_t max_value<int16_t>() {
+    return 0x7FFF;
+}
+
 template<typename T, typename I>
 __device__ void index_select(
     const size_t numel,
@@ -134,7 +146,7 @@ __device__ void index_add(
     }
 }
 
-#if __CUDA_ARCH__ >= 800
+#if __CUDA_ARCH__ >= 890
 #define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3))
 
 template<typename I>
@@ -311,7 +323,9 @@ SA_OP(__nv_bfloat16, uint8_t, sa_u8_bf16)
 S_OP(__nv_bfloat16, int64_t, s_i64_bf16)
 S_OP(__nv_bfloat16, uint32_t, s_u32_bf16)
 S_OP(__nv_bfloat16, uint8_t, s_u8_bf16)
+#endif
 
+#if __CUDA_ARCH__ >= 890
 IS_OP(__nv_fp8_e4m3, int16_t, is_i16_f8_e4m3)
 IS_OP(__nv_fp8_e4m3, int32_t, is_i32_f8_e4m3)
 IS_OP(__nv_fp8_e4m3, int64_t, is_i64_f8_e4m3)
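Two things change in indexing.cu: the guard in front of the fp8 kernels moves from __CUDA_ARCH__ >= 800 to >= 890 (presumably because the e4m3 path targets compute capability 8.9, i.e. Ada, and newer), and the fp8 instantiations get their own #if block so the bf16 S_OP/SA_OP instantiations above them no longer sit behind the stricter guard. The new max_value<int32_t> / max_value<int16_t> specializations fill in the i32/i16 index types alongside the existing ones. For orientation only, the sketch below expands what the existing F8E4M3_TO_FLOAT macro does; the function name is made up here and this is not the kernel source.

// Illustrative sketch, not candle's kernel: what the F8E4M3_TO_FLOAT macro expands to,
// written out as a function using the conversion helpers from cuda_fp8.h / cuda_fp16.h.
#include <cuda_fp8.h>
#include <cuda_fp16.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 890  // mirrors the guard used above
__device__ float f8_e4m3_to_float(__nv_fp8_e4m3 v) {
    // v.__x holds the raw 8-bit storage; __nv_cvt_fp8_to_halfraw decodes it as e4m3
    // into a __half_raw, and __half2float widens that to a regular float.
    __half_raw h = __nv_cvt_fp8_to_halfraw(v.__x, __NV_E4M3);
    return __half2float(h);
}
#endif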
