// Compilation: nvcc -arch=sm_100a -std=c++17 test_nvfp4_rounding.cu -o test_nvfp4_rounding
// For Godbolt: Add flags: -arch=sm_100a -std=c++17
// Note: the cvt.*.e2m1x2 conversion instruction requires a Blackwell-class target (e.g. sm_100a);
//       it is not available on earlier architectures such as sm_90

#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <iostream>
#include <vector>
#include <cmath>
#include <iomanip>
#include <cstring>

// Convert float to bfloat16 and back (round-to-nearest-even, matching hardware bf16 conversion)
float to_bfloat16_and_back(float val) {
    uint32_t bits;
    std::memcpy(&bits, &val, sizeof(bits));  // avoid strict-aliasing issues
    bits += 0x7FFF + ((bits >> 16) & 1);     // round to nearest, ties to even
    bits &= 0xFFFF0000;                      // keep only the bfloat16 bits
    std::memcpy(&val, &bits, sizeof(val));
    return val;
}
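
// Small debugging aid (not part of the original test): prints the raw IEEE-754 bit pattern
// of a float, which makes it easy to see exactly which bits to_bfloat16_and_back() keeps
// and which it rounds away.
void print_float_bits(const char* label, float val) {
    uint32_t bits;
    std::memcpy(&bits, &val, sizeof(bits));
    std::cout << label << ": " << val << " (0x" << std::hex << std::setw(8)
              << std::setfill('0') << bits << std::dec << std::setfill(' ') << ")\n";
}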

// FP4 E2M1 format decoder
void decode_fp4_e2m1(uint8_t fp4, float& value, int& sign, int& exp, int& mantissa) {
    sign = (fp4 >> 3) & 1;
    exp = (fp4 >> 1) & 3;
    mantissa = fp4 & 1;

    // E2M1 decoding with bias = 1
    if (exp == 0) {
        // Denormal or zero
        if (mantissa == 0) {
            value = 0.0f;
        } else {
            value = (sign ? -1.0f : 1.0f) * 0.5f;  // denormal: 2^(1-bias) * (m/2) = 0.5
        }
    } else {
        // Normal number: (-1)^s * 2^(e-1) * (1 + m/2)
        float mantissa_val = 1.0f + mantissa * 0.5f;
        value = (sign ? -1.0f : 1.0f) * std::pow(2.0f, exp - 1) * mantissa_val;
    }
}
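
// For reference (not part of the original test): the eight non-negative values an E2M1 code
// can decode to, indexed by the low 3 bits of the code. Negative codes 0x8-0xF mirror these
// with the sign bit set. Derived from decode_fp4_e2m1() above.
constexpr float kE2M1Values[8] = {
    0.0f,  // 0x0: exp=0, m=0 (zero)
    0.5f,  // 0x1: exp=0, m=1 (denormal)
    1.0f,  // 0x2: exp=1, m=0
    1.5f,  // 0x3: exp=1, m=1
    2.0f,  // 0x4: exp=2, m=0
    3.0f,  // 0x5: exp=2, m=1
    4.0f,  // 0x6: exp=3, m=0
    6.0f   // 0x7: exp=3, m=1
};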

// Test kernel for single value
__global__ void test_single_value_kernel(
    float input,
    uint8_t* cuda_result,
    float* debug_info
) {
    // CUDA intrinsic conversion
    uint32_t packed_result;
    float dummy = 0.0f;  // Second value for x2 conversion

    asm volatile(
        "{\n\t"
        ".reg .b8 byte0;\n\t"
        ".reg .b32 result;\n\t"
        "cvt.rn.satfinite.e2m1x2.f32 byte0, %1, %2;\n\t"
        "mov.b32 result, {byte0, 0, 0, 0};\n\t"
        "mov.b32 %0, result;\n\t"
        "}"
        : "=r"(packed_result)
        : "f"(input), "f"(dummy)
    );

    // Extract the FP4 values
    cuda_result[0] = (packed_result >> 4) & 0xF;  // High nibble
    cuda_result[1] = packed_result & 0xF;         // Low nibble (dummy)

    // Store debug info
    debug_info[0] = input;
}
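
// Sketch (not part of the original test): the same PTX instruction converts two floats per
// output byte, so an array can be quantized pairwise. This reuses the packing convention
// from test_single_value_kernel above, where the first source operand lands in the high
// nibble of the result byte.
__global__ void convert_pairs_kernel(const float* in, uint8_t* out, int num_pairs) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= num_pairs) return;

    uint32_t packed;
    asm volatile(
        "{\n\t"
        ".reg .b8 byte0;\n\t"
        ".reg .b32 result;\n\t"
        "cvt.rn.satfinite.e2m1x2.f32 byte0, %1, %2;\n\t"
        "mov.b32 result, {byte0, 0, 0, 0};\n\t"
        "mov.b32 %0, result;\n\t"
        "}"
        : "=r"(packed)
        : "f"(in[2 * i]), "f"(in[2 * i + 1])
    );
    out[i] = static_cast<uint8_t>(packed & 0xFF);  // one packed e2m1x2 byte per pair of inputs
}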

// Manual FP4 conversion matching PyTorch behavior
uint8_t pytorch_style_fp4_convert(float val) {
    constexpr float F4_E2M1_MAX = 6.0f;

    // Clamp to FP4 range
    val = std::fmax(-F4_E2M1_MAX, std::fmin(F4_E2M1_MAX, val));

    if (val == 0.0f) return 0;

    uint32_t bits;
    std::memcpy(&bits, &val, sizeof(bits));
    uint32_t sign = (bits >> 31) & 1;
    int32_t exp = ((bits >> 23) & 0xFF) - 127;
    uint32_t mantissa = bits & 0x7FFFFF;

    // |val| < 0.25: closer to zero than to the smallest nonzero code (0.5)
    if (exp < -2) return sign << 3;

    // 0.25 <= |val| < 0.5: candidates are 0 and the denormal 0.5; the midpoint is 0.25
    if (exp == -2) {
        if (mantissa != 0) {
            return (sign << 3) | 0x1;  // strictly above 0.25 rounds to 0.5
        }
        return sign << 3;              // exactly 0.25 ties to the even code (0)
    }

    // 0.5 <= |val| < 1.0: candidates are the denormal 0.5 and the normal 1.0; the midpoint is 0.75
    if (exp == -1) {
        if (mantissa >= 0x400000) {
            return (sign << 3) | 0x2;  // >= 0.75 rounds to 1.0 (the tie at 0.75 goes to the even code)
        }
        return (sign << 3) | 0x1;      // rounds to 0.5
    }

    // Normal numbers (exp cannot exceed 2 after clamping, but keep the guard)
    if (exp > 2) {
        return (sign << 3) | 0x7;      // saturate to ±6.0
    }

    // Round the mantissa to 1 bit: round to nearest, ties to even
    uint32_t mantissa_bit = (mantissa >> 22) & 1;
    uint32_t round_bit = (mantissa >> 21) & 1;
    uint32_t sticky = (mantissa & 0x1FFFFF) != 0;

    if (round_bit && (mantissa_bit || sticky)) {
        mantissa_bit++;
        if (mantissa_bit > 1) {
            mantissa_bit = 0;
            exp++;
            if (exp > 2) {
                return (sign << 3) | 0x7;  // overflow saturates to ±6.0
            }
        }
    }

    return (sign << 3) | ((exp + 1) << 1) | mantissa_bit;
}
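
// Sanity-check sketch (not part of the original test): every exactly representable E2M1 value
// should survive a round trip through pytorch_style_fp4_convert() and decode_fp4_e2m1().
// Can be called at the start of main() if desired.
bool check_fp4_round_trip() {
    bool ok = true;
    for (uint8_t code = 0; code <= 15; code++) {
        if (code == 0x8) continue;  // -0 decodes to 0.0f, which re-encodes as +0
        float decoded;
        int sign, exp, mantissa;
        decode_fp4_e2m1(code, decoded, sign, exp, mantissa);
        uint8_t re_encoded = pytorch_style_fp4_convert(decoded);
        if (re_encoded != code) {
            std::cout << "Round-trip failure: 0x" << std::hex << (int)code << std::dec
                      << " -> " << decoded << " -> 0x" << std::hex << (int)re_encoded
                      << std::dec << "\n";
            ok = false;
        }
    }
    return ok;
}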

int main() {
    // Test the specific problematic bfloat16 values from the failing test
    std::vector<float> test_values = {
        1.171875f,   // From index 1011
        -1.171875f,  // From index 4941
        -0.585938f,  // From index 8192
        0.585938f,   // From index 28410
        2.5f,        // Additional test: tie between 2.0 and 3.0
        -2.5f,       // Additional test
        1.25f,       // Edge case: tie between 1.0 and 1.5
        -1.25f,      // Edge case
        0.75f,       // Tie between 0.5 and 1.0
        -0.75f       // Tie between -0.5 and -1.0
    };

    std::cout << "Testing NVFP4 quantization behavior with bfloat16 values\n";
    std::cout << "=========================================================\n\n";
    for (float orig_val : test_values) {
        // Convert to bfloat16 and back
        float bf16_val = to_bfloat16_and_back(orig_val);

        // Allocate device memory
        uint8_t* d_cuda_result;
        float* d_debug_info;
        cudaMalloc(&d_cuda_result, 2 * sizeof(uint8_t));
        cudaMalloc(&d_debug_info, sizeof(float));

        // Run kernel
        test_single_value_kernel<<<1, 1>>>(bf16_val, d_cuda_result, d_debug_info);
        cudaDeviceSynchronize();

        // Get results
        uint8_t h_cuda_result[2];
        float h_debug_info;
        cudaMemcpy(h_cuda_result, d_cuda_result, 2 * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        cudaMemcpy(&h_debug_info, d_debug_info, sizeof(float), cudaMemcpyDeviceToHost);

        // Manual conversion
        uint8_t pytorch_result = pytorch_style_fp4_convert(bf16_val);

        // Decode FP4 values back to float
        float cuda_decoded, pytorch_decoded;
        int cuda_sign, cuda_exp, cuda_mantissa;
        int pt_sign, pt_exp, pt_mantissa;

        decode_fp4_e2m1(h_cuda_result[0], cuda_decoded, cuda_sign, cuda_exp, cuda_mantissa);
        decode_fp4_e2m1(pytorch_result, pytorch_decoded, pt_sign, pt_exp, pt_mantissa);

        std::cout << std::fixed << std::setprecision(6);
        std::cout << "Original value: " << orig_val << " → BF16: " << bf16_val << "\n";
        std::cout << "CUDA intrinsic result:\n";
        std::cout << "  FP4 bits: 0x" << std::hex << (int)h_cuda_result[0] << std::dec
                  << " (s=" << cuda_sign << ", e=" << cuda_exp << ", m=" << cuda_mantissa << ")\n";
        std::cout << "  Decoded: " << cuda_decoded << "\n";
        std::cout << "  Error: " << std::abs(bf16_val - cuda_decoded) << "\n";

        std::cout << "PyTorch-style result:\n";
        std::cout << "  FP4 bits: 0x" << std::hex << (int)pytorch_result << std::dec
                  << " (s=" << pt_sign << ", e=" << pt_exp << ", m=" << pt_mantissa << ")\n";
        std::cout << "  Decoded: " << pytorch_decoded << "\n";
        std::cout << "  Error: " << std::abs(bf16_val - pytorch_decoded) << "\n";

        if (h_cuda_result[0] != pytorch_result) {
            std::cout << ">>> MISMATCH! Difference: " << (int)h_cuda_result[0] - (int)pytorch_result << "\n";
            std::cout << ">>> CUDA chose: " << cuda_decoded << " (error=" << std::abs(bf16_val - cuda_decoded) << ")\n";
            std::cout << ">>> PyTorch chose: " << pytorch_decoded << " (error=" << std::abs(bf16_val - pytorch_decoded) << ")\n";
        }
        std::cout << "\n";

        // Cleanup
        cudaFree(d_cuda_result);
        cudaFree(d_debug_info);
    }

    // Test rounding behavior around critical values
    std::cout << "\nDetailed rounding analysis for 1.171875:\n";
    std::cout << "==========================================\n";
    float val = 1.171875f;
    float bf16_val = to_bfloat16_and_back(val);

    std::cout << "BF16 value: " << bf16_val << "\n";
    std::cout << "Possible FP4 E2M1 representations:\n";

    // Show all nearby FP4 values
    for (uint8_t fp4 = 0; fp4 <= 15; fp4++) {
        float decoded;
        int sign, exp, mantissa;
        decode_fp4_e2m1(fp4, decoded, sign, exp, mantissa);
        float error = std::abs(bf16_val - decoded);

        if (error < 2.0f) {  // Only show nearby values
            std::cout << "  FP4=0x" << std::hex << (int)fp4 << std::dec
                      << " → " << decoded
                      << " (error=" << error << ")";
            if (fp4 == 2) std::cout << "  <- PyTorch chooses this (1.0)";
            if (fp4 == 3) std::cout << "  <- CUDA intrinsic chooses this (1.5)";
            std::cout << "\n";
        }
    }

    // Analyze the tie-breaking
    std::cout << "\nThe value 1.171875 lies between the representable FP4 values 1.0 and 1.5\n";
    std::cout << "Distance to 1.0: " << std::abs(1.171875f - 1.0f) << "\n";
    std::cout << "Distance to 1.5: " << std::abs(1.171875f - 1.5f) << "\n";
    std::cout << "When a value lands exactly halfway between two representable codes,\n";
    std::cout << "the rounding rule decides the result: the CUDA intrinsic appears to round up,\n";
    std::cout << "while PyTorch rounds ties to even.\n";

    return 0;
}