diff --git a/CMakeLists.txt b/CMakeLists.txt index 4af6f8d..4acc9e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,7 +53,7 @@ foreach(EXAMPLE_SOURCE ${EXAMPLE_SOURCES}) # CUDA properties provided by CMAKE set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 100) + set_target_properties(${EXAMPLE_NAME} PROPERTIES CUDA_ARCHITECTURES 100a) # Convert the flags string into a list of flags separate_arguments(EXTRA_CUDA_FLAGS_LIST UNIX_COMMAND "${EXTRA_CUDA_FLAGS}") diff --git a/examples/misc/atomic_max.cu b/examples/misc/atomic_max.cu index 7fa1aba..d78de31 100644 --- a/examples/misc/atomic_max.cu +++ b/examples/misc/atomic_max.cu @@ -2,7 +2,7 @@ #include #include -__device__ __forceinline__ float atomicMaxFloatBAD(float *addr, float value) { +__device__ __forceinline__ void atomicMaxFloatBAD(float *addr, float value) { // source: https://stackoverflow.com/a/51549250 (value >= 0) ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) diff --git a/examples/test_nvfp4_rounding.cu b/examples/test_nvfp4_rounding.cu new file mode 100644 index 0000000..de4dbe9 --- /dev/null +++ b/examples/test_nvfp4_rounding.cu @@ -0,0 +1,336 @@ +// Compilation: nvcc -arch=sm_90 -std=c++17 test_nvfp4_rounding.cu -o test_nvfp4_rounding +// For Godbolt: Add flags: -arch=sm_90 -std=c++17 +// Note: Requires SM 9.0+ for FP4 E2M1 intrinsics + +#include +#include +#include +#include +#include +#include +#include + +// FP4 E2M1 lookup table +const float fp4_e2m1_lut[16] = { + 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f, + -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f +}; + +// Simple test kernel to verify PTX instruction works +__global__ void test_ptx_instruction() { + // Test with known values + float test_vals[4] = {1.0f, 2.0f, 3.0f, 4.0f}; + unsigned int results[4]; + + for (int i = 0; i < 4; i++) { + asm volatile ( + "{\n\t" + " .reg .b8 fp4_byte;\n\t" + " cvt.rn.satfinite.e2m1x2.f32 fp4_byte, %1, %2;\n\t" + " cvt.u32.u8 %0, fp4_byte;\n\t" + "}\n\t" + : "=r"(results[i]) + : "f"(test_vals[i]), "f"(0.0f) + ); + } + + // Print results (only thread 0) + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("PTX Instruction Test:\n"); + for (int i = 0; i < 4; i++) { + uint8_t byte = results[i] & 0xFF; + uint8_t low = byte & 0xF; + uint8_t high = (byte >> 4) & 0xF; + printf(" %.1f → byte: 0x%02x (low: 0x%x, high: 0x%x)\n", + test_vals[i], byte, low, high); + } + printf("\n"); + } +} + +// Test kernel - correct PTX usage based on NVIDIA docs +__global__ void test_fp4_conversion_kernel( + float* inputs, + uint8_t* fp4_outputs, + uint8_t* raw_bytes, + int count +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= count) return; + + float val1 = inputs[idx]; + float val2 = 0.0f; // dummy second value + + // The cvt instruction outputs a single byte containing two FP4 values + // We need to use inline PTX with proper register declarations + unsigned int result = 0; + + asm volatile ( + "{\n\t" + " .reg .b8 fp4_byte;\n\t" + " cvt.rn.satfinite.e2m1x2.f32 fp4_byte, %1, %2;\n\t" + " cvt.u32.u8 %0, fp4_byte;\n\t" // Convert byte to 32-bit for output + "}\n\t" + : "=r"(result) + : "f"(val1), "f"(val2) + ); + + // Extract the byte + uint8_t byte_result = result & 0xFF; + raw_bytes[idx] = byte_result; + + // The e2m1x2 format packs two FP4 values: + // First float → ??? (need to determine which nibble) + // Second float → ??? (need to determine which nibble) + + // For now, store the full byte and we'll analyze it on the host + fp4_outputs[idx] = byte_result; +} + +int main() { + std::cout << "Testing CUDA FP4 E2M1 Rounding Behavior\n"; + std::cout << "=======================================\n\n"; + + // First, run a simple test to verify PTX instruction works + test_ptx_instruction<<<1, 1>>>(); + cudaDeviceSynchronize(); + + // Print all possible E2M1 values + std::cout << "All possible FP4 E2M1 values:\n"; + std::cout << "-----------------------------\n"; + std::cout << "FP4 Binary Value\n"; + std::cout << "--- ------ -----\n"; + + for (uint8_t i = 0; i <= 15; i++) { + std::cout << "0x" << std::hex << (int)i << std::dec + << " 0b" << ((i>>3)&1) << ((i>>2)&1) << ((i>>1)&1) << (i&1) + << " " << std::setw(5) << fp4_e2m1_lut[i] << "\n"; + } + std::cout << "\n"; + + // Test values - focusing on tie cases + std::vector test_values = { + // Perfect tie cases + 0.75f, // Exactly between 0.5 and 1.0 + 1.25f, // Exactly between 1.0 and 1.5 + 1.75f, // Exactly between 1.5 and 2.0 + 2.5f, // Exactly between 2.0 and 3.0 + 3.5f, // Exactly between 3.0 and 4.0 + 5.0f, // Exactly between 4.0 and 6.0 + // Negative ties + -0.75f, -1.25f, -1.75f, -2.5f, -3.5f, -5.0f, + // Non-ties for comparison + 0.6f, 1.1f, 2.2f, 2.8f + }; + + // Allocate memory + float* d_inputs; + uint8_t* d_outputs; + uint8_t* d_raw_bytes; + cudaMalloc(&d_inputs, test_values.size() * sizeof(float)); + cudaMalloc(&d_outputs, test_values.size() * sizeof(uint8_t)); + cudaMalloc(&d_raw_bytes, test_values.size() * sizeof(uint8_t)); + + // Copy inputs + cudaMemcpy(d_inputs, test_values.data(), + test_values.size() * sizeof(float), + cudaMemcpyHostToDevice); + + // Run kernel + int threads = 256; + int blocks = (test_values.size() + threads - 1) / threads; + test_fp4_conversion_kernel<<>>(d_inputs, d_outputs, d_raw_bytes, test_values.size()); + cudaDeviceSynchronize(); + + // Get results + std::vector h_outputs(test_values.size()); + std::vector h_raw_bytes(test_values.size()); + cudaMemcpy(h_outputs.data(), d_outputs, + h_outputs.size() * sizeof(uint8_t), + cudaMemcpyDeviceToHost); + cudaMemcpy(h_raw_bytes.data(), d_raw_bytes, + h_raw_bytes.size() * sizeof(uint8_t), + cudaMemcpyDeviceToHost); + + // Debug: Show raw bytes and figure out nibble ordering + std::cout << "\nDebug: Raw bytes from PTX instruction:\n"; + std::cout << "--------------------------------------\n"; + + for (size_t i = 0; i < 6 && i < test_values.size(); i++) { + uint8_t raw = h_raw_bytes[i]; + uint8_t low = raw & 0xF; + uint8_t high = (raw >> 4) & 0xF; + + std::cout << "Input: " << std::setw(6) << test_values[i] + << " → Raw byte: 0x" << std::hex << std::setw(2) << std::setfill('0') << (int)raw << std::dec; + + float decoded_low = fp4_e2m1_lut[low]; + float decoded_high = fp4_e2m1_lut[high]; + float error_low = std::abs(test_values[i] - decoded_low); + float error_high = std::abs(test_values[i] - decoded_high); + + std::cout << "\n Low nibble: 0x" << std::hex << (int)low << std::dec + << " = " << std::setw(5) << decoded_low + << " (error: " << error_low << ")"; + if (error_low < error_high) std::cout << " ← likely match"; + + std::cout << "\n High nibble: 0x" << std::hex << (int)high << std::dec + << " = " << std::setw(5) << decoded_high + << " (error: " << error_high << ")"; + if (error_high < error_low) std::cout << " ← likely match"; + + std::cout << std::setfill(' ') << "\n\n"; + } + + // Determine which nibble to use based on errors + std::cout << "Determining nibble assignment...\n"; + float total_error_low = 0, total_error_high = 0; + int count_low_better = 0, count_high_better = 0; + + for (size_t i = 0; i < test_values.size(); i++) { + uint8_t raw = h_raw_bytes[i]; + uint8_t low = raw & 0xF; + uint8_t high = (raw >> 4) & 0xF; + + float error_low = std::abs(test_values[i] - fp4_e2m1_lut[low]); + float error_high = std::abs(test_values[i] - fp4_e2m1_lut[high]); + + total_error_low += error_low; + total_error_high += error_high; + + if (error_low < error_high) count_low_better++; + else if (error_high < error_low) count_high_better++; + } + + bool use_low_nibble = (total_error_low <= total_error_high); + std::cout << "Total error using low nibble: " << total_error_low << "\n"; + std::cout << "Total error using high nibble: " << total_error_high << "\n"; + std::cout << "Decision: Use " << (use_low_nibble ? "LOW" : "HIGH") << " nibble for first float\n\n"; + + // Update outputs based on decision + for (size_t i = 0; i < h_outputs.size(); i++) { + if (use_low_nibble) { + h_outputs[i] = h_raw_bytes[i] & 0xF; + } else { + h_outputs[i] = (h_raw_bytes[i] >> 4) & 0xF; + } + } + + // Analyze results + std::cout << "Conversion Results:\n"; + std::cout << "==================\n"; + std::cout << std::setw(8) << "Input" + << std::setw(8) << "FP4" + << std::setw(10) << "Decoded" + << std::setw(10) << "Error" + << std::setw(20) << "Comment\n"; + + for (size_t i = 0; i < test_values.size(); i++) { + float input = test_values[i]; + uint8_t fp4 = h_outputs[i]; + float decoded = fp4_e2m1_lut[fp4]; + float error = std::abs(input - decoded); + + std::cout << std::fixed << std::setprecision(3); + std::cout << std::setw(8) << input + << std::setw(8) << "0x" << std::hex << (int)fp4 << std::dec + << std::setw(10) << decoded + << std::setw(10) << error; + + // Check for tie cases + bool is_tie = false; + float option1 = 0, option2 = 0; + for (int j = 0; j < 16; j++) { + for (int k = j+1; k < 16; k++) { + if (std::abs(std::abs(input - fp4_e2m1_lut[j]) - + std::abs(input - fp4_e2m1_lut[k])) < 1e-6f && + std::abs(input - fp4_e2m1_lut[j]) < 0.51f) { + is_tie = true; + option1 = fp4_e2m1_lut[j]; + option2 = fp4_e2m1_lut[k]; + if (option1 > option2) std::swap(option1, option2); + } + } + } + + if (is_tie) { + std::cout << " (tie: " << option1 << " vs " << option2 << ")"; + } + std::cout << "\n"; + } + + // Detailed tie analysis + std::cout << "\n\nTie-Breaking Analysis:\n"; + std::cout << "=====================\n"; + std::cout << "For round-to-nearest-even, ties should go to the value with even mantissa (m=0)\n\n"; + + std::cout << std::setw(8) << "Input" + << std::setw(15) << "Options" + << std::setw(10) << "Result" + << std::setw(10) << "Mantissa" + << std::setw(20) << "Round-to-even?\n"; + + // Analyze specific tie cases + std::vector tie_values = {0.75f, 1.25f, 1.75f, 2.5f, 3.5f, 5.0f}; + std::vector> tie_options = { + {0.5f, 1.0f}, {1.0f, 1.5f}, {1.5f, 2.0f}, + {2.0f, 3.0f}, {3.0f, 4.0f}, {4.0f, 6.0f} + }; + + for (size_t i = 0; i < tie_values.size(); i++) { + // Find the result in our test + for (size_t j = 0; j < test_values.size(); j++) { + if (std::abs(test_values[j] - tie_values[i]) < 1e-6f) { + uint8_t fp4 = h_outputs[j]; + float decoded = fp4_e2m1_lut[fp4]; + int mantissa = fp4 & 1; + + std::cout << std::setw(8) << tie_values[i] + << std::setw(15) << tie_options[i].first + << " vs " << tie_options[i].second + << std::setw(10) << decoded + << std::setw(10) << "m=" << mantissa + << std::setw(20) << (mantissa == 0 ? "Yes ✓" : "No ✗") + << "\n"; + break; + } + } + } + + // Special focus on 2.5 + std::cout << "\n\n2.5 Detailed Analysis:\n"; + std::cout << "=====================\n"; + for (size_t i = 0; i < test_values.size(); i++) { + if (std::abs(test_values[i] - 2.5f) < 1e-6f) { + uint8_t fp4 = h_outputs[i]; + float decoded = fp4_e2m1_lut[fp4]; + + std::cout << "Input: 2.5\n"; + std::cout << "FP4 encoding: 0x" << std::hex << (int)fp4 << std::dec + << " (0b" << ((fp4>>3)&1) << ((fp4>>2)&1) + << ((fp4>>1)&1) << (fp4&1) << ")\n"; + std::cout << "Decoded value: " << decoded << "\n"; + std::cout << "Mantissa bit: " << (fp4 & 1) << "\n\n"; + + std::cout << "Options were:\n"; + std::cout << " 2.0 (0x4, m=0) - even mantissa\n"; + std::cout << " 3.0 (0x5, m=1) - odd mantissa\n\n"; + + if (fp4 == 0x4) { + std::cout << "✓ Correctly rounded to even (2.0)\n"; + } else if (fp4 == 0x5) { + std::cout << "✗ Rounded to odd (3.0) - NOT round-to-nearest-even\n"; + } else { + std::cout << "! Unexpected result\n"; + } + break; + } + } + + // Cleanup + cudaFree(d_inputs); + cudaFree(d_outputs); + cudaFree(d_raw_bytes); + + return 0; +}