Commit 1969e36

Fix cuBLAS library path resolution, fix FP4 quantization precision in the Triton and eager backends, and boost Triton FP4 quantization performance
1 parent adeefec commit 1969e36

File tree: 5 files changed (+82, -85 lines)

comfy_kitchen/backends/cuda/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def find_lib_dir(start_dir, lib_pattern):
                 return root
     return None
 
-nvidia_cu13_path = os.path.dirname(nvidia.cu13.__path__[0])
+nvidia_cu13_path = nvidia.cu13.__path__[0]
 
 if sys.platform == "win32":
     lib_dir = find_lib_dir(nvidia_cu13_path, "cublasLt64")
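For context on the one-line change above, a small sketch (not from the commit; it assumes the usual pip layout where the nvidia-cu13 wheel installs under site-packages/nvidia/cu13) showing what the two expressions resolve to. The commit moves the search root from the parent nvidia directory down into the cu13 package itself, which is where find_lib_dir is expected to locate the cuBLAS libraries:

# Hypothetical layout assumed here:
#   .../site-packages/nvidia/cu13/...  containing cublasLt64_*.dll or libcublasLt.so.*
import os
import nvidia.cu13

pkg_dir = nvidia.cu13.__path__[0]   # .../site-packages/nvidia/cu13  (new search root)
old_dir = os.path.dirname(pkg_dir)  # .../site-packages/nvidia       (old search root)

print("new:", pkg_dir)
print("old:", old_dir)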

comfy_kitchen/backends/eager/quantization.py

Lines changed: 11 additions & 8 deletions
@@ -10,7 +10,6 @@
 
 from comfy_kitchen.float_utils import (
     F4_E2M1_MAX,
-    F8_E4M3_EPS,
     F8_E4M3_MAX,
     F8_E5M2_MAX,
     _f32_to_floatx_unpacked,
@@ -82,15 +81,19 @@ def quantize_nvfp4(
 
     x = x.reshape(orig_shape[0], -1, block_size)
     max_abs = torch.amax(torch.abs(x), dim=-1)
-    block_scale = max_abs / F4_E2M1_MAX
-    block_scale_fp32 = block_scale.to(torch.float32)
-    scaled_block_scales = block_scale_fp32 / per_tensor_scale
-    scaled_block_scales_fp8 = torch.clamp(scaled_block_scales, min=F8_E4M3_EPS, max=F8_E4M3_MAX)
+    block_scale = max_abs.to(torch.float32) / F4_E2M1_MAX
+    scaled_block_scales = block_scale / per_tensor_scale
+    scaled_block_scales_fp8 = torch.clamp(scaled_block_scales, max=F8_E4M3_MAX)
     scaled_block_scales_fp32 = _float8_round(scaled_block_scales_fp8)
-    # We "temporarily" dequant the scaled_block_scales_fp32 to get the per_tensor_scale
-    # To apply to data
     total_scale = per_tensor_scale * scaled_block_scales_fp32
-    data_scaled = x / total_scale.unsqueeze(-1)
+
+    # Handle zero blocks (from padding): avoid 0/0 NaN
+    zero_scale_mask = (total_scale == 0)
+    total_scale_safe = torch.where(zero_scale_mask, torch.ones_like(total_scale), total_scale)
+
+    data_scaled = x.float() / total_scale_safe.unsqueeze(-1)
+    data_scaled = torch.where(zero_scale_mask.unsqueeze(-1), torch.zeros_like(data_scaled), data_scaled)
+
     out_scales = scaled_block_scales_fp8
 
     data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX)
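The rewrite above keeps the block arithmetic in float32, drops the lower clamp to F8_E4M3_EPS, and masks blocks whose total scale is zero (all-zero blocks introduced by padding) so the division no longer yields NaN. Below is a self-contained sketch of that two-level scaling, using a float8_e4m3fn round trip as a stand-in for the library's _float8_round helper; the constant values and the block size of 16 are assumptions taken from the NVFP4 format, not from this file:

import torch

F4_E2M1_MAX = 6.0      # largest FP4 E2M1 magnitude (assumed value)
F8_E4M3_MAX = 448.0    # largest FP8 E4M3 magnitude (assumed value)

def nvfp4_scale_blocks(x, per_tensor_scale, block_size=16):
    x = x.reshape(x.shape[0], -1, block_size)
    max_abs = torch.amax(torch.abs(x), dim=-1)

    # Per-block scale expressed relative to the per-tensor scale, stored as FP8 E4M3.
    block_scale = max_abs.to(torch.float32) / F4_E2M1_MAX
    scaled = torch.clamp(block_scale / per_tensor_scale, max=F8_E4M3_MAX)
    scaled_fp8 = scaled.to(torch.float8_e4m3fn)                  # stand-in for _float8_round
    total_scale = per_tensor_scale * scaled_fp8.to(torch.float32)

    # All-zero blocks (e.g. padding) give total_scale == 0, so 0/0 would be NaN.
    zero = total_scale == 0
    safe = torch.where(zero, torch.ones_like(total_scale), total_scale)
    data = x.float() / safe.unsqueeze(-1)
    data = torch.where(zero.unsqueeze(-1), torch.zeros_like(data), data)
    return torch.clamp(data, -F4_E2M1_MAX, F4_E2M1_MAX), scaled_fp8

x = torch.zeros(2, 32)            # all-zero blocks are the worst case for the old path
x[0, :16] = torch.randn(16) * 4
per_tensor = (x.abs().max() / (F8_E4M3_MAX * F4_E2M1_MAX)).to(torch.float32)
data_scaled, block_scales = nvfp4_scale_blocks(x, per_tensor)
assert not torch.isnan(data_scaled).any()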

comfy_kitchen/backends/triton/quantization.py

Lines changed: 26 additions & 31 deletions
@@ -262,8 +262,6 @@ def quantize_nvfp4_kernel_tl(
 
     # Scale block scale to FP8
     scaled_block_scale = block_scale / per_tensor_scale
-    # Clamp to [F8_E4M3_EPS, F8_E4M3_MAX] = [0.125, 448.0]
-    scaled_block_scale = tl.maximum(scaled_block_scale, 0.125)
     scaled_block_scale = tl.minimum(scaled_block_scale, 448.0)
 
     # Round to FP8 precision
@@ -280,44 +278,41 @@ def quantize_nvfp4_kernel_tl(
     # Calculate total scale for data quantization
     scaled_block_scale_fp32 = scaled_block_scale_fp8.to(tl.float32)
     total_scale = per_tensor_scale * scaled_block_scale_fp32
-    total_scale = tl.where(total_scale < 1e-10, 1.0, total_scale)
+    zero_scale_mask = total_scale < 1e-10
+    total_scale = tl.where(zero_scale_mask, 1.0, total_scale)
 
-    # Scale and clamp data
+    # Scale data (satfinite modifier in PTX will handle clamping)
     data_scaled = x / total_scale
-    data_scaled = tl.maximum(data_scaled, -6.0)  # -F4_E2M1_MAX
-    data_scaled = tl.minimum(data_scaled, 6.0)  # F4_E2M1_MAX
-
-    # Quantize to FP4 values and pack - optimized version
-    # Convert all values to FP4 representation
-    sign_all = tl.where(data_scaled < 0, 1, 0)
-    abs_all = tl.abs(data_scaled)
-
-    # Map all to FP4 bit pattern (E2M1)
-    q_all = tl.where(abs_all <= 0.25, 0,
-            tl.where(abs_all < 0.75, 1,
-            tl.where(abs_all <= 1.25, 2,
-            tl.where(abs_all < 1.75, 3,
-            tl.where(abs_all <= 2.5, 4,
-            tl.where(abs_all < 3.5, 5,
-            tl.where(abs_all <= 5.0, 6, 7)))))))
-
-    # Add sign bits to all values
-    fp4_all = (sign_all.to(tl.int32) << 3) | q_all.to(tl.int32)
-
-    # Pack consecutive pairs of FP4 values
-    # fp4_all has 16 elements: [v0, v1, v2, v3, ..., v15]
+    data_scaled = tl.where(zero_scale_mask, 0.0, data_scaled)
+
     # We want to pack: (v0,v1), (v2,v3), ..., (v14,v15)
     pair_idx = tl.arange(0, block_size // 2)
     even_idx = pair_idx * 2
     odd_idx = pair_idx * 2 + 1
 
     # Extract even and odd elements using one-hot selection
     indices = tl.arange(0, block_size)
-    fp4_even = tl.sum(tl.where(indices == even_idx[:, None], fp4_all, 0), axis=1)
-    fp4_odd = tl.sum(tl.where(indices == odd_idx[:, None], fp4_all, 0), axis=1)
-
-    # Pack two 4-bit values into one uint8
-    packed_bytes = ((fp4_even << 4) | fp4_odd).to(tl.uint8)
+    f32_even = tl.sum(tl.where(indices == even_idx[:, None], data_scaled, 0), axis=1)
+    f32_odd = tl.sum(tl.where(indices == odd_idx[:, None], data_scaled, 0), axis=1)
+
+    packed_bytes_u16 = tl.inline_asm_elementwise(
+        asm="""
+        {
+            .reg .b8 fp4_byte;
+            .reg .b16 result;
+            cvt.rn.satfinite.e2m1x2.f32 fp4_byte, $1, $2;
+            mov.b16 result, {fp4_byte, 0};
+            mov.u16 $0, result;
+        }
+        """,
+        constraints="=h,f,f",
+        args=[f32_even, f32_odd],
+        dtype=tl.uint16,
+        is_pure=True,
+        pack=1,
+    )
+    # Extract the low byte
+    packed_bytes = (packed_bytes_u16 & 0xFF).to(tl.uint8)
 
     # Store packed bytes
     out_offs = pid_m * (n // 2) + pid_n * (block_size // 2) + pair_idx
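The main performance change: the kernel no longer builds FP4 codes with a chain of tl.where comparisons plus shift/or packing; a single cvt.rn.satfinite.e2m1x2.f32 PTX instruction now rounds, saturates, and packs two float32 values into one E2M1 pair per call. As a CPU-side reference only (not kernel code), here is that conversion spelled out with the thresholds and byte layout the removed Triton code used, even element in the high nibble; whether the hardware instruction orders the nibbles the same way is not asserted here:

def encode_e2m1(v: float) -> int:
    """One float -> 4-bit E2M1 code (sign in bit 3, thresholds as in the removed ladder)."""
    sign = 0x8 if v < 0 else 0x0
    a = abs(v)
    if   a <= 0.25: code = 0   # 0.0
    elif a <  0.75: code = 1   # 0.5
    elif a <= 1.25: code = 2   # 1.0
    elif a <  1.75: code = 3   # 1.5
    elif a <= 2.5:  code = 4   # 2.0
    elif a <  3.5:  code = 5   # 3.0
    elif a <= 5.0:  code = 6   # 4.0
    else:           code = 7   # 6.0, satfinite clamps anything larger to this
    return sign | code

def pack_e2m1_pair(even: float, odd: float) -> int:
    """Two floats -> one packed byte, even value in the high nibble (old layout)."""
    return (encode_e2m1(even) << 4) | encode_e2m1(odd)

assert pack_e2m1_pair(1.0, -6.0) == (0x2 << 4) | 0xF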

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "comfy-kitchen"
-version = "0.1.4"
+version = "0.1.5"
 description = "Fast Kernel Library for ComfyUI with multiple compute backends"
 readme = "README.md"
 requires-python = ">=3.10"

tests/test_qdq.py

Lines changed: 43 additions & 44 deletions
@@ -3,9 +3,7 @@
 
 import comfy_kitchen as ck
 from comfy_kitchen.float_utils import (
-    F4_E2M1_EPS,
     F4_E2M1_MAX,
-    F8_E4M3_EPS,
     F8_E4M3_MAX,
     fp4_x2_to_f32,
 )
@@ -146,67 +144,48 @@ def capable_backends(self, device):
 
     @pytest.mark.parametrize("m,k", [
         (1024, 2048),
+        (512, 1024),
         (129, 128),  # Edge case: odd rows requiring padding
         (33, 65),  # Edge case: both dimensions odd
     ])
     def test_quantize_nvfp4_all_backends(self, capable_backends, device, seed, m, k):
-        """Test NVFP4 quantization across all capable backends."""
-        for backend_name in capable_backends:
-            inputs = ConstraintAwareTestInputs("quantize_nvfp4", backend_name, device)
-            x = inputs.tensor("x", shape=(m, k), dtype=torch.bfloat16)
-            x = x * 4  # Scale up for better test coverage
-
-            scale = torch.max(torch.abs(x)) / (F8_E4M3_MAX * F4_E2M1_MAX)
-            scale = scale.to(torch.float32)
-
-            needs_padding = (m % 16 != 0) or (k % 16 != 0)
-
-            with ck.use_backend(backend_name):
-                qx, sx = ck.quantize_nvfp4(x, scale, pad_16x=needs_padding)
-
-            assert qx.dtype == torch.uint8
-            assert sx.dtype == torch.float8_e4m3fn
-
-    @pytest.mark.parametrize("m,k", [(512, 1024)])
-    def test_quantize_nvfp4_cross_backend_consistency(
-        self, capable_backends, device, seed, m, k
-    ):
-        """Test that all backends produce consistent NVFP4 results."""
-        if len(capable_backends) < 2:
-            pytest.skip("Need at least 2 backends for cross-validation")
+        """Test NVFP4 quantization across all capable backends with accuracy testing."""
+        if "eager" not in capable_backends:
+            pytest.skip("Need eager backend as reference")
 
+        # Create test input
         x = torch.randn(m, k, device=device, dtype=torch.bfloat16) * 4
         scale = torch.max(torch.abs(x)) / (F8_E4M3_MAX * F4_E2M1_MAX)
         scale = scale.to(torch.float32)
+        needs_padding = (m % 16 != 0) or (k % 16 != 0)
+
+        with ck.use_backend("eager"):
+            ref_qx, ref_sx = ck.quantize_nvfp4(x, scale, pad_16x=needs_padding)
 
-        results = {}
         for backend_name in capable_backends:
            with ck.use_backend(backend_name):
-                qx, sx = ck.quantize_nvfp4(x, scale)
-                results[backend_name] = (qx, sx)
+                qx, sx = ck.quantize_nvfp4(x, scale, pad_16x=needs_padding)
 
-        # Compare all against first
-        ref_backend = capable_backends[0]
-        ref_qx, ref_sx = results[ref_backend]
+            # Check basic properties
+            assert qx.dtype == torch.uint8
+            assert sx.dtype == torch.float8_e4m3fn
 
-        for backend_name, (qx, sx) in results.items():
-            if backend_name != ref_backend:
             assert_values_close(
                 sx.to(torch.float32),
                 ref_sx.to(torch.float32),
-                rtol=F8_E4M3_EPS,
-                atol=F8_E4M3_EPS,
-                name=f"scales ({backend_name} vs {ref_backend})"
+                rtol=1e-5,
+                atol=1e-3,
+                name=f"scales ({backend_name} vs eager)"
             )
 
             qx_f32 = fp4_x2_to_f32(qx)
             ref_qx_f32 = fp4_x2_to_f32(ref_qx)
             assert_values_close(
                 qx_f32,
                 ref_qx_f32,
-                rtol=F4_E2M1_EPS,
-                atol=F4_E2M1_EPS,
-                name=f"quantized ({backend_name} vs {ref_backend})"
+                rtol=1e-2,
+                atol=2.0,
+                name=f"quantized data ({backend_name} vs eager)"
             )
 
     def test_quantize_nvfp4_cpu_fallback(self, seed):
@@ -235,28 +214,48 @@ def capable_backends(self, device):
             pytest.skip(f"No backend supports dequantize_nvfp4 on {device}")
         return backends
 
-    @pytest.mark.parametrize("m,k", [(1024, 2048), (512, 4096)])
+    @pytest.mark.parametrize("m,k", [
+        (1024, 2048),
+        (512, 4096),
+        (129, 128),  # Edge case with padding
+    ])
     @pytest.mark.parametrize("output_dtype", [torch.float16, torch.bfloat16])
     def test_dequantize_nvfp4_all_backends(
         self, capable_backends, device, seed, m, k, output_dtype
     ):
-        """Test NVFP4 dequantization across all capable backends."""
+        """Test NVFP4 dequantization across all capable backends with accuracy testing."""
+        if "eager" not in capable_backends:
+            pytest.skip("Need eager backend as reference")
+
         x = torch.randn(m, k, device=device, dtype=torch.bfloat16) * 4
         scale = torch.max(torch.abs(x)) / (F8_E4M3_MAX * F4_E2M1_MAX)
         scale = scale.to(torch.float32)
+        needs_padding = (m % 16 != 0) or (k % 16 != 0)
 
         # Quantize with eager
         with ck.use_backend("eager"):
-            qx, sx = ck.quantize_nvfp4(x, scale)
+            qx, sx = ck.quantize_nvfp4(x, scale, pad_16x=needs_padding)
+            ref_result = ck.dequantize_nvfp4(qx, scale, sx, output_type=output_dtype)
+            # Unpad if needed
+            ref_result = ref_result[:m, :k]
 
         for backend_name in capable_backends:
            with ck.use_backend(backend_name):
                result = ck.dequantize_nvfp4(qx, scale, sx, output_type=output_dtype)
+                result = result[:m, :k]  # Unpad if needed
 
-            assert result.shape == x.shape
+            assert result.shape == (m, k)
            assert result.dtype == output_dtype
            assert result.device == x.device
 
+            assert_values_close(
+                result,
+                ref_result,
+                rtol=1e-3,
+                atol=1e-2,
+                name=f"dequantized output ({backend_name} vs eager)"
+            )
+
 
 class TestScaledMMNVFP4:
     """NVFP4 matrix multiplication tests."""
