Speed up nvfp4 pack/unpack w/ torch.compile (#400)

fynnsu · web-flow · commit 15177be3fcd0 · 2025-08-25T14:43:04.000-04:00
* Speed up nvfp4 pack/unpack w/ torch.compile

Signed-off-by: Fynn Schmitt-Ulms <fschmitt@redhat.com>

* Add `dynamic=True` to torch.compile call in nvfp4 packing

Signed-off-by: Fynn Schmitt-Ulms <fschmitt@redhat.com>

---------

Signed-off-by: Fynn Schmitt-Ulms <fschmitt@redhat.com>
diff --git a/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py b/src/compressed_tensors/compressors/quantized_compressors/nvfp4_quantized.py
@@ -123,6 +123,7 @@ def decompress_weight(
         return decompressed_weight
 
 
+@torch.compile(fullgraph=True, dynamic=True)
 def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
     """
     Packs a tensor with values in the fp4 range into uint8.
@@ -145,12 +146,11 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
 
     # Find closest valid FP4 value index for each element
     abs_x = torch.abs(x)
-    abs_indices = torch.zeros_like(abs_x, dtype=torch.long)
-    for i, val in enumerate(kE2M1):
-        abs_indices = torch.where(torch.isclose(abs_x, val), i, abs_indices)
+    abs_diff_x = torch.abs(abs_x.unsqueeze(-1) - kE2M1)  # [m, n, 8]
+    abs_indices = torch.argmin(abs_diff_x, dim=-1)  # [m, n]
 
     # Apply sign bit (bit 3) to get final 4-bit representation
-    indices = abs_indices + (torch.signbit(x) << 3).to(torch.long)
+    indices = abs_indices + (torch.signbit(x).to(torch.long) << 3)
 
     # Reshape to prepare for packing pairs of values
     indices = indices.reshape(-1)
@@ -174,6 +174,7 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
 
 
 # reference: : https://github.com/vllm-project/vllm/pull/16362
+@torch.compile(fullgraph=True, dynamic=True)
 def unpack_fp4_from_uint8(
     a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16
 ) -> torch.Tensor: