Commit 927a41e

feat: add mm_fp4 use cudnn backend (#1288)
Supports a/b input type e2m1 and block quant type e4m3 with block size 16; output is bfloat16 or fp16.

## 📌 Description

Initial addition of `mm_fp4` using the cuDNN backend.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

Signed-off-by: Vincent Huang <[email protected]>
1 parent 8587c21 commit 927a41e
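
The "block quant type e4m3 with block size 16" scheme from the description can be sketched in plain NumPy. This is an illustrative sketch, not FlashInfer's kernel: `block_quantize`, `BLOCK`, and `E2M1_MAX` are names chosen here for illustration (FP4 e2m1's largest finite magnitude is 6.0). Each 16-element block along the last axis gets one scale factor, so an (M, K) input yields an (M, K/16) scale tensor, matching the `sf.reshape((-1, input.shape[-1] // sf_vec_size))` shape in `fp4_quantize`.

```python
import numpy as np

BLOCK = 16        # scale-factor vector size (sf_vec_size in fp4_quantize)
E2M1_MAX = 6.0    # largest finite magnitude representable in FP4 e2m1

def block_quantize(x: np.ndarray):
    """Compute one scale per 16-element block of the last axis (sketch only)."""
    m, k = x.shape
    assert k % BLOCK == 0
    blocks = x.reshape(m, k // BLOCK, BLOCK)
    # One scale per block so each block's max magnitude maps into the e2m1 range.
    scales = np.maximum(np.abs(blocks).max(axis=-1) / E2M1_MAX, 1e-12)
    q = blocks / scales[..., None]   # scaled values now lie in [-6, 6]
    return q.reshape(m, k), scales   # scales has shape (m, k // BLOCK)

x = np.random.randn(4, 64).astype(np.float32)
q, sf = block_quantize(x)
print(sf.shape)  # (4, 4)
```

Multiplying each quantized block by its scale recovers the original values (up to the precision lost when the scaled values are later rounded to e2m1, which this sketch omits).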

File tree: 4 files changed, +449 −43 lines


flashinfer/__init__.py (1 addition, 0 deletions)

```diff
@@ -65,6 +65,7 @@
 )
 from .gemm import SegmentGEMMWrapper as SegmentGEMMWrapper
 from .gemm import bmm_fp8 as bmm_fp8
+from .gemm import mm_fp4 as mm_fp4
 from .mla import BatchMLAPagedAttentionWrapper as BatchMLAPagedAttentionWrapper
 from .norm import fused_add_rmsnorm as fused_add_rmsnorm
 from .norm import gemma_fused_add_rmsnorm as gemma_fused_add_rmsnorm
```

flashinfer/fp4_quantization.py (9 additions, 0 deletions)

```diff
@@ -253,6 +253,11 @@ def fp4_quantize(
     if sf_vec_size != 16 and sf_vec_size != 32:
         raise NotImplementedError("sf_vec_size can only be 16 or 32")
 
+    # for column major input, we need to transpose the input
+    is_column_major = input.stride(-2) == 1
+    if is_column_major:
+        input = input.transpose(-2, -1)
+
     assert input.shape[-1] % sf_vec_size == 0
     x_q, sf = get_fp4_quantization_sm100_module().fp4_quantize_sm100(
         input,
@@ -262,6 +267,10 @@ def fp4_quantize(
         is_sf_swizzled_layout,
     )
     sf = sf.reshape((-1, input.shape[-1] // sf_vec_size))
+    if is_column_major:
+        x_q = x_q.transpose(-2, -1)
+        sf = sf.transpose(-2, -1)
+
     return x_q, sf
```
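
The transpose round-trip added in this diff can be sketched with a NumPy analogue. Here `quantize_rowwise` is a hypothetical stand-in for the SM100 quantization kernel, which assumes a row-major input; the pattern detects a column-major input via its stride (NumPy strides are in bytes, so `strides[-2] == itemsize` corresponds to PyTorch's `stride(-2) == 1`), transposes to row-major, and transposes the result back so the caller sees the original orientation.

```python
import numpy as np

def quantize_rowwise(x: np.ndarray) -> np.ndarray:
    # Hypothetical stand-in for fp4_quantize_sm100: a row-wise op
    # that assumes the last axis is contiguous.
    return x * 0.5

def quantize(x: np.ndarray) -> np.ndarray:
    # Column-major (Fortran-order) input: unit stride on the second-to-last axis.
    is_column_major = x.strides[-2] == x.itemsize
    if is_column_major:
        x = x.T  # view only: the transpose of a column-major array is row-major
    out = quantize_rowwise(np.ascontiguousarray(x))
    if is_column_major:
        out = out.T  # restore the caller's original orientation
    return out

a = np.asfortranarray(np.arange(12, dtype=np.float32).reshape(3, 4))
b = quantize(a)
assert b.shape == (3, 4)  # shape unchanged from the caller's point of view
```

Transposing twice is cheap here because both transposes are metadata-only views; only `ascontiguousarray` may copy, and only when the kernel actually needs contiguous rows.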

0 commit comments
