mx: make CUDA kernel the default for dim1 cast in mxfp8_cublas recipe (#2661)

vkuzo · web-flow · commit 5f3ab63e7b53 · 2025-08-04T13:02:12.000-04:00
* Update

[ghstack-poisoned]

* Update

[ghstack-poisoned]

* Update

[ghstack-poisoned]

* Update

[ghstack-poisoned]
diff --git a/torchao/prototype/mx_formats/config.py b/torchao/prototype/mx_formats/config.py
@@ -184,8 +184,10 @@ def from_recipe_name(
         if recipe_name is MXLinearRecipeName.MXFP8_EMULATED:
             return MXLinearConfig()
         elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS:
-            # TODO(future PR): default to CUDA dim1 kernel
-            return MXLinearConfig(gemm_kernel_choice=MXGemmKernelChoice.CUBLAS)
+            return MXLinearConfig(
+                gemm_kernel_choice=MXGemmKernelChoice.CUBLAS,
+                mxfp8_cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
+            )
         elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS_RCEIL:
             return MXLinearConfig(
                 gemm_kernel_choice=MXGemmKernelChoice.CUBLAS,