Maalvi14
diff --git a/‎mlx/backend/cuda/CMakeLists.txt‎
Lines changed: 0 additions & 1 deletion b/‎mlx/backend/cuda/CMakeLists.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎mlx/backend/cuda/quantized/qmm/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎mlx/backend/cuda/quantized/qmm/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎mlx/backend/cuda/quantized/qmv.cu‎ ‎mlx/backend/cuda/quantized/qmm/fp_qmv.cu‎mlx/backend/cuda/quantized/qmv.cu renamed to mlx/backend/cuda/quantized/qmm/fp_qmv.cu
Lines changed: 61 additions & 55 deletions b/‎mlx/backend/cuda/quantized/qmv.cu‎ ‎mlx/backend/cuda/quantized/qmm/fp_qmv.cu‎mlx/backend/cuda/quantized/qmv.cu renamed to mlx/backend/cuda/quantized/qmm/fp_qmv.cu
Lines changed: 61 additions & 55 deletions
diff --git a/‎mlx/backend/cuda/quantized/qmm/qmm.cpp‎
Lines changed: 107 additions & 0 deletions b/‎mlx/backend/cuda/quantized/qmm/qmm.cpp‎
Lines changed: 107 additions & 0 deletions
diff --git a/‎mlx/backend/cuda/quantized/qmm/qmm.h‎
Lines changed: 58 additions & 0 deletions b/‎mlx/backend/cuda/quantized/qmm/qmm.h‎
Lines changed: 58 additions & 0 deletions
@@ -56,7 +56,6 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qmv.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm_utils.cu
 
@@ -1,6 +1,8 @@
 target_sources(
   mlx
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/qmm.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/qmv.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/fp_qmv.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n16_m1.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n32_m1.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n64_m2.cu
 
@@ -2,19 +2,21 @@
 
 #include "mlx/backend/cuda/device/utils.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
-#include "mlx/backend/cuda/quantized/qmv.h"
+#include "mlx/backend/cuda/quantized/qmm/qmm.h"
 #include "mlx/backend/cuda/quantized/quantized_utils.cuh"
 #include "mlx/backend/cuda/quantized/quantized_utils.h"
 #include "mlx/dtype_utils.h"
 
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 
-namespace mlx::core::cu {
+namespace mlx::core {
 
-namespace cg = cooperative_groups;
+constexpr int rows_per_block = 8;
+
+namespace cu {
 
-static constexpr int rows_per_block = 8;
+namespace cg = cooperative_groups;
 
 template <typename T>
 __device__ void adjust_matrix_offsets(
@@ -199,6 +201,8 @@ __global__ void fp_qmv_batched(
       mat, scales, vec, out, rows, cols);
 }
 
+} // namespace cu
+
 template <typename F>
 void dispatch_1_2_4(int n, F&& f) {
   switch (n) {
@@ -221,11 +225,13 @@ void fp_qmv(
     array& out,
     int bits,
     int group_size,
-    int M,
-    int N,
-    int K,
-    CommandEncoder& encoder,
+    cu::CommandEncoder& encoder,
     Stream s) {
+  uint32_t M = x.shape(-2);
+  uint32_t N = out.shape(-1);
+  uint32_t K = x.shape(-1);
+  uint32_t B = out.size() / (M * N);
+
   // Make sure the last two dims of x and w, s, b are contiguous. This should
   // be relaxed for x.
   array vec = ensure_row_contiguous_matrix(x, encoder, s);
@@ -240,7 +246,6 @@ void fp_qmv(
     using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
     if constexpr (!std::is_same_v<T, double>) {
       dim3 block_dims{WARP_SIZE, rows_per_block};
-      uint32_t B = out.size() / (M * N);
       uint32_t blocks_y = (N + rows_per_block - 1) / rows_per_block;
       const uint32_t* mat_ptr = gpu_ptr<uint32_t>(mat);
       const T* vec_ptr = gpu_ptr<T>(vec);
@@ -256,55 +261,56 @@ void fp_qmv(
         n = 2;
       }
       dispatch_1_2_4(n, [&](auto n) {
-        dispatch_bool(B > 1, [&](auto batched) {
-          if (!batched.value) {
-            auto kernel =
-                fp_qmv_single<T, rows_per_block, n.value, 4, 32, true>;
-            if (bits == 8) {
-              kernel = fp_qmv_single<T, rows_per_block, n.value, 8, 32, true>;
-            } else if (group_size == 16) {
-              kernel = fp_qmv_single<T, rows_per_block, n.value, 4, 16, false>;
-            }
-            encoder.add_kernel_node(
-                kernel,
-                {static_cast<uint32_t>(M), blocks_y},
-                block_dims,
-                mat_ptr,
-                gpu_ptr<uint8_t>(scales),
-                vec_ptr,
-                gpu_ptr<T>(out),
-                N,
-                K);
-          } else {
-            auto kernel =
-                fp_qmv_batched<T, rows_per_block, n.value, 4, 32, true>;
-            if (bits == 8) {
-              kernel = fp_qmv_batched<T, rows_per_block, n.value, 8, 32, true>;
-            } else if (group_size == 16) {
-              kernel = fp_qmv_batched<T, rows_per_block, n.value, 4, 16, false>;
-            }
-            encoder.add_kernel_node(
-                kernel,
-                {static_cast<uint32_t>(M), blocks_y, B},
-                block_dims,
-                mat_ptr,
-                gpu_ptr<uint8_t>(scales),
-                vec_ptr,
-                gpu_ptr<T>(out),
-                N,
-                K,
-                vec.ndim() - 2,
-                const_param(vec.shape()),
-                const_param(vec.strides()),
-                mat.ndim() - 2,
-                const_param(mat.shape()),
-                const_param(mat.strides()),
-                const_param(scales.strides()));
+        if (B == 1) {
+          auto kernel =
+              cu::fp_qmv_single<T, rows_per_block, n.value, 4, 32, true>;
+          if (bits == 8) {
+            kernel = cu::fp_qmv_single<T, rows_per_block, n.value, 8, 32, true>;
+          } else if (group_size == 16) {
+            kernel =
+                cu::fp_qmv_single<T, rows_per_block, n.value, 4, 16, false>;
           }
-        });
+          encoder.add_kernel_node(
+              kernel,
+              {uint32_t(x.size() / K), blocks_y},
+              block_dims,
+              mat_ptr,
+              gpu_ptr<uint8_t>(scales),
+              vec_ptr,
+              gpu_ptr<T>(out),
+              N,
+              K);
+        } else {
+          auto kernel =
+              cu::fp_qmv_batched<T, rows_per_block, n.value, 4, 32, true>;
+          if (bits == 8) {
+            kernel =
+                cu::fp_qmv_batched<T, rows_per_block, n.value, 8, 32, true>;
+          } else if (group_size == 16) {
+            kernel =
+                cu::fp_qmv_batched<T, rows_per_block, n.value, 4, 16, false>;
+          }
+          encoder.add_kernel_node(
+              kernel,
+              {M, blocks_y, B},
+              block_dims,
+              mat_ptr,
+              gpu_ptr<uint8_t>(scales),
+              vec_ptr,
+              gpu_ptr<T>(out),
+              N,
+              K,
+              vec.ndim() - 2,
+              const_param(vec.shape()),
+              const_param(vec.strides()),
+              mat.ndim() - 2,
+              const_param(mat.shape()),
+              const_param(mat.strides()),
+              const_param(scales.strides()));
+        }
       });
     }
   });
 }
 
-} // namespace mlx::core::cu
+} // namespace mlx::core
@@ -21,6 +21,46 @@ void qmm_impl_sm90(
     Stream s);
 #endif // defined(MLX_CUDA_SM90A_ENABLED)
 
+// Decides whether the SM90 qmm path may be used for this quantized matmul.
+//
+// Returns true only when every condition checked below holds. `out` is
+// accepted for signature parity with the other supports_* predicates but is
+// not inspected here.
+bool supports_qmm_sm90(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const std::optional<array>& biases,
+    const array& out,
+    bool transpose,
+    int bits,
+    int group_size,
+    QuantizationMode mode,
+    cu::Device& device) {
+  // Only devices reporting compute capability major version 9 qualify.
+  if (device.compute_capability_major() != 9) {
+    return false;
+  }
+
+  // Size of the last (reduction) axis of x.
+  const int inner_dim = x.shape(-1);
+
+  // The checks are chained with short-circuit && in their original order, so
+  // biases is only dereferenced after has_value() has succeeded.
+  // NOTE(review): the path is rejected whenever group_size is smaller than
+  // the last dimension of x -- confirm this is the intended comparison.
+  return inner_dim % 64 == 0 && // last dim must be a multiple of 64
+      biases.has_value() && // biases are mandatory
+      x.flags().row_contiguous && w.flags().row_contiguous &&
+      scales.flags().row_contiguous && biases->flags().row_contiguous &&
+      transpose && // only the transposed layout is accepted
+      bits % 2 == 0 && // even bit widths only
+      group_size >= inner_dim && mode == QuantizationMode::Affine;
+}
+
 void qmm_sm90(
     const array& x,
     const array& w,
@@ -57,4 +97,71 @@ void qmm_sm90(
 #endif // defined(MLX_CUDA_SM90A_ENABLED)
 }
 
+// Reports whether the fp_qmv kernels can serve this quantized matmul.
+//
+// Accepted only when the weights are transposed, the quantization mode is
+// not Affine, and there are at most 8 vector rows. The row count is all rows
+// of x (x.size() / K) when w is 2-D, otherwise the second-to-last dimension
+// of x. scales, biases, out, bits, group_size and device are not inspected.
+bool supports_fp_qmv(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const std::optional<array>& biases,
+    const array& out,
+    bool transpose,
+    int bits,
+    int group_size,
+    QuantizationMode mode,
+    cu::Device& device) {
+  bool non_batched = w.ndim() == 2;
+  int k = x.shape(-1);
+  if (non_batched && k == 0) {
+    // x.size() / k below would be an integer division by zero. Report the
+    // case as unsupported instead of invoking undefined behavior.
+    return false;
+  }
+  int vec_batch = non_batched ? x.size() / k : x.shape(-2);
+  if (vec_batch > 8) {
+    return false;
+  }
+  if (!transpose) {
+    return false;
+  }
+  // The Affine mode is handled by other kernels.
+  if (mode == QuantizationMode::Affine) {
+    return false;
+  }
+  return true;
+}
+
+// Reports whether the qmv kernel can serve this quantized matmul.
+//
+// Accepted only for a single (non-batched) output matrix whose N and K are
+// multiples of 8, with row-contiguous x, w, scales (and biases when given),
+// transposed weights, an even bit width and the Affine quantization mode.
+// group_size and device are not inspected.
+bool supports_qmv(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const std::optional<array>& biases,
+    const array& out,
+    bool transpose,
+    int bits,
+    int group_size,
+    QuantizationMode mode,
+    cu::Device& device) {
+  int m = out.shape(-2);
+  int n = out.shape(-1);
+  int k = x.shape(-1);
+  // Element count of one trailing (m x n) output matrix. Computed in size_t
+  // so the product cannot overflow int, and checked against zero so an empty
+  // output never triggers an integer division by zero.
+  // NOTE(review): an empty output is treated as non-batched and falls
+  // through to the remaining checks -- confirm callers filter empty outputs.
+  const size_t matrix_elems = static_cast<size_t>(m) * static_cast<size_t>(n);
+  if (matrix_elems != 0 && out.size() / matrix_elems > 1) {
+    return false;
+  }
+  if (n % 8 != 0 || k % 8 != 0) {
+    return false;
+  }
+  if (!x.flags().row_contiguous || !w.flags().row_contiguous ||
+      !scales.flags().row_contiguous) {
+    return false;
+  }
+  // Biases are optional here; when present they must be row-contiguous too.
+  if (biases && !biases->flags().row_contiguous) {
+    return false;
+  }
+  if (!transpose) {
+    return false;
+  }
+  if (bits % 2 != 0) {
+    return false;
+  }
+  if (mode != QuantizationMode::Affine) {
+    return false;
+  }
+  return true;
+}
+
 } // namespace mlx::core
@@ -3,11 +3,24 @@
 #pragma once
 
 #include "mlx/backend/cuda/device.h"
+#include "mlx/primitives.h"
 
 #include <optional>
 
 namespace mlx::core {
 
+bool supports_qmm_sm90(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const std::optional<array>& biases,
+    const array& out,
+    bool transpose,
+    int bits,
+    int group_size,
+    QuantizationMode mode,
+    cu::Device& device);
+
 void qmm_sm90(
     const array& x,
     const array& w,
@@ -19,4 +32,49 @@ void qmm_sm90(
     cu::CommandEncoder& encoder,
     Stream s);
 
+bool supports_fp_qmv(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const std::optional<array>& biases,
+    const array& out,
+    bool transpose,
+    int bits,
+    int group_size,
+    QuantizationMode mode,
+    cu::Device& device);
+
+void fp_qmv(
+    const array& x,
+    const array& w,
+    const array& scales,
+    array& out,
+    int bits,
+    int group_size,
+    cu::CommandEncoder& encoder,
+    Stream s);
+
+bool supports_qmv(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const std::optional<array>& biases,
+    const array& out,
+    bool transpose,
+    int bits,
+    int group_size,
+    QuantizationMode mode,
+    cu::Device& device);
+
+void qmv(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const std::optional<array>& biases,
+    array& out,
+    int bits,
+    int group_size,
+    QuantizationMode mode,
+    cu::CommandEncoder& encoder);
+
 } // namespace mlx::core