Skip to content

Commit a2f702a

Browse files
reduce shared memory use
1 parent 5724990 commit a2f702a

File tree

1 file changed

+31
-11
lines changed

1 file changed

+31
-11
lines changed

ggml/src/ggml-cuda/mmq.cu

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,23 @@
33

44
#include <vector>
55

6+
// Compact shared-memory record for mmq_ids_helper: to reduce shared memory use,
// "it" and "iex_used" are packed into one 32 bit word, with 22 bits for "it"
// (low bits) and 10 bits for "iex_used" (high bits).
struct mmq_ids_helper_store {
    uint32_t data; // bits [0, 22): it, bits [22, 32): iex_used

    // Pack the pair into a single word. "it" is truncated to its low 22 bits;
    // any excess bits of "iex_used" fall off the top of the 32 bit word.
    __device__ mmq_ids_helper_store(const uint32_t it, const uint32_t iex_used) {
        data = (iex_used << 22) | (it & ((1u << 22) - 1));
    }

    // Retrieve the stored "it" value (low 22 bits).
    __device__ uint32_t it() const {
        return data & ((1u << 22) - 1);
    }

    // Retrieve the stored "iex_used" value (high 10 bits).
    __device__ uint32_t iex_used() const {
        return data >> 22;
    }
};
static_assert(sizeof(mmq_ids_helper_store) == 4, "unexpected size for mmq_ids_helper_store");
623

724
// Helper function for mul_mat_id, converts ids to a more convenient format.
825
// ids_src1 describes how to permute the flattened column indices of src1 in order to get a compact src1 tensor sorted by expert.
@@ -17,9 +34,8 @@ static __global__ void mmq_ids_helper(
1734
const int n_expert_used = n_expert_used_template == 0 ? n_expert_used_var : n_expert_used_template;
1835
const int expert = blockIdx.x;
1936

20-
extern __shared__ int data_mmq_ids_helper[];
21-
int * ids_src1_shared = data_mmq_ids_helper;
22-
int * ids_dst_shared = ids_src1_shared + n_tokens;
37+
extern __shared__ char data_mmq_ids_helper[];
38+
mmq_ids_helper_store * store = (mmq_ids_helper_store *) data_mmq_ids_helper;
2339

2440
int nex_prev = 0; // Number of columns for experts with a lower index.
2541
int it_compact = 0; // Running index for the compact slice of this expert.
@@ -37,8 +53,7 @@ static __global__ void mmq_ids_helper(
3753
}
3854

3955
if (iex_used != -1) {
40-
ids_src1_shared[it_compact] = it*sis1 + iex_used % nchannels_y;
41-
ids_dst_shared[it_compact] = it*n_expert_used + iex_used;
56+
store[it_compact] = mmq_ids_helper_store(it, iex_used);
4257
}
4358

4459
if (warp_reduce_any<warp_size>(iex_used != -1)) {
@@ -72,8 +87,7 @@ static __global__ void mmq_ids_helper(
7287
}
7388

7489
if (iex_used != -1) {
75-
ids_src1_shared[it_compact + it_compact_add_lower] = it*sis1 + iex_used % nchannels_y;
76-
ids_dst_shared[it_compact + it_compact_add_lower] = it*n_expert_used + iex_used;
90+
store[it_compact + it_compact_add_lower] = mmq_ids_helper_store(it, iex_used);
7791
}
7892

7993
// The thread with the highest index in the warp always has the sum over the whole warp, use it to increment all threads:
@@ -82,9 +96,12 @@ static __global__ void mmq_ids_helper(
8296
}
8397
nex_prev = warp_reduce_sum<warp_size>(nex_prev);
8498

85-
for (int it = threadIdx.x; it < it_compact; it += warp_size) {
86-
ids_src1[nex_prev + it] = ids_src1_shared[it];
87-
ids_dst [nex_prev + it] = ids_dst_shared [it];
99+
for (int itc = threadIdx.x; itc < it_compact; itc += warp_size) {
100+
const mmq_ids_helper_store store_it = store[itc];
101+
const int it = store_it.it();
102+
const int iex_used = store_it.iex_used();
103+
ids_src1[nex_prev + itc] = it*sis1 + iex_used % nchannels_y;
104+
ids_dst [nex_prev + itc] = it*n_expert_used + iex_used;
88105
}
89106

90107
if (threadIdx.x != 0) {
@@ -104,14 +121,17 @@ template <int n_expert_used_template>
104121
static void launch_mmq_ids_helper(
105122
const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds,
106123
const int n_experts, const int n_tokens, const int n_expert_used_var, const int nchannels_y, const int si1, const int sis1, cudaStream_t stream) {
124+
GGML_ASSERT(n_tokens < (1 << 22) && "too few bits in mmq_ids_helper_store");
125+
GGML_ASSERT(n_expert_used_var < (1 << 10) && "too few bits in mmq_ids_helper_store");
126+
107127
const int id = ggml_cuda_get_device();
108128
const int warp_size = ggml_cuda_info().devices[id].warp_size;
109129
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
110130
CUDA_SET_SHARED_MEMORY_LIMIT(mmq_ids_helper<n_expert_used_template>, smpbo);
111131

112132
const dim3 num_blocks(n_experts, 1, 1);
113133
const dim3 block_size(warp_size, 1, 1);
114-
const size_t nbytes_shared = 2*n_tokens*sizeof(int);
134+
const size_t nbytes_shared = n_tokens*sizeof(mmq_ids_helper_store);
115135
mmq_ids_helper<n_expert_used_template><<<num_blocks, block_size, nbytes_shared, stream>>>
116136
(ids, ids_src1, ids_dst, expert_bounds, n_tokens, n_expert_used_var, nchannels_y, si1, sis1);
117137
}

0 commit comments

Comments
 (0)