
Commit adfcaf2

Merge pull request #297 from menloresearch/update-dev-from-master-2025-10-19-00-38
Sync master with upstream release b6795
2 parents 90971bf + ee09828 commit adfcaf2

12 files changed: +1007, −38 lines


ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ if [ ! -z ${GG_BUILD_ROCM} ]; then
         exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then

ggml/src/ggml-cuda/topk-moe.cu

Lines changed: 23 additions & 19 deletions
@@ -73,8 +73,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
 
     float wt_sum = 0.f;
 
-    extern __shared__ float data_topk_shared[];
-    float * wt_shared_ptr = data_topk_shared + threadIdx.y * n_expert_used;
+    float output_weights[experts_per_thread];
 
     for (int k = 0; k < n_expert_used; k++) {
         float max_val = wt[0];
@@ -99,11 +98,14 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
             }
         }
 
+        if ((k & (WARP_SIZE - 1)) == threadIdx.x) {
+            output_weights[k / WARP_SIZE] = max_val;
+        }
+
         if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
             wt[max_expert / WARP_SIZE] = -INFINITY;
 
-            wt_shared_ptr[k] = max_val;
-            ids[k] = max_expert;
+            ids[k] = max_expert;
             if constexpr (with_norm) {
                 wt_sum += max_val;
             }
@@ -115,12 +117,16 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         const float inv_sum = 1.0f / wt_sum;
 
         for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
-            wt_shared_ptr[i] = wt_shared_ptr[i] * inv_sum;
+            output_weights[i] *= inv_sum;
         }
     }
 
-    for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
-        weights[i] = wt_shared_ptr[i];
+#pragma unroll
+    for (int i = 0; i < experts_per_thread; i++) {
+        const int idx = i * WARP_SIZE + threadIdx.x;
+        if (idx < n_expert_used) {
+            weights[idx] = output_weights[i];
+        }
     }
 }
 
@@ -137,48 +143,46 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
     dim3 block_dims(WARP_SIZE, rows_per_block, 1);
     cudaStream_t stream = ctx.stream();
 
-    const int nbytes_shared = n_expert_used * rows_per_block * sizeof(float);
-
     switch (n_expert) {
         case 1:
             topk_moe_cuda<1, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
             break;
         case 2:
             topk_moe_cuda<2, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 4:
            topk_moe_cuda<4, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 8:
            topk_moe_cuda<8, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 16:
            topk_moe_cuda<16, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 32:
            topk_moe_cuda<32, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 64:
            topk_moe_cuda<64, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 128:
            topk_moe_cuda<128, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 256:
            topk_moe_cuda<256, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 512:
            topk_moe_cuda<512, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        default:
            GGML_ASSERT(false && "fatal error");
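For reference, a minimal standalone sketch (not part of the commit) of the indexing scheme the rewritten kernel relies on: top-k element k is held in registers by lane k % WARP_SIZE, in slot k / WARP_SIZE of that lane's local array, and written back with a strided loop instead of staging through shared memory. The demo kernel name, the experts_per_thread formula, and the host driver below are illustrative assumptions, not ggml code.

// demo_topk_registers.cu -- standalone sketch, not part of this commit.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int WARP_SIZE = 32;

template <int n_expert>
__global__ void topk_register_demo(float * out, int n_expert_used) {
    // Assumed to mirror the kernel's experts_per_thread (its definition is outside this hunk).
    constexpr int experts_per_thread = (n_expert > WARP_SIZE) ? n_expert / WARP_SIZE : 1;

    float local[experts_per_thread];

    // Producer side: the lane that owns element k keeps it in a register slot.
    for (int k = 0; k < n_expert_used; k++) {
        const float val = 100.0f + (float) k;          // stand-in for max_val
        if ((k & (WARP_SIZE - 1)) == (int) threadIdx.x) {
            local[k / WARP_SIZE] = val;
        }
    }

    // Consumer side: strided write-back, same shape as the kernel's final loop.
#pragma unroll
    for (int i = 0; i < experts_per_thread; i++) {
        const int idx = i * WARP_SIZE + threadIdx.x;
        if (idx < n_expert_used) {
            out[idx] = local[i];
        }
    }
}

int main() {
    const int n_expert_used = 8;

    float * d_out = nullptr;
    cudaMalloc(&d_out, n_expert_used * sizeof(float));

    // One warp handles one row, as in the real kernel's x-dimension.
    topk_register_demo<64><<<1, WARP_SIZE>>>(d_out, n_expert_used);

    float h_out[n_expert_used];
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n_expert_used; i++) {
        printf("weights[%d] = %.0f\n", i, h_out[i]);   // expect 100..107
    }
    cudaFree(d_out);
    return 0;
}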

ggml/src/ggml-hip/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -28,8 +28,10 @@ if (CXX_IS_HIPCC)
             " Prefer setting the HIP compiler directly. See README for details.")
     endif()
 else()
-    # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
-    if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+    # Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+    if(GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
+    elseif(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
         set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
     endif()
     cmake_minimum_required(VERSION 3.21)

ggml/src/ggml-impl.h

Lines changed: 11 additions & 2 deletions
@@ -565,14 +565,23 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
+static inline int32_t ggml_node_get_use_count(const struct ggml_cgraph * cgraph, int node_idx) {
+    const struct ggml_tensor * node = cgraph->nodes[node_idx];
+
+    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
+    if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos)) {
+        return 0;
+    }
+    return cgraph->use_counts[hash_pos];
+}
+
 // return true if the node's results are only used by N other nodes
 // and can be fused into their calculations.
 static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
     const struct ggml_tensor * node = cgraph->nodes[node_idx];
 
     // check the use count against how many we're replacing
-    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
-    if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) {
+    if (ggml_node_get_use_count(cgraph, node_idx) != n_uses) {
         return false;
     }
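As a usage sketch (illustrative only, assuming the ggml source tree is on the include path): a backend could combine the new ggml_node_get_use_count with the existing ggml_node_has_n_uses when deciding whether a node may be folded into its single consumer. The wrapper name can_fuse_into_consumer is hypothetical, not part of ggml.

#include "ggml-impl.h"   // assumed include; both helpers are static inline in this header

// Hypothetical helper: fuse a node only when its result is consumed by exactly one other node.
static bool can_fuse_into_consumer(const struct ggml_cgraph * cgraph, int node_idx) {
    // Nodes that never entered the visited hash set report a use count of 0.
    if (ggml_node_get_use_count(cgraph, node_idx) == 0) {
        return false;   // dead or untracked node: nothing to fuse
    }
    // Exactly one downstream consumer -> safe to fold into that consumer's kernel.
    return ggml_node_has_n_uses(cgraph, node_idx, 1);
}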

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -91,6 +91,8 @@ set(GGML_OPENCL_KERNELS
     mul_mv_id_q8_0_f32_flat
     mul_mv_id_mxfp4_f32
     mul_mv_id_mxfp4_f32_flat
+    gemm_moe_mxfp4_f32
+    gemv_moe_mxfp4_f32
     mul_mm_f32_f32_l4_lm
     mul_mm_f16_f32_l4_lm
     mul_mm_q8_0_f32_l4_lm
