
Commit adfcaf2

Merge pull request #297 from menloresearch/update-dev-from-master-2025-10-19-00-38
Sync master with upstream release b6795
2 parents 90971bf + ee09828 commit adfcaf2

12 files changed: +1007, −38 lines


ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ if [ ! -z ${GG_BUILD_ROCM} ]; then
         exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then

ggml/src/ggml-cuda/topk-moe.cu

Lines changed: 23 additions & 19 deletions
@@ -73,8 +73,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
 
     float wt_sum = 0.f;
 
-    extern __shared__ float data_topk_shared[];
-    float * wt_shared_ptr = data_topk_shared + threadIdx.y * n_expert_used;
+    float output_weights[experts_per_thread];
 
     for (int k = 0; k < n_expert_used; k++) {
         float max_val = wt[0];
@@ -99,11 +98,14 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
             }
         }
 
+        if ((k & (WARP_SIZE - 1)) == threadIdx.x) {
+            output_weights[k / WARP_SIZE] = max_val;
+        }
+
         if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
             wt[max_expert / WARP_SIZE] = -INFINITY;
 
-            wt_shared_ptr[k] = max_val;
-            ids[k] = max_expert;
+            ids[k] = max_expert;
             if constexpr (with_norm) {
                 wt_sum += max_val;
             }
@@ -115,12 +117,16 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         const float inv_sum = 1.0f / wt_sum;
 
         for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
-            wt_shared_ptr[i] = wt_shared_ptr[i] * inv_sum;
+            output_weights[i] *= inv_sum;
         }
     }
 
-    for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
-        weights[i] = wt_shared_ptr[i];
+#pragma unroll
+    for (int i = 0; i < experts_per_thread; i++) {
+        const int idx = i * WARP_SIZE + threadIdx.x;
+        if (idx < n_expert_used) {
+            weights[idx] = output_weights[i];
+        }
     }
 }
 
@@ -137,48 +143,46 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
     dim3 block_dims(WARP_SIZE, rows_per_block, 1);
     cudaStream_t stream = ctx.stream();
 
-    const int nbytes_shared = n_expert_used * rows_per_block * sizeof(float);
-
     switch (n_expert) {
         case 1:
             topk_moe_cuda<1, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
             break;
         case 2:
             topk_moe_cuda<2, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 4:
            topk_moe_cuda<4, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 8:
            topk_moe_cuda<8, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 16:
            topk_moe_cuda<16, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 32:
            topk_moe_cuda<32, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 64:
            topk_moe_cuda<64, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 128:
            topk_moe_cuda<128, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 256:
            topk_moe_cuda<256, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        case 512:
            topk_moe_cuda<512, with_norm>
-                <<<grid_dims, block_dims, nbytes_shared, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
            break;
        default:
            GGML_ASSERT(false && "fatal error");
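For reference, a minimal standalone sketch (not part of the commit) of the indexing scheme the rewritten kernel relies on: top-k element k is held in registers by lane k % WARP_SIZE, in slot k / WARP_SIZE of that lane's local array, and written back with a strided loop instead of staging through shared memory. The demo kernel name, the experts_per_thread formula, and the host driver below are illustrative assumptions, not ggml code.

// demo_topk_registers.cu -- standalone sketch, not part of this commit.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int WARP_SIZE = 32;

template <int n_expert>
__global__ void topk_register_demo(float * out, int n_expert_used) {
    // Assumed to mirror the kernel's experts_per_thread (its definition is outside this hunk).
    constexpr int experts_per_thread = (n_expert > WARP_SIZE) ? n_expert / WARP_SIZE : 1;

    float local[experts_per_thread];

    // Producer side: the lane that owns element k keeps it in a register slot.
    for (int k = 0; k < n_expert_used; k++) {
        const float val = 100.0f + (float) k;          // stand-in for max_val
        if ((k & (WARP_SIZE - 1)) == (int) threadIdx.x) {
            local[k / WARP_SIZE] = val;
        }
    }

    // Consumer side: strided write-back, same shape as the kernel's final loop.
#pragma unroll
    for (int i = 0; i < experts_per_thread; i++) {
        const int idx = i * WARP_SIZE + threadIdx.x;
        if (idx < n_expert_used) {
            out[idx] = local[i];
        }
    }
}

int main() {
    const int n_expert_used = 8;

    float * d_out = nullptr;
    cudaMalloc(&d_out, n_expert_used * sizeof(float));

    // One warp handles one row, as in the real kernel's x-dimension.
    topk_register_demo<64><<<1, WARP_SIZE>>>(d_out, n_expert_used);

    float h_out[n_expert_used];
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n_expert_used; i++) {
        printf("weights[%d] = %.0f\n", i, h_out[i]);   // expect 100..107
    }
    cudaFree(d_out);
    return 0;
}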

ggml/src/ggml-hip/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -28,8 +28,10 @@ if (CXX_IS_HIPCC)
             " Prefer setting the HIP compiler directly. See README for details.")
     endif()
 else()
-    # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
-    if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+    # Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+    if(GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
+    elseif(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
         set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
     endif()
     cmake_minimum_required(VERSION 3.21)

ggml/src/ggml-impl.h

Lines changed: 11 additions & 2 deletions
@@ -565,14 +565,23 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 
+static inline int32_t ggml_node_get_use_count(const struct ggml_cgraph * cgraph, int node_idx) {
+    const struct ggml_tensor * node = cgraph->nodes[node_idx];
+
+    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
+    if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos)) {
+        return 0;
+    }
+    return cgraph->use_counts[hash_pos];
+}
+
 // return true if the node's results are only used by N other nodes
 // and can be fused into their calculations.
 static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
     const struct ggml_tensor * node = cgraph->nodes[node_idx];
 
     // check the use count against how many we're replacing
-    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
-    if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) {
+    if (ggml_node_get_use_count(cgraph, node_idx) != n_uses) {
         return false;
     }
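As a usage sketch (illustrative only, assuming the ggml source tree is on the include path): a backend could combine the new ggml_node_get_use_count with the existing ggml_node_has_n_uses when deciding whether a node may be folded into its single consumer. The wrapper name can_fuse_into_consumer is hypothetical, not part of ggml.

#include "ggml-impl.h"   // assumed include; both helpers are static inline in this header

// Hypothetical helper: fuse a node only when its result is consumed by exactly one other node.
static bool can_fuse_into_consumer(const struct ggml_cgraph * cgraph, int node_idx) {
    // Nodes that never entered the visited hash set report a use count of 0.
    if (ggml_node_get_use_count(cgraph, node_idx) == 0) {
        return false;   // dead or untracked node: nothing to fuse
    }
    // Exactly one downstream consumer -> safe to fold into that consumer's kernel.
    return ggml_node_has_n_uses(cgraph, node_idx, 1);
}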

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -91,6 +91,8 @@ set(GGML_OPENCL_KERNELS
     mul_mv_id_q8_0_f32_flat
     mul_mv_id_mxfp4_f32
     mul_mv_id_mxfp4_f32_flat
+    gemm_moe_mxfp4_f32
+    gemv_moe_mxfp4_f32
     mul_mm_f32_f32_l4_lm
     mul_mm_f16_f32_l4_lm
     mul_mm_q8_0_f32_l4_lm
