Skip to content

Commit 117f7dd

Browse files
author
Bodhi Hu
committed
fix cross entropy loss op for musa
1 parent 80a3000 commit 117f7dd

File tree

2 files changed

+6
-5
lines changed

2 files changed

+6
-5
lines changed

ggml/src/ggml-cuda/cross-entropy-loss.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,13 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
123123
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
124124

125125
if (nbytes_shared <= smpbo) {
126-
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
126+
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
127127
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
128128
if (!shared_memory_limit_raised[id]) {
129129
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
130130
shared_memory_limit_raised[id] = true;
131131
}
132-
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
132+
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
133133
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
134134
} else {
135135
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
@@ -175,13 +175,13 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten
175175
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
176176

177177
if (nbytes_shared <= smpbo) {
178-
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
178+
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
179179
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
180180
if (!shared_memory_limit_raised[id]) {
181181
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
182182
shared_memory_limit_raised[id] = true;
183183
}
184-
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
184+
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
185185
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
186186
} else {
187187
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
262262
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
263263
device_vmm ? "yes" : "no", prop.warpSize);
264264
#elif defined(GGML_USE_MUSA)
265-
// NOTE: MUSA will reserve some shared mem, and 24B should be enough
265+
// TODO: MUSA will reserve some shared mem, and 24B should be enough,
266+
// we can remove the **24** in the future when MUSA no longer reserves shared mem.
266267
info.devices[id].smpbo = prop.sharedMemPerBlockOptin - 24;
267268
info.devices[id].cc = 100*prop.major + 10*prop.minor;
268269
#else

0 commit comments

Comments
 (0)