Skip to content

Commit d69103a

Browse files
Author: Huaishun Hu (committed)
partially adopt musa cc to match cuda cc
1 parent ca3ef2a commit d69103a

File tree

3 files changed

+37
-5
lines changed

3 files changed

+37
-5
lines changed

ggml/include/ggml.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2249,7 +2249,7 @@ extern "C" {
2249 2249   GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
2250 2250   GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
2251 2251
2252      - #define GGML_OP_PERF
     2252 + // #define GGML_OP_PERF
2253 2253   // op: [ count, total_time ]
2254 2254   enum OP_STAT_ENUM {
2255 2255   OP_COUNT = 0,

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -178,6 +178,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
 178  178   #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 179  179   info.devices[id].smpbo = prop.sharedMemPerBlock;
 180  180   info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
      181 + #elif defined(GGML_USE_MUSA)
      182 + /** TODO: MUSA arch should match CUDA 11.4 */
      183 + info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
      184 + // info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_MT;
      185 + info.devices[id].cc = CC_AMPERE;
 181  186   #else
 182  187   info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
 183  188   info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -1671,9 +1676,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1671 1676   }
1672 1677   }
1673 1678   #else
1674      - #ifdef GGML_USE_MUSA
1675      - GGML_ASSERT(false);
1676      - #else // !GGML_USE_MUSA
1677 1679   if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
1678 1680   // there is no broadcast and src0, src1 are contiguous across dims 2, 3
1679 1681   // use cublasGemmStridedBatchedEx
@@ -1716,7 +1718,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1716 1718   cu_compute_type,
1717 1719   CUBLAS_GEMM_DEFAULT_TENSOR_OP));
1718 1720   }
1719      - #endif // GGML_USE_MUSA
1720 1721   #endif
1721 1722
1722 1723   if (dst->op_params[0] == GGML_PREC_DEFAULT) {
@@ -2637,6 +2638,11 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2637 2638   {
2638 2639   FILE *logFile = fopen("ggml_op_perf.log", "a");
2639 2640   fprintf(logFile, "## compute stats for each op: ##################################################\n");
     2641 + fprintf(logFile, ">> cc = %d, vmm = %d, total_vram = %u\n",
     2642 + ggml_cuda_info().devices[cuda_ctx->device].cc,
     2643 + ggml_cuda_info().devices[cuda_ctx->device].vmm,
     2644 + ggml_cuda_info().devices[cuda_ctx->device].total_vram
     2645 + );
2640 2646   float total_time = 0, total_count = 0;
2641 2647   for (int i = 0; i < GGML_OP_COUNT; ++i) {
2642 2648   total_count += op_stats[i][OP_COUNT];

ggml/src/ggml-cuda/vendors/musa.h

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -132,3 +132,29 @@
 132  132   #define cudaKernelNodeParams musaKernelNodeParams
 133  133   #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
 134  134   #define cudaStreamEndCapture musaStreamEndCapture
      135 +
      136 + /** TODO: MUSA arch should match CUDA 11.4 */
      137 + // #define CC_OFFSET_MT 99999999
      138 + // #define __CUDA_ARCH__ CC_OFFSET_MT
      139 + // #define __CUDA_ARCH__ 800
      140 +
      141 + /** TODO: following apis not supported yet by musa sdk: ***********
      142 +
      143 + __device__ __half hexp(const __half a) {
      144 + __half val;
      145 +
      146 + float f_a = __half2float(a);
      147 + float f_result = expf(f_a);
      148 + val = __float2half(f_result);
      149 +
      150 + return val;
      151 + }
      152 +
      153 + __host__ __device__ __half2 h2exp(const __half2 a) {
      154 + __half2 result;
      155 + result.x = hexp(a.x);
      156 + result.y = hexp(a.y);
      157 + return result;
      158 + }
      159 +
      160 + ******************************************************************/

0 commit comments

Comments (0)