Update profiling comments in ggml-cuda.cu and remove leftover merge-conflict markers in mmvq.cu

Bodhi Hu · Bodhi Hu · commit 30839a3a5145 · 2025-03-14T17:34:11.000+08:00
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1358,14 +1358,9 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 }
 
-// CLI:
-//   ./build_musa/bin/llama-cli -m ~/models/deepseek-r1-7B-Q4_K_M.gguf -ngl 28 -t 8 -p "摩尔线程是一家HQ北京的国产GPU 及 AI 公司，他们正在" -n 10 -no-cnv --cache-type-k q8_0 -fa
 static double ticks_total, ticks_quant, ticks_op;
 // stats:     |  ticks_total  |  ticks_quant  | ticks_mul_mat
-//   base     |               |               |  
-//            |               |               |    
-//   dnn-shfl |  2.119177099  |  0.019638826  |  2.096463255
-//            |               |     0.92%     |    98.93%
+//            |  2.119177099  |  0.019638826  |  2.096463255
 static void ggml_cuda_op_mul_mat(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
@@ -47,7 +47,6 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
         1;
 }
 
-<<<<<<< HEAD
 enum mmvq_parameter_table_id {
     MMVQ_PARAMETERS_GENERIC = 0,
     MMVQ_PARAMETERS_GCN,
@@ -127,10 +126,8 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int ta
     }
     return 1;
 }
-=======
-static __device__ uint64_t ticks_total = 0, ticks_vecdotq = 0, ticks_reduce_sum = 0;
->>>>>>> c9e3fd9c (MUSA: enable fastfp16, correct warp reduce impl and other changes)
 
+static __device__ uint64_t ticks_total = 0, ticks_vecdotq = 0, ticks_reduce_sum = 0;
 template <ggml_type type, int ncols_y>
 // tell the compiler to use as many registers as it wants, see nwarps definition below
 __launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)

Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,6 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {`
`47`	`47`	`1;`
`48`	`48`	`}`
`49`	`49`
`50`		`-<<<<<<< HEAD`
`51`	`50`	`enum mmvq_parameter_table_id {`
`52`	`51`	`MMVQ_PARAMETERS_GENERIC = 0,`
`53`	`52`	`MMVQ_PARAMETERS_GCN,`
`@@ -127,10 +126,8 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int ta`
`127`	`126`	`}`
`128`	`127`	`return 1;`
`129`	`128`	`}`
`130`		`-=======`
`131`		`-static __device__ uint64_t ticks_total = 0, ticks_vecdotq = 0, ticks_reduce_sum = 0;`
`132`		`->>>>>>> c9e3fd9c (MUSA: enable fastfp16, correct warp reduce impl and other changes)`
`133`	`129`
	`130`	`+static __device__ uint64_t ticks_total = 0, ticks_vecdotq = 0, ticks_reduce_sum = 0;`
`134`	`131`	`template <ggml_type type, int ncols_y>`
`135`	`132`	`// tell the compiler to use as many registers as it wants, see nwarps definition below`
`136`	`133`	`__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)`