
Commit ecea71c

[None][chore] Update tinygemm kernel name (#10248)
Signed-off-by: Jonas Li <[email protected]>
1 parent f4f0fe8 commit ecea71c

File tree

2 files changed: +4 −4 lines

cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu

Lines changed: 3 additions & 3 deletions
@@ -61,7 +61,7 @@ void launch_tinygemm2(__nv_bfloat16* gA, __nv_bfloat16* gB, __nv_bfloat16* gC, _
     int smem_size
         = STAGES * STAGE_UNROLL * (TILE_M * TILE_K * sizeof(__nv_bfloat16) + TILE_N * TILE_K * sizeof(__nv_bfloat16));

-    gpuErrChk(cudaFuncSetAttribute(kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES, STAGE_UNROLL, PROFILE>,
+    gpuErrChk(cudaFuncSetAttribute(tinygemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES, STAGE_UNROLL, PROFILE>,
         cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));

     int tiles_m = (output_features + TILE_M - 1) / TILE_M;
@@ -82,8 +82,8 @@ void launch_tinygemm2(__nv_bfloat16* gA, __nv_bfloat16* gB, __nv_bfloat16* gC, _
     attrs[0].val.programmaticStreamSerializationAllowed = 1;
     config.numAttrs = 1;

-    cudaLaunchKernelEx(&config, &kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES, STAGE_UNROLL, PROFILE>, gC, gA, gB,
-        bias, output_features, batch_size, input_features, weight_map, activation_map, nullptr);
+    cudaLaunchKernelEx(&config, &tinygemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES, STAGE_UNROLL, PROFILE>,
+        gC, gA, gB, bias, output_features, batch_size, input_features, weight_map, activation_map, nullptr);
 }

 torch::Tensor tinygemm2_cuda_forward(torch::Tensor input, torch::Tensor weight, torch::Tensor bias)
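
For context, the call sites touched here follow the standard CUDA pattern for large dynamic shared memory: opt in via cudaFuncSetAttribute, then launch through cudaLaunchKernelEx with the programmatic stream serialization attribute set. A minimal, self-contained sketch of that pattern; toy_kernel and its sizes are illustrative stand-ins, not code from this commit:

#include <cuda_runtime.h>

// Hypothetical stand-in for tinygemm_kernel.
__global__ void toy_kernel(float* out)
{
    extern __shared__ float smem[];
    smem[threadIdx.x] = static_cast<float>(threadIdx.x);
    __syncthreads();
    out[threadIdx.x] = smem[threadIdx.x];
}

int main()
{
    float* out;
    cudaMalloc(&out, 256 * sizeof(float));

    // Opt in to more dynamic shared memory than the default per-block
    // limit allows, mirroring the cudaFuncSetAttribute call above.
    int smem_size = 256 * sizeof(float);
    cudaFuncSetAttribute(toy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);

    // Build a launch config carrying the programmatic stream
    // serialization attribute, then launch via cudaLaunchKernelEx.
    cudaLaunchConfig_t config = {};
    config.gridDim = dim3(1);
    config.blockDim = dim3(256);
    config.dynamicSmemBytes = smem_size;

    cudaLaunchAttribute attrs[1];
    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attrs[0].val.programmaticStreamSerializationAllowed = 1;
    config.attrs = attrs;
    config.numAttrs = 1;

    cudaLaunchKernelEx(&config, toy_kernel, out);
    cudaDeviceSynchronize();
    cudaFree(out);
    return 0;
}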

cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh

Lines changed: 1 addition & 1 deletion
@@ -172,7 +172,7 @@ struct Profile
 };

 template <int WARP_TILE_M, int TILE_M, int TILE_N, int TILE_K, int STAGES, int STAGE_UNROLL, bool PROFILE>
-__global__ __launch_bounds__(384, 1) void kernel(__nv_bfloat16* output, __nv_bfloat16* weights,
+__global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output, __nv_bfloat16* weights,
     __nv_bfloat16* activations, __nv_bfloat16* bias, int M, int N, int K,
     const __grid_constant__ CUtensorMap weight_map, const __grid_constant__ CUtensorMap activation_map,
     Profile* profile = nullptr)
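
The renamed declaration keeps its __launch_bounds__(384, 1) qualifier, which caps the kernel at 384 threads per block and asks the compiler to budget registers so at least one block fits per SM. A minimal illustration on a hypothetical kernel, not taken from this commit:

// __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor):
// caps the block size the kernel may be launched with, and bounds
// register usage so the requested number of blocks can be resident per SM.
__global__ __launch_bounds__(384, 1) void bounded_copy(float const* in, float* out)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    out[idx] = in[idx];
}

// bounded_copy<<<blocks, 384>>>(in, out);  // OK: within the declared bound
// bounded_copy<<<blocks, 512>>>(in, out);  // fails at launch: exceeds 384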
