@@ -38,56 +38,56 @@ void llama4_fp8_bf16_gemm_launcher(void const* A, void const* B, void* C, void c
3838 // When num_tokens == 1, the best tiling size is tile_token == 1 and tile_out == 1.
3939 dim3 const grid_size = dim3 (div_up (hidden_out, 1 ), div_up (num_tokens, 1 ), 1 );
4040 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (1 , 1 );
41- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
41+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
4242 }
4343 else if (num_tokens == 2 )
4444 {
4545 // When num_tokens == 2, the best tiling size is tile_token == 2 and tile_out == 1.
4646 dim3 const grid_size = dim3 (div_up (hidden_out, 1 ), div_up (num_tokens, 2 ), 1 );
4747 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (2 , 1 );
48- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
48+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
4949 }
5050 else if (num_tokens == 3 )
5151 {
5252 // When num_tokens == 3, the best tiling size is tile_token == 1 and tile_out == 4.
5353 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 1 ), 1 );
5454 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (1 , 4 );
55- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
55+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
5656 }
5757 else if (num_tokens == 4 )
5858 {
5959 // When num_tokens == 4, the best tiling size is tile_token == 2 and tile_out == 2.
6060 dim3 const grid_size = dim3 (div_up (hidden_out, 2 ), div_up (num_tokens, 2 ), 1 );
6161 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (2 , 2 );
62- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
62+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
6363 }
6464 else if (num_tokens == 5 )
6565 {
6666 // When num_tokens == 5, the best tiling size is tile_token == 1 and tile_out == 4.
6767 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 1 ), 1 );
6868 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (1 , 4 );
69- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
69+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
7070 }
7171 else if (num_tokens == 6 )
7272 {
7373 // When num_tokens == 6, the best tiling size is tile_token == 3 and tile_out == 4.
7474 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 3 ), 1 );
7575 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (3 , 4 );
76- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
76+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
7777 }
7878 else if (num_tokens == 7 )
7979 {
8080 // When num_tokens == 7, the best tiling size is tile_token == 1 and tile_out == 4.
8181 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 1 ), 1 );
8282 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (1 , 4 );
83- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
83+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
8484 }
8585 else if (num_tokens == 8 )
8686 {
8787 // When num_tokens == 8, the best tiling size is tile_token == 2 and tile_out == 4.
8888 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 2 ), 1 );
8989 void * kernel_func = get_per_block_func_ptr_aligned_true_5120_ (2 , 4 );
90- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
90+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 7 );
9191 }
9292 else
9393 {
@@ -115,56 +115,56 @@ void llama4_fp8_bf16_gemm_attn_scaling_launcher(void const* A, void const* B, vo
115115 // When num_tokens == 1, the best tiling size is tile_token == 1 and tile_out == 1.
116116 dim3 const grid_size = dim3 (div_up (hidden_out, 1 ), div_up (num_tokens, 1 ), 1 );
117117 void * kernel_func = get_kernel_func (1 , 1 , pos_ids_int64);
118- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
118+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
119119 }
120120 else if (num_tokens == 2 )
121121 {
122122 // When num_tokens == 2, the best tiling size is tile_token == 2 and tile_out == 2.
123123 dim3 const grid_size = dim3 (div_up (hidden_out, 2 ), div_up (num_tokens, 2 ), 1 );
124124 void * kernel_func = get_kernel_func (2 , 2 , pos_ids_int64);
125- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
125+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
126126 }
127127 else if (num_tokens == 3 )
128128 {
129129 // When num_tokens == 3, the best tiling size is tile_token == 1 and tile_out == 4.
130130 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 1 ), 1 );
131131 void * kernel_func = get_kernel_func (1 , 4 , pos_ids_int64);
132- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
132+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
133133 }
134134 else if (num_tokens == 4 )
135135 {
136136 // When num_tokens == 4, the best tiling size is tile_token == 2 and tile_out == 2.
137137 dim3 const grid_size = dim3 (div_up (hidden_out, 2 ), div_up (num_tokens, 2 ), 1 );
138138 void * kernel_func = get_kernel_func (2 , 2 , pos_ids_int64);
139- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
139+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
140140 }
141141 else if (num_tokens == 5 )
142142 {
143143 // When num_tokens == 5, the best tiling size is tile_token == 1 and tile_out == 4.
144144 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 1 ), 1 );
145145 void * kernel_func = get_kernel_func (1 , 4 , pos_ids_int64);
146- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
146+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
147147 }
148148 else if (num_tokens == 6 )
149149 {
150150 // When num_tokens == 6, the best tiling size is tile_token == 2 and tile_out == 4.
151151 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 2 ), 1 );
152152 void * kernel_func = get_kernel_func (2 , 4 , pos_ids_int64);
153- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
153+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
154154 }
155155 else if (num_tokens == 7 )
156156 {
157157 // When num_tokens == 7, the best tiling size is tile_token == 1 and tile_out == 4.
158158 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 1 ), 1 );
159159 void * kernel_func = get_kernel_func (1 , 4 , pos_ids_int64);
160- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
160+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
161161 }
162162 else if (num_tokens == 8 )
163163 {
164164 // When num_tokens == 8, the best tiling size is tile_token == 2 and tile_out == 4.
165165 dim3 const grid_size = dim3 (div_up (hidden_out, 4 ), div_up (num_tokens, 2 ), 1 );
166166 void * kernel_func = get_kernel_func (2 , 4 , pos_ids_int64);
167- launch_kernel_fdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
167+ launch_kernel_pdl (dim3 (grid_size), dim3 (BLOCK_SIZE), stream, kernel_func, args, 11 );
168168 }
169169 else
170170 {
0 commit comments