@@ -1056,7 +1056,7 @@ void _layer_norm_backward_kernel(
1056
1056
config_w.workgroup_num * config_w.block_row * config_w.workgroup_size ;
1057
1057
int thread_slots = syclGpuEuCount () * syclGpuHWThreadsPerEU ();
1058
1058
// use two stage col reduction if norm config occupancy < 50%
1059
- // TODO: we can releax this restriction in future for better perf
1059
+ // TODO: we can relax this restriction in the future for better performance
1060
1060
bool use_two_stage_col_reduction =
1061
1061
(dY.dtype () == kFloat || dY.dtype () == kBFloat16 ||
1062
1062
dY.dtype () == kHalf ) &&
@@ -1077,7 +1077,7 @@ void _layer_norm_backward_kernel(
1077
1077
int num_tile_n = (N + tile_size_n - 1 ) / tile_size_n;
1078
1078
bool adjust_m = true ;
1079
1079
// for M = 64*1024, N = 1, we choose tile size (256, 16) on pvc
1080
- // TODO: we can tune these conditions in future
1080
+ // TODO: Consider tuning the tile size selection logic (tile_size_m, tile_size_n) and occupancy calculation
1081
1081
for (auto i = 0 ; i < 3 ; i++) {
1082
1082
// occupancy <= 50%
1083
1083
if (num_tile_m * num_tile_n * local_size_x * SIMD /
@@ -1097,7 +1097,8 @@ void _layer_norm_backward_kernel(
1097
1097
}
1098
1098
}
1099
1099
// tile size can be (1024,32), (512,32), (512,16), (256, 16)
1100
- // Change these parameters will cause changes in kernel
1100
+ // Modifying these parameters (num_subgroup, workgroup_size, tile_size, elements_per_thread)
1101
+ // will alter the kernel configuration, potentially affecting performance and behavior.
1101
1102
const scalar_t * dY_data = dY.const_data_ptr <scalar_t >();
1102
1103
const scalar_t * X_data = X.const_data_ptr <scalar_t >();
1103
1104
weight_t * dg_data =
0 commit comments