@@ -2397,7 +2397,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
 
@@ -6006,6 +6006,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_ROPE:
+    case GGML_OP_RMS_NORM:
         return true;
     default:
         return false;
@@ -6216,7 +6217,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
     switch (op) {
     case GGML_OP_NORM:
-    case GGML_OP_RMS_NORM:
     case GGML_OP_RMS_NORM_BACK:
     case GGML_OP_L2_NORM:
     case GGML_OP_SOFT_MAX:
@@ -6233,6 +6233,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
                 elements = { nr, 1, 1 };
             }
         } break;
+    case GGML_OP_RMS_NORM:
+        elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 };
+        break;
+
     case GGML_OP_SUM:
         // We use GGML_OP_SUM_ROWS with 1 row.
         elements = { 1, 1, 1 };
@@ -6883,7 +6887,17 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx
 
 static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        op_params[0], 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }
 
 static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -9388,10 +9402,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+        case GGML_OP_RMS_NORM:
             return true;
         case GGML_OP_NORM:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_RMS_NORM:
         case GGML_OP_L2_NORM:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_ADD:
0 commit comments