Commit d165961

Revert "RoPE cache (ikawrakow#887)"
1 parent 6d27076 commit d165961

File tree

12 files changed: +72, -1003 lines

common/common.cpp

Lines changed: 1 addition & 8 deletions

@@ -1106,10 +1106,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.fused_mmad = false;
         return true;
     }
-    if (arg == "-no-rcache" || arg == "--no-rope-cache") {
-        params.rope_cache = false;
-        return true;
-    }
     if (arg == "-ser" || arg == "--smart-expert-reduction") {
         CHECK_ARG
         auto values = string_split_pairs<int,float>(argv[i], ',');
@@ -1918,7 +1914,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-ger, --grouped-expert-routing", "enable grouped expert routing (default: %s)", params.grouped_expert_routing ? "enabled" : "disabled" });
     options.push_back({ "*", "-no-fug, --no-fused-up-gate", "disaable fused up-gate (default: %s)", params.fused_up_gate ? "enabled" : "disabled" });
     options.push_back({ "*", "-no-mmad, --no-fused-mul-multiadd", "disaable fused mul-multi_add (default: %s)", params.fused_mmad? "enabled" : "disabled" });
-    options.push_back({ "*", "-no-rcache, --no-rope-cache", "disaable RoPE cache (default: %s)", params.rope_cache ? "enabled" : "disabled" });
     options.push_back({ "*", "-ser, --smart-expert-reduction,","experts reduction (default: %d,%g)", params.min_experts, params.thresh_experts});
     options.push_back({ "*", "-mqkv, --merge-qkv,", "merge Q,K,V (default: %d)", params.merge_qkv});
     options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
@@ -2892,7 +2887,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.grouped_expert_routing = params.grouped_expert_routing;
     cparams.fused_up_gate = params.fused_up_gate;
     cparams.fused_mmad = params.fused_mmad;
-    cparams.rope_cache = params.rope_cache;
     cparams.min_experts = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
     cparams.only_active_experts = params.only_active_exps;
@@ -4011,8 +4005,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "fused_moe: %s # default: false\n", params.fused_moe_up_gate ? "true" : "false");
     fprintf(stream, "grouped_expert_routing: %s # default: false\n", params.grouped_expert_routing ? "true" : "false");
     fprintf(stream, "fused_up_gate: %s # default: true\n", params.fused_up_gate ? "true" : "false");
-    fprintf(stream, "fused_mmad: %s # default: true\n", params.fused_mmad ? "true" : "false");
-    fprintf(stream, "rope_cache: %s # default: true\n", params.rope_cache ? "true" : "false");
+    fprintf(stream, "fused_mmad: %s # default: true\n", params.fused_mmad? "true" : "false");
     fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts);
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
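Taken together, the common.cpp hunks undo this codebase's standard plumbing for a boolean feature flag: a CLI switch writes a gpt_params field, the field is copied into llama_context_params, and its effective value is echoed in the YAML dump. A condensed sketch of the reverted path, using only names visible in the diff (the surrounding argument-parsing loop is assumed):

    // 1) CLI parsing: -no-rcache / --no-rope-cache flips the default-on field.
    if (arg == "-no-rcache" || arg == "--no-rope-cache") {
        params.rope_cache = false;
        return true;
    }

    // 2) Context setup: the flag is handed to the llama context verbatim.
    cparams.rope_cache = params.rope_cache;

    // 3) Diagnostics: the effective value is written to the YAML dump.
    fprintf(stream, "rope_cache: %s # default: true\n", params.rope_cache ? "true" : "false");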

common/common.h

Lines changed: 1 addition & 2 deletions

@@ -112,7 +112,7 @@ enum common_reasoning_format {
 enum common_webui {
     COMMON_WEBUI_NONE,
     COMMON_WEBUI_AUTO,
-    COMMON_WEBUI_LLAMACPP,
+    COMMON_WEBUI_LLAMACPP,
 };

 common_webui common_webui_from_name(const std::string& format);
@@ -249,7 +249,6 @@ struct gpt_params {
     bool fused_up_gate = true; // fused up*unary(gate) op
     bool fused_mmad = true; // fused mul+multi_add op
     bool grouped_expert_routing = false; // if to use grouped expert routing (BailingMoeV2 arch)
-    bool rope_cache = true; // if to use RoPE cache (for supported models)
     int min_experts = -1;
     float thresh_experts = 0;

ggml/include/ggml.h

Lines changed: 0 additions & 22 deletions

@@ -640,8 +640,6 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ROPE_CACHE,
-        GGML_OP_ROPE_FAST,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
@@ -2023,26 +2021,6 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);

-    GGML_API struct ggml_tensor * ggml_rope_cache(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * c,
-            int                   ne0,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx_orig,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
-
-    GGML_API struct ggml_tensor * ggml_rope_fast(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
     // clamp
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_clamp(
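The two deleted declarations split rotary position embedding into a precompute step and an apply step: ggml_rope_cache takes the same hyperparameters as the regular RoPE op and returns a tensor (presumably the per-position sin/cos rotation table), while ggml_rope_fast applies that tensor to an activation. A hedged sketch of how a graph might have composed them; the tensor names, the meaning of b/c as positions and frequency factors, and the reuse across Q and K are assumptions inferred from the signatures, not confirmed by this diff:

    // Assumed usage of the reverted two-step RoPE API.
    // Precompute the rotation table once per graph evaluation...
    struct ggml_tensor * rope_table = ggml_rope_cache(ctx, inp_pos, freq_factors,
            n_embd_head, n_rot, rope_mode, n_ctx_orig,
            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
    // ...then apply it to both Q and K instead of recomputing the angles twice.
    Qcur = ggml_rope_fast(ctx, Qcur, rope_table);
    Kcur = ggml_rope_fast(ctx, Kcur, rope_table);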

ggml/src/ggml-cuda.cu

Lines changed: 2 additions & 50 deletions
@@ -3062,7 +3062,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

     auto next = i < cgraph->n_nodes - 1 ? cgraph->nodes[i+1] : nullptr;

-    //printf("%4d %s(%s)\n", i, ggml_op_name(dst->op), dst->name);
     switch (dst->op) {
         case GGML_OP_ARGMAX:
             ggml_cuda_argmax(ctx, dst);
@@ -3097,6 +3096,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 ggml_are_same_shape(dst, cgraph->nodes[i+1]->src[1]) &&
                 cgraph->nodes[i+1] == cgraph->nodes[i+2]->src[0] &&
                 ops_are_same_device(cgraph, i, i+2)) {
+                //printf("Fusing add->add->fused_rms of %s, %s, %s\n", dst->name, cgraph->nodes[i+1]->name, cgraph->nodes[i+2]->name);
                 ggml_cuda_op_fused_add_add_rms_norm(ctx, dst, cgraph->nodes[i+1], cgraph->nodes[i+2]);
                 i += 2;
             }
@@ -3244,27 +3244,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             ggml_cuda_op_rms_norm(ctx, dst);
             break;
         case GGML_OP_FUSED_RMS_NORM:
-            //if (i + 6 < cgraph->n_nodes) {
-            //    printf("=== Fused rms_norm(%s)\n", dst->name);
-            //    for (int j = 1; j <= 6; ++j) printf("    %s(%s)\n", ggml_op_name(cgraph->nodes[i+j]->op), cgraph->nodes[i+j]->name);
-            //}
-            if (ENABLE_FUSION && i + 4 < cgraph->n_nodes &&
-                cgraph->nodes[i+1]->op == GGML_OP_VIEW &&
-                cgraph->nodes[i+2]->op == GGML_OP_FUSED_RMS_NORM &&
-                cgraph->nodes[i+3]->op == GGML_OP_ROPE_FAST &&
-                cgraph->nodes[i+4]->op == GGML_OP_ROPE_FAST &&
-                ggml_cuda_op_fused_rms_rope_fast(ctx, cgraph->nodes[i+3], cgraph->nodes[i+4])) {
-                i += 4;
-            }
-            else if (ENABLE_FUSION && i + 4 < cgraph->n_nodes &&
-                cgraph->nodes[i+1]->op == GGML_OP_ROPE_FAST &&
-                cgraph->nodes[i+2]->op == GGML_OP_RESHAPE &&
-                cgraph->nodes[i+3]->op == GGML_OP_FUSED_RMS_NORM &&
-                cgraph->nodes[i+4]->op == GGML_OP_ROPE_FAST &&
-                ggml_cuda_op_fused_rms_rope_fast(ctx, cgraph->nodes[i+1], cgraph->nodes[i+4])) {
-                i += 4;
-            }
-            else if (ENABLE_FUSION && i + 2 < cgraph->n_nodes &&
+            if (i + 2 < cgraph->n_nodes &&
                 cgraph->nodes[i+1]->op == GGML_OP_VIEW &&
                 cgraph->nodes[i+2]->op == GGML_OP_FUSED_RMS_NORM &&
                 dst->ne[2] == 1 && cgraph->nodes[i+2]->ne[2] == 1) {
@@ -3338,32 +3318,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE_BACK:
             ggml_cuda_op_rope_back(ctx, dst);
             break;
-        case GGML_OP_ROPE_FAST:
-            if (ENABLE_FUSION && i + 3 < cgraph->n_nodes &&
-                (cgraph->nodes[i+1]->op == GGML_OP_RESHAPE || cgraph->nodes[i+1]->op == GGML_OP_VIEW) &&
-                (cgraph->nodes[i+2]->op == GGML_OP_RESHAPE || cgraph->nodes[i+2]->op == GGML_OP_VIEW) &&
-                cgraph->nodes[i+3]->op == GGML_OP_ROPE_FAST &&
-                ggml_cuda_op_fused_rope_fast(ctx, dst, cgraph->nodes[i+3])) {
-                i += 3;
-            }
-            else if (ENABLE_FUSION && i + 2 < cgraph->n_nodes &&
-                (cgraph->nodes[i+1]->op == GGML_OP_RESHAPE || cgraph->nodes[i+1]->op == GGML_OP_VIEW) &&
-                cgraph->nodes[i+2]->op == GGML_OP_ROPE_FAST &&
-                ggml_cuda_op_fused_rope_fast(ctx, dst, cgraph->nodes[i+2])) {
-                i += 2;
-            }
-            else if (ENABLE_FUSION && i + 1 < cgraph->n_nodes &&
-                cgraph->nodes[i+1]->op == GGML_OP_ROPE_FAST &&
-                ggml_cuda_op_fused_rope_fast(ctx, dst, cgraph->nodes[i+1])) {
-                i += 1;
-            }
-            else {
-                ggml_cuda_op_rope_fast(ctx, dst);
-            }
-            break;
-        case GGML_OP_ROPE_CACHE:
-            ggml_cuda_op_rope_cache(ctx, dst);
-            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
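Every deleted branch above follows the backend's lookahead-fusion idiom: while walking the graph at node i, peek at the next few nodes; if they match a known op pattern and the fused kernel accepts them, run one fused kernel and advance i past the nodes it consumed. Longer patterns (which tolerate interleaved RESHAPE/VIEW nodes) are tried before shorter ones. A stripped-down restatement of the shortest deleted case, with comments marking the moving parts; that ggml_cuda_op_fused_rope_fast returns false when it rejects a pair is inferred from its position as the last condition in each && chain:

    case GGML_OP_ROPE_FAST:
        // The kernel call doubles as a probe: if it cannot fuse this pair it
        // returns false and the plain per-op fallback runs instead.
        if (ENABLE_FUSION && i + 1 < cgraph->n_nodes &&
            cgraph->nodes[i+1]->op == GGML_OP_ROPE_FAST &&
            ggml_cuda_op_fused_rope_fast(ctx, dst, cgraph->nodes[i+1])) {
            i += 1; // nodes[i+1] was produced by the fused kernel; skip it
        } else {
            ggml_cuda_op_rope_fast(ctx, dst);
        }
        break;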
@@ -4423,8 +4377,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_SOFT_CAP_MAX:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
-        case GGML_OP_ROPE_FAST:
-        case GGML_OP_ROPE_CACHE:
             return true;
         //case GGML_OP_ROPE:
         //    return ggml_is_contiguous(op->src[0]);
