Commit bffe3c9

tensor debugging now works (via llama-eval-callback); instead of simulating the gate split with views, GEGLU is now used, which performs exactly that split
1 parent 18c0c23 · commit bffe3c9
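
The "tensor debugging" above refers to llama.cpp's eval-callback example, which hooks a scheduler eval callback into the context so every graph node can be inspected as it is computed. A minimal sketch of that mechanism, assuming the standard cb_eval/cb_eval_user_data fields of llama_context_params (an illustration of the hook, not the example's actual code):

    #include <cstdio>
    #include "ggml.h"
    #include "llama.h"

    // Called twice per node: first with ask == true ("do you want this
    // tensor?"), then with ask == false once its data is available.
    static bool debug_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return true; // observe every node
        }
        fprintf(stderr, "%s: {%lld, %lld}\n", t->name,
                (long long) t->ne[0], (long long) t->ne[1]);
        return true; // keep executing the graph
    }

    // hooked up when creating the context:
    //     llama_context_params params = llama_context_default_params();
    //     params.cb_eval           = debug_cb;
    //     params.cb_eval_user_data = nullptr;

The names assigned via cb() in the diff below (ffn_up, ffn_geglu, ffn_gate_par, ...) are what such a callback sees in t->name.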

2 files changed: +41, -11 lines
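
On the second half of the commit message: the abandoned approach split the fused up+gate projection by hand with two views; ggml_geglu performs the split, the GELU, and the elementwise product in a single op. A minimal sketch of the equivalence, assuming a fused output x with ne = {2*n_ff, n_tokens}, and assuming the first half is the value and the second the gate (the real kernel may use the opposite order):

    // what ggml_geglu(ctx0, x) computes, spelled out with views
    ggml_tensor * x_half = ggml_view_2d(ctx0, x, n_ff, n_tokens, x->nb[1], 0);
    ggml_tensor * g_half = ggml_view_2d(ctx0, x, n_ff, n_tokens, x->nb[1],
            (size_t) n_ff * x->nb[0]);
    // out = x_half * gelu(g_half), per element
    ggml_tensor * out = ggml_mul(ctx0,
            ggml_cont(ctx0, x_half),
            ggml_gelu(ctx0, ggml_cont(ctx0, g_half)));

This is why the second diff can pass a NULL gate to build_ffn: with LLM_FFN_GEGLU the gating is already folded into the activation, so the LLM_FFN_PAR multiply is skipped.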

src/llama-graph.cpp

Lines changed: 15 additions & 1 deletion
@@ -645,8 +645,11 @@ ggml_tensor * llm_graph_context::build_ffn(
   llm_ffn_gate_type   type_gate,
                 int   il) const {
 
-
+    LLAMA_LOG_INFO("building lora: up is {%lld, %lld}\n input is {%lld, %lld}\n", up->ne[0], up->ne[1], cur->ne[0], cur->ne[1]);
+
     ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
+    LLAMA_LOG_INFO("Building FFN\n");
+    LLAMA_LOG_INFO("built lora: tmp is {%lld, %lld}\n", tmp->ne[0], tmp->ne[1]);
     cb(tmp, "ffn_up", il);
 
     if (up_b) {
@@ -669,6 +672,8 @@ ggml_tensor * llm_graph_context::build_ffn(
         case LLM_FFN_PAR:
             {
                 cur = build_lora_mm(gate, cur);
+                LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
+
                 cb(cur, "ffn_gate", il);
             } break;
     }
@@ -687,6 +692,10 @@ ggml_tensor * llm_graph_context::build_ffn(
         cur = tmp;
     }
 
+    if (gate && type_gate == LLM_FFN_PAR) {
+        LLAMA_LOG_INFO("Gate Exists and In Parallel\n");
+    }
+
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate && type_gate == LLM_FFN_PAR) {
@@ -735,6 +744,7 @@ ggml_tensor * llm_graph_context::build_ffn(
         case LLM_FFN_GEGLU:
             {
                 cur = ggml_geglu(ctx0, cur);
+                LLAMA_LOG_INFO("geglu split: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
                 cb(cur, "ffn_geglu", il);
             } break;
         case LLM_FFN_REGLU:
@@ -747,12 +757,16 @@ ggml_tensor * llm_graph_context::build_ffn(
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
+        LLAMA_LOG_INFO("cur @ tmp: cur is {%lld, %lld}\n tmp is {%lld, %lld}\n", cur->ne[0], cur->ne[1], tmp->ne[0], tmp->ne[1]);
         cur = ggml_mul(ctx0, cur, tmp);
+        LLAMA_LOG_INFO("res is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
         cb(cur, "ffn_gate_par", il);
     }
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        LLAMA_LOG_INFO("built lora: cur is {%lld, %lld}\n", cur->ne[0], cur->ne[1]);
+
         if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
             // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
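
The logs above print only the first two of a ggml tensor's four dimensions. A hypothetical helper along the same lines (not part of this commit; log_shape is an illustrative name) that prints the full shape:

    // illustrative debugging helper, not in the commit: logs all four
    // ne[] entries instead of just ne[0] and ne[1]
    static void log_shape(const char * name, const struct ggml_tensor * t) {
        LLAMA_LOG_INFO("%s: {%lld, %lld, %lld, %lld}\n", name,
                (long long) t->ne[0], (long long) t->ne[1],
                (long long) t->ne[2], (long long) t->ne[3]);
    }

For one-off runs, the llama-eval-callback tool from the commit message covers the same ground without editing the graph-building code.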

src/llama-model.cpp

Lines changed: 26 additions & 10 deletions
@@ -7672,7 +7672,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         ggml_tensor * ffn_up_view = model.layers[il].ffn_up;
 
         if (ffn_gate_view == nullptr && ffn_up_view) {
-
+
             // Case A: weight stored as (2*ffn, hidden) -> split rows into two (ffn x hidden)
             if( ffn_up_view->ne[0] == 2 * n_ff and ffn_up_view->ne[1] == n_embd) {
 
@@ -7685,33 +7685,49 @@ struct llm_build_modern_bert : public llm_graph_context {
                 ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                         /*ne0*/ n_ff, /*ne1*/ n_embd,
                         /*nb1*/ model.layers[il].ffn_up->nb[1],
+
                         /*offset_bytes*/ (size_t)n_ff * model.layers[il].ffn_up->nb[1]);
             }
+
+            /*
             else if ( ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
                 // top half
+                LLAMA_LOG_INFO("Case B:\n");
                 ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                         n_embd, n_ff,
                         model.layers[il].ffn_up->nb[1],
                         0);
+
                 ffn_up_view = ggml_cont(ctx0, ffn_up_view);
 
                 ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
                         n_embd, n_ff,
                         model.layers[il].ffn_up->nb[1],
-                        n_ff * sizeof(float));
+                        n_ff * model.layers[il].ffn_up->nb[0]);
                 ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
             }
-
-            ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
-            LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}",
-                    ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
-
+            */
+            //ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
+            //LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
+            //        ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
+            /*
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * up_s,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * gate_s,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ggml_tensor * down_s,
+            ggml_tensor * act_scales, */
             mlp_out = build_ffn(
                     h,
-                    model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
-                    ffn_gate_view,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
+                    model.layers[il].ffn_up,   /*up_b*/ NULL, /*up_shexp*/ NULL,
+                    NULL,                      /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                     model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
-                    /*expert_scores*/ NULL,
+                    /*act_scales*/ NULL,
                     LLM_FFN_GEGLU, LLM_FFN_PAR, il
             );
             cb(mlp_out, "ffn_out_geglu", il);
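
Besides switching build_ffn to a NULL gate, the one substantive fix in the now-commented-out Case B is the view offset: n_ff * sizeof(float) is only correct for f32 weights, while the tensor's own stride generalizes. A minimal sketch, with w standing in for the fused up weight (w is an illustrative name):

    // byte offsets should come from the tensor's strides: nb[0] is the
    // per-element stride along dim 0, so this also covers f16/bf16.
    // Block-quantized types pack elements into blocks, so they would
    // need block-aware math (e.g. ggml_row_size) instead of nb[0]
    // times an element count.
    const size_t off_f32_only = (size_t) n_ff * sizeof(float); // breaks for non-f32
    const size_t off_generic  = (size_t) n_ff * w->nb[0];      // the form the commit adopts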
