@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
 
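For readers new to M-RoPE: the buffer written above is laid out dimension-major, i.e. all n_tokens positions for dim 0, then dim 1, and so on. A minimal standalone sketch of that layout follows; the variable names and values are illustrative only, not part of this change.

    // Hypothetical illustration: M-RoPE position buffer for 3 text tokens at
    // positions {5, 6, 7}. The first three dims repeat the 1D position, the
    // fourth dim is all zeros, giving: 5 6 7  5 6 7  5 6 7  0 0 0
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t n_tokens = 3;
        const int32_t pos[] = {5, 6, 7};           // 1D text positions
        std::vector<int32_t> pos_data(4*n_tokens);
        for (int i = 0; i < n_tokens; ++i) {
            pos_data[               i] = pos[i];
            pos_data[    n_tokens + i] = pos[i];
            pos_data[2 * n_tokens + i] = pos[i];
            pos_data[3 * n_tokens + i] = 0;
        }
        for (int32_t v : pos_data) {
            printf("%d ", v);
        }
        printf("\n");
        return 0;
    }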
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
             ) * f_attn_temp_scale + 1.0;
         }
 
-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
 
@@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }
 
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
 
@@ -803,6 +817,10 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (down_b) {
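The GGML_PREC_F32 hint above is the usual ggml mechanism for opting a single matmul node out of lower-precision accumulation. A minimal sketch of the general pattern, assuming an existing ggml_context and two shape-compatible tensors (ctx, a, b are hypothetical names, not from this change):

    // Hypothetical sketch: request float32 accumulation for one matmul node.
    ggml_tensor * mul_mat_f32_acc(ggml_context * ctx, ggml_tensor * a, ggml_tensor * b) {
        ggml_tensor * c = ggml_mul_mat(ctx, a, b); // result shape [a->ne[1], b->ne[1], ...]
        ggml_mul_mat_set_prec(c, GGML_PREC_F32);   // backends accumulate this node in F32
        return c;
    }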
@@ -910,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }
 
     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }
 
-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }
 
-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
     if (!weight_before_ffn) {
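In effect, build_moe_ffn now supports experts without a gate projection: the activation is applied directly to the up projection and the element-wise gating multiply is skipped. A hedged sketch of the two paths with plain (non-expert) matmuls; w_up, w_gate, w_down and x are illustrative names, not from this change:

    // Hypothetical sketch of the gated vs. ungated FFN paths (SiLU activation).
    static ggml_tensor * ffn_silu(ggml_context * ctx, ggml_tensor * w_up,
                                  ggml_tensor * w_gate /* may be nullptr */,
                                  ggml_tensor * w_down, ggml_tensor * x) {
        ggml_tensor * up  = ggml_mul_mat(ctx, w_up, x);                // [n_ff, n_tokens]
        ggml_tensor * cur = w_gate ? ggml_mul_mat(ctx, w_gate, x) : up;
        cur = ggml_silu(ctx, cur);                                     // act(gate(x)) or act(up(x))
        if (w_gate) {
            cur = ggml_mul(ctx, cur, up);                              // SwiGLU-style gating
        }
        return ggml_mul_mat(ctx, w_down, cur);                         // [n_embd, n_tokens]
    }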
@@ -1014,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
 
     auto & cur = inp->pos;
 
-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1027,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
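The 1x1xN shape matters because ggml_mul broadcasts the second operand over the first: a [1, 1, n_tokens] scale can be repeated across the embedding and head dimensions of a per-token activation. A minimal sketch of how such a tensor is typically consumed; q and its shape are assumed for illustration, not taken from this change:

    // Hypothetical sketch: broadcast a per-token scale over an activation tensor.
    // attn_scale: [1, 1, n_tokens], q: [n_embd_head, n_head, n_tokens]
    ggml_tensor * attn_scale = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
    ggml_set_input(attn_scale);
    ggml_tensor * q_scaled   = ggml_mul(ctx0, q, attn_scale); // scales every head/feature of token i by attn_scale[i]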