@@ -55,7 +55,18 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd > 1) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the other dimensions are all 0, they are unused for text tokens
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[i] = ubatch->pos[i];
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
 
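A minimal, standalone sketch (not part of the patch) of the layout the new branch produces: the position buffer is dimension-major, so with `n_pos_per_embd = 4` the first `n_tokens` entries hold the 1D text positions and the remaining `3*n_tokens` entries stay zero. The token count and positions below are illustrative placeholders.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t; // matches the typedef in llama.h

int main() {
    const int64_t n_tokens       = 3;
    const int64_t n_pos_per_embd = 4;                  // M-RoPE: 4 position values per embedding
    const llama_pos text_pos[]   = {10, 11, 12};       // stand-in for ubatch->pos

    std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[i] = text_pos[i];                     // only the first dimension is filled
    }

    // prints: 10 11 12 0 0 0 0 0 0 0 0 0
    for (llama_pos p : pos_data) {
        printf("%d ", p);
    }
    printf("\n");
    return 0;
}
```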
@@ -71,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
             ) * f_attn_temp_scale + 1.0;
         }
 
-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
 
@@ -592,7 +603,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }
 
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
 
@@ -1018,11 +1029,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
 
     auto & cur = inp->pos;
 
-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1031,11 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this needs to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
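A minimal sketch (not part of the patch) of why the attention-temperature scale is shaped `[1, 1, n_tokens]`: every element belonging to token `t` gets multiplied by the same `attn_scale[t]`, so the scale only needs one value per token and broadcasts over the remaining dimensions. The activation shape and values below are illustrative assumptions, not the model's actual shapes.

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_embd_head = 2, n_head = 2, n_tokens = 3;

    std::vector<float> cur(n_embd_head*n_head*n_tokens, 1.0f); // activation, all ones
    std::vector<float> attn_scale = {0.5f, 1.0f, 2.0f};        // one scale per token (1x1xN)

    // emulate broadcasting of the trailing (token) dimension:
    // the same per-token scale is applied across heads and head dimensions
    for (int t = 0; t < n_tokens; ++t) {
        for (int h = 0; h < n_head; ++h) {
            for (int e = 0; e < n_embd_head; ++e) {
                cur[(t*n_head + h)*n_embd_head + e] *= attn_scale[t];
            }
        }
    }

    // prints: 0.5 1.0 2.0 (first element of each token's slice)
    printf("%.1f %.1f %.1f\n", cur[0], cur[n_embd_head*n_head], cur[2*n_embd_head*n_head]);
    return 0;
}
```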