@@ -55,7 +55,18 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd > 1) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the other dimensions are all 0, they are unused for text tokens
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[i] = ubatch->pos[i];
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
 
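A minimal, standalone sketch (not part of the patch) of the layout the new branch produces: the position buffer is dimension-major, so with `n_pos_per_embd = 4` the first `n_tokens` entries hold the 1D text positions and the remaining `3*n_tokens` entries stay zero. The token count and positions below are illustrative placeholders.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t; // matches the typedef in llama.h

int main() {
    const int64_t n_tokens       = 3;
    const int64_t n_pos_per_embd = 4;                  // M-RoPE: 4 position values per embedding
    const llama_pos text_pos[]   = {10, 11, 12};       // stand-in for ubatch->pos

    std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[i] = text_pos[i];                     // only the first dimension is filled
    }

    // prints: 10 11 12 0 0 0 0 0 0 0 0 0
    for (llama_pos p : pos_data) {
        printf("%d ", p);
    }
    printf("\n");
    return 0;
}
```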
@@ -71,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
             ) * f_attn_temp_scale + 1.0;
         }
 
-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
 
@@ -592,7 +603,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }
 
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
 
@@ -1018,11 +1029,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
 
     auto & cur = inp->pos;
 
-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1031,11 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this needs to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
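A minimal sketch (not part of the patch) of why the attention-temperature scale is shaped `[1, 1, n_tokens]`: every element belonging to token `t` gets multiplied by the same `attn_scale[t]`, so the scale only needs one value per token and broadcasts over the remaining dimensions. The activation shape and values below are illustrative assumptions, not the model's actual shapes.

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_embd_head = 2, n_head = 2, n_tokens = 3;

    std::vector<float> cur(n_embd_head*n_head*n_tokens, 1.0f); // activation, all ones
    std::vector<float> attn_scale = {0.5f, 1.0f, 2.0f};        // one scale per token (1x1xN)

    // emulate broadcasting of the trailing (token) dimension:
    // the same per-token scale is applied across heads and head dimensions
    for (int t = 0; t < n_tokens; ++t) {
        for (int h = 0; h < n_head; ++h) {
            for (int e = 0; e < n_embd_head; ++e) {
                cur[(t*n_head + h)*n_embd_head + e] *= attn_scale[t];
            }
        }
    }

    // prints: 0.5 1.0 2.0 (first element of each token's slice)
    printf("%.1f %.1f %.1f\n", cur[0], cur[n_embd_head*n_head], cur[2*n_embd_head*n_head]);
    return 0;
}
```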