Commit 4c7acaf: Fixes to compile
1 parent 07f588d
3 files changed: +49 -31 lines

include/llama.h

Lines changed: 4 additions & 1 deletion

@@ -267,7 +267,7 @@ extern "C" {
         int8_t * logits; // TODO: rename this to "output"

         struct ggml_tensor * embd_tensor;
-        struct ggml_tensor * cross_embd_tensor;
+        struct ggml_tensor * cross_embd;
     } llama_batch;

     enum llama_model_kv_override_type {
@@ -544,6 +544,9 @@ extern "C" {
     // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
     LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+    // Returns true if the model has an image attention KV cache
+    LLAMA_API bool llama_model_has_cross_kv(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
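
A minimal usage sketch for the new accessor (not part of this commit), assuming the standard llama.h loading entry points; only llama_model_has_cross_kv comes from this change, everything else is the existing public API:

// Sketch: load a model and ask whether it carries a cross-attention (image) KV cache.
// Assumes the usual llama.h entry points; error handling kept minimal.
#include "llama.h"
#include <stdio.h>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // new in this commit: query the cross (image) KV cache capability
    if (llama_model_has_cross_kv(model)) {
        printf("model uses a cross-attention (image) KV cache\n");
    } else {
        printf("model does not use a cross-attention KV cache\n");
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}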

src/llama-model.cpp

Lines changed: 30 additions & 24 deletions

@@ -1249,8 +1249,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

             switch (hparams.n_layer) {
-                case 32: model.type = e_model::MODEL_7B; break;
-                default: model.type = e_model::MODEL_UNKNOWN;
+                case 32: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
             }
         } break;
     case LLM_ARCH_WAVTOKENIZER_DEC:
@@ -3384,42 +3384,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_COGVLM:
             {
-                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

-                model.output_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

-                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

                 // Not supporting ctx_split
                 for (int i=0; i < n_layer; i++) {
-                    ggml_context * ctx_layer = ctx_for_layer(i);
-
-                    auto & layer = model.layers[i];
+                    auto & layer = layers[i];

-                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

-                    layer.wqkv_txt = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_TXT_QKV, "weight", i), {n_embd, n_embd * 3});
-                    layer.wqkv_img = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_IMG_QKV, "weight", i), {n_embd, n_embd * 3});
-                    layer.wdense_txt = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_TXT_DENSE, "weight", i), {n_embd, n_embd});
-                    layer.wdense_img = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_IMG_DENSE, "weight", i), {n_embd, n_embd});
+                    layer.wqkv_txt = create_tensor(tn(LLM_TENSOR_ATTN_TXT_QKV, "weight", i), {n_embd, n_embd * 3}, 0);
+                    layer.wqkv_img = create_tensor(tn(LLM_TENSOR_ATTN_IMG_QKV, "weight", i), {n_embd, n_embd * 3}, 0);
+                    layer.wdense_txt = create_tensor(tn(LLM_TENSOR_ATTN_TXT_DENSE, "weight", i), {n_embd, n_embd}, 0);
+                    layer.wdense_img = create_tensor(tn(LLM_TENSOR_ATTN_IMG_DENSE, "weight", i), {n_embd, n_embd}, 0);

-                    layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+                    layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);

-                    layer.wq_cross = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_cross});
+                    layer.wq_cross = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_cross}, 0);
                     // The input dimension is the number of dimensions from the cross vision encoder
                     // it might not be guaranteed that this is the same as the number of dimensions
                     // in the cogvlm attention calculation
-                    layer.wkv_cross = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CROSS_ATTN_KV, "weight", i), {n_embd_cross, n_embd_cross * 2});
-                    layer.wdense_cross = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CROSS_ATTN_DENSE, "weight", i), {n_embd_cross, n_embd});
+                    layer.wkv_cross = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_KV, "weight", i), {n_embd_cross, n_embd_cross * 2}, 0);
+                    layer.wdense_cross = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_DENSE, "weight", i), {n_embd_cross, n_embd}, 0);

-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

-                    layer.ffn_gate_txt = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_TXT_GATE, "weight", i), {n_embd, n_ff});
-                    layer.ffn_down_txt = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_TXT_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_up_txt = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_TXT_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_gate_img = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_IMG_GATE, "weight", i), {n_embd, n_ff});
-                    layer.ffn_down_img = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_IMG_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_up_img = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_IMG_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate_txt = create_tensor(tn(LLM_TENSOR_FFN_TXT_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down_txt = create_tensor(tn(LLM_TENSOR_FFN_TXT_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    layer.ffn_up_txt = create_tensor(tn(LLM_TENSOR_FFN_TXT_UP, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_gate_img = create_tensor(tn(LLM_TENSOR_FFN_IMG_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down_img = create_tensor(tn(LLM_TENSOR_FFN_IMG_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    layer.ffn_up_img = create_tensor(tn(LLM_TENSOR_FFN_IMG_UP, "weight", i), {n_embd, n_ff}, 0);
                }
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
@@ -4170,6 +4168,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_CHAMELEON:
+        case LLM_ARCH_COGVLM:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
@@ -4309,3 +4308,10 @@ bool llama_model_is_recurrent(const struct llama_model * model) {
        default: return false;
    }
}
+
+bool llama_model_has_cross_kv(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_COGVLM: return true;
+        default: return false;
+    }
+}

src/llama.cpp

Lines changed: 15 additions & 6 deletions

@@ -713,14 +713,14 @@ static struct ggml_tensor * llm_build_kv(
 // cross attention KV cache
 static struct ggml_tensor * llm_build_cross_kv(
         struct ggml_context * ctx,
-        struct llama_context * lctx,
+        struct llama_context & lctx,
         struct ggml_tensor * qcur,
         struct ggml_tensor * kcur,
         struct ggml_tensor * vcur,
         struct ggml_cgraph * graph,
         int64_t il
        ) {
-    llama_cross_kv_cache & kv = lctx->kv_cross;
+    llama_cross_kv_cache & kv = lctx.kv_cross;

    // Q has dimensions K, H, L, B
    // K = hidden dimension per head
@@ -8187,8 +8187,8 @@ struct llm_build_context {

    // Multiplied directly to Q
    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
-    const float cross_attn_scale = 1.0f / sqrtf(float(hparams.n_embd_cross / hparams.n_head()));

+    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;
    inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);

@@ -9495,7 +9495,7 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
    uint32_t n_seqs = 1; // TODO: worst-case number of sequences
    uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
    llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-    llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
+    llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
    ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);

    // initialize scheduler with the worst-case graph
@@ -9963,6 +9963,15 @@ struct llama_context * llama_init_from_model(
        return nullptr;
    }

+    if (llama_model_has_cross_kv(model)) {
+        // TODO: Add parameter for cross kv cache size
+        if (!llama_cross_kv_cache_init(ctx->kv_cross, ctx->model, type_k, type_v, 1024 * 6400, cparams.offload_kqv)) {
+            LLAMA_LOG_ERROR("%s: llama_cross_kv_cache_init() failed\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+    }
+
    {
        size_t memory_size_k = 0;
        size_t memory_size_v = 0;
@@ -10058,7 +10067,7 @@ struct llama_context * llama_init_from_model(
        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
        llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph

-        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
+        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
        ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);

        // reserve pp graph first so that buffers are only allocated once
@@ -10067,7 +10076,7 @@ struct llama_context * llama_init_from_model(
        int n_nodes_pp = ggml_graph_n_nodes(gf_pp);

        // reserve with tg graph to get the number of splits and nodes
-        llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
+        llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
        ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
        ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
        int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());
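
For context, a hypothetical call-site sketch (not part of this commit) of how the pieces above could fit together: it assumes the decode path consumes the renamed llama_batch.cross_embd field when llama_model_has_cross_kv() reports true, and that the image embedding tensor is produced elsewhere by a vision encoder.

// Hypothetical sketch: attach a precomputed image embedding to a batch before decoding.
// The cross_embd field and llama_model_has_cross_kv() come from this commit; how the
// decode path consumes cross_embd is an assumption here, not something this diff shows.
#include "llama.h"

static int decode_with_image(struct llama_context * ctx,
                             const struct llama_model * model,
                             llama_token * tokens, int32_t n_tokens,
                             struct ggml_tensor * image_embd) { // from a vision encoder (hypothetical)
    struct llama_batch batch = llama_batch_get_one(tokens, n_tokens);

    if (llama_model_has_cross_kv(model)) {
        batch.cross_embd = image_embd; // field renamed from cross_embd_tensor in this commit
    }

    return llama_decode(ctx, batch); // 0 on success
}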
