ggml-org · ggerganov · May 19, 2025 · May 17, 2025 · May 19, 2025 · May 19, 2025
diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp
@@ -1481,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
         },
     },
     {

diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp
@@ -1704,10 +1704,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         }
     }
 
-    LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
-    kv_self->state_write(io);
+    if (kv_self != nullptr) {
+        LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
+        kv_self->state_write(io);
+    }
 
     return io.n_bytes();
 }

diff --git a/examples/talk-llama/llama-kv-cache.cpp b/examples/talk-llama/llama-kv-cache.cpp
@@ -441,6 +441,13 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
 
 void llama_kv_cache_unified::set_full() {
     n = size;
+
+    // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+    //   affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+    //   we only need to guarantee that the head position cannot cause out-of-bounds views of the K and V tensors,
+    //   and setting it to 0 is the simplest way to achieve that
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_unified::sbatch_init(
@@ -1712,6 +1719,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
 
 void llama_kv_cache_recurrent::set_full() {
     n = size;
+    head = 0; // NOTE(review): presumably reset for the same reason as in llama_kv_cache_unified::set_full() - confirm
 }
 
 llama_sbatch llama_kv_cache_recurrent::sbatch_init(

diff --git a/examples/talk-llama/llama-kv-cache.h b/examples/talk-llama/llama-kv-cache.h
@@ -171,11 +171,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
@@ -343,11 +340,8 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build

diff --git a/examples/talk-llama/llama-model-loader.cpp b/examples/talk-llama/llama-model-loader.cpp
@@ -469,7 +469,7 @@ llama_model_loader::llama_model_loader(
 
     meta.reset(gguf_init_from_file(fname.c_str(), params));
     if (!meta) {
-        throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
+        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
     }
 
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -528,7 +528,7 @@ llama_model_loader::llama_model_loader(
             };
             gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
             if (!ctx_gguf) {
-                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
+                throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
             }
 
             // check idx
@@ -822,13 +822,18 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
-            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
-            if (!reg) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            bool is_numa = false;
+
+            auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (dev) {
+                auto * reg = ggml_backend_dev_backend_reg(dev);
+                auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+                if (is_numa_fn) {
+                    is_numa = is_numa_fn();
+                }
             }
 
-            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
+            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
             mmaps_used.emplace_back(mapping->size(), 0);
             if (mlock_mmaps) {
                 std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());