
Commit 1bf5cf0

Merge branch 'mradermacher' into master
2 parents 4a5686d + 322338b

9 files changed: +135 -28 lines changed

common/common.cpp

Lines changed: 4 additions & 0 deletions

@@ -915,6 +915,10 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
+        if(getenv("DRYRUN")) {
+            LOG_ERR("%s: Dryrun completed!\n", __func__);
+            exit(0);
+        }
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
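
The same guard recurs throughout this commit: every allocation or load site checks getenv("DRYRUN") and, when the variable is set, reports what it would have done instead of doing it, until the context creation above prints "Dryrun completed!" and exits. A minimal standalone sketch of that pattern follows; it uses only standard C++, and the helper names (dryrun_enabled, alloc_or_report) are illustrative, not part of the patch.

// Minimal sketch of the DRYRUN guard used across this commit (illustrative only;
// helper names are hypothetical). getenv() returns a non-NULL pointer whenever the
// variable is set, so DRYRUN=1, DRYRUN=anything, or even an empty DRYRUN= enables it.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

static bool dryrun_enabled() {
    return std::getenv("DRYRUN") != nullptr;
}

// In a dry run, report the size that would have been allocated and return nullptr,
// mirroring the buffer-type hooks patched in ggml-backend.cpp and ggml-cuda.cu.
static void * alloc_or_report(std::size_t size) {
    if (dryrun_enabled()) {
        std::fprintf(stderr, "[DRYRUN]: %zu\n", size);
        return nullptr;
    }
    return std::malloc(size);
}

int main() {
    void * p = alloc_or_report(1024);
    if (p == nullptr && dryrun_enabled()) {
        std::fprintf(stderr, "Dryrun completed!\n");
        return 0;
    }
    std::free(p);
    return 0;
}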

ggml/src/ggml-backend.cpp

Lines changed: 5 additions & 0 deletions

@@ -1966,6 +1966,11 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][CPU]: %ld\n", size);
+        return NULL;
+    }
+
     void * data = ggml_aligned_malloc(size);
 
     if (data == NULL) {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 19 additions & 4 deletions

@@ -679,6 +679,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 
     ggml_cuda_set_device(buft_ctx->device);
 
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", buft_ctx->device, size);
+        return nullptr;
+    }
+
     void * dev_ptr;
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
@@ -857,12 +862,18 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_
         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_cuda_set_device(id);
+
         char * buf;
-        CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
+        if(getenv("DRYRUN")) {
+            GGML_LOG_ERROR("[DRYRUN][GPU%d]: %ld\n", id, size);
+            buf = nullptr;
+        } else {
+            CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
 
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            // set padding to 0 to avoid possible NaN values
+            if (size > original_size) {
+                CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+            }
         }
 
         extra->data_device[id] = buf;
@@ -1118,6 +1129,10 @@ static void * ggml_cuda_host_malloc(size_t size) {
 }
 
 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    if(getenv("DRYRUN")) {
+        GGML_LOG_ERROR("[DRYRUN][PINNED]: %ld\n", size);
+        return nullptr;
+    }
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {

gguf-py/gguf/gguf_writer.py

Lines changed: 4 additions & 0 deletions

@@ -238,6 +238,10 @@ def write_kv_data_to_file(self) -> None:
         kv_bytes = bytearray()
 
         for key, val in kv_data.items():
+            if val.type != GGUFValueType.ARRAY or len (val.value) < 50:
+                print("gguf serialising key ", key, "value", val)
+            else:
+                print("gguf serialising key ", key, "value-suppressed")
             kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
             kv_bytes += self._pack_val(val.value, val.type, add_vtype=True, sub_type=val.sub_type)
 
src/llama-kv-cache-unified.cpp

Lines changed: 4 additions & 0 deletions

@@ -135,6 +135,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if(getenv("DRYRUN")) {
+                LLAMA_LOG_ERROR("%s: pretend allocating buffer for kv cache was successful due to dry-run being enabled\n", __func__);
+                return;
+            }
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }
 
src/llama-memory-recurrent.cpp

Lines changed: 4 additions & 0 deletions

@@ -99,6 +99,10 @@ llama_memory_recurrent::llama_memory_recurrent(
 
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if(getenv("DRYRUN")) {
+                LLAMA_LOG_ERROR("%s: pretend allocating buffer for rs cache was successful due to dry-run being enabled\n", __func__);
+                return;
+            }
             throw std::runtime_error("failed to allocate buffer for rs cache");
         }
         ggml_backend_buffer_clear(buf, 0);

src/llama-model-loader.cpp

Lines changed: 7 additions & 0 deletions

@@ -711,6 +711,13 @@ llama_model_loader::llama_model_loader(
         use_mmap = false;
     }
 
+    if(getenv("DRYRUN")) {
+        if (use_mmap) {
+            LLAMA_LOG_WARN("%s: mmap is not supported for dry-run so it is now disabled\n", __func__);
+            use_mmap = false;
+        }
+    }
+
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
 }

src/llama-model.cpp

Lines changed: 21 additions & 11 deletions

@@ -4845,7 +4845,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
-                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                if(getenv("DRYRUN")) {
+                    LLAMA_LOG_WARN("%s: pretend allocating %s buffer was successful due to dry-run being enabled\n", __func__, ggml_backend_buft_name(buft));
+                } else {
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+                }
             }
             pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
@@ -4863,10 +4867,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 throw std::runtime_error("failed to allocate buffer");
             }
 
-            for (auto & buf : buf_map) {
-                // indicate that this buffer contains weights
-                // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-                ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            if(!getenv("DRYRUN")) {
+                for (auto & buf : buf_map) {
+                    // indicate that this buffer contains weights
+                    // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+                    ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+                }
             }
 
             ctx_bufs.emplace_back(ctx, buf_map);
@@ -4887,8 +4893,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    if(!getenv("DRYRUN")) {
+        for (auto & buf : pimpl->bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
     }
 
     // populate tensors_by_name
@@ -4899,11 +4907,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+    if(!getenv("DRYRUN")) {
+        for (auto & it : ctx_bufs) {
+            ggml_context * ctx = it.first;
+            auto & bufs = it.second;
+            if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
                 return false;
+            }
         }
     }
 
tools/imatrix/imatrix.cpp

Lines changed: 67 additions & 13 deletions

@@ -33,6 +33,7 @@ struct Stats {
     std::vector<float> values;
     std::vector<int> counts;
    int ncall = 0;
+    int n_as = 1;
 };
 
 class IMatrixCollector {
@@ -127,11 +128,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (e.values.empty()) {
             e.values.resize(src1->ne[0]*n_as, 0);
             e.counts.resize(src1->ne[0]*n_as, 0);
+            e.n_as = n_as;
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
             exit(1); //GGML_ABORT("fatal error");
         }
+        else if (e.n_as != n_as) {
+            LOG_ERR("%s: inconsistent n_as for %s (%d vs %d)\n", __func__, wname.c_str(), e.n_as, n_as);
+        }
         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
         // loop over all possible experts, regardless if they are used or not in the batch
         for (int ex = 0; ex < n_as; ++ex) {
@@ -173,23 +178,36 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     } else {
         auto & e = m_stats[wname];
         if (e.values.empty()) {
-            e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
+            if (src0->ne[3] > 1) {
+                LOG_ERR("Unsupported 4D tensor %s\n", wname.c_str());
+                exit(1);
+            }
+            // If we have a 3D tensor as it is the case for the attn_k_b and attn_v_b for DeepSeek MLA models,
+            // than we need to compute the imatrix for each head, and not just one imatrx for all heads.
+            // Hence, the storage we need is src0->ne[0]*src0->ne[2].
+            e.values.resize(src0->ne[0]*src0->ne[2], 0);
+            e.counts.resize(src0->ne[0]*src0->ne[2], 0);
         }
-        else if (e.values.size() != (size_t)src1->ne[0]) {
+        else if (e.values.size() != (size_t)(src0->ne[0]*src0->ne[2])) {
             LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-        for (int row = 0; row < (int)src1->ne[1]; ++row) {
-            const float * x = (const float *) (data + row * src1->nb[1]);
-            for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                e.values[j] += x[j]*x[j];
-                e.counts[j]++;
-                if (!std::isfinite(e.values[j])) {
-                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
-                    exit(1);
+        int rk2 = src1->ne[2]/src0->ne[2];
+        for (int i12 = 0; i12 < (int)src1->ne[2]; ++i12) { // i.e., loop over attention heads for MLA models
+            int i02 = i12/rk2;
+            auto values = e.values.data() + i02*src0->ne[0];
+            auto counts = e.counts.data() + i02*src0->ne[0];
+            for (int i11 = 0; i11 < (int)src1->ne[1]; ++i11) {
+                const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    values[j] += x[j]*x[j];
+                    counts[j]++;
+                    if (!std::isfinite(values[j])) {
+                        LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
+                        exit(1);
+                    }
                 }
             }
         }
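
The hunk above sizes the non-expert statistics per attention head: a 3D weight src0 with ne[2] heads gets src0->ne[0]*src0->ne[2] accumulators, and activation slice i12 of src1 is folded into head i02 = i12/rk2, where rk2 = src1->ne[2]/src0->ne[2] is the broadcast factor. A small standalone sketch of just that index mapping (the head and slice counts below are made up for illustration):

// Illustration of the head-index mapping used in the hunk above (hypothetical shapes).
#include <cstdio>

int main() {
    const int n_head       = 4;             // plays the role of src0->ne[2] in the patch
    const int n_act_slices = 8;             // plays the role of src1->ne[2] in the patch
    const int rk2          = n_act_slices / n_head;  // broadcast factor

    for (int i12 = 0; i12 < n_act_slices; ++i12) {
        const int i02 = i12 / rk2;          // head whose accumulators receive this slice
        std::printf("activation slice %d -> head %d\n", i12, i02);
    }
    return 0;
}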
@@ -221,6 +239,10 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     int n_entries = 0;
     std::vector<std::string> to_store;
 
+    // Retrieve the REQUIRED_GOOD_EXPERT_PERCENTAGE from the environment
+    const char* required_good_expert_percentage_env_value = getenv("REQUIRED_GOOD_EXPERT_PERCENTAGE");
+    double required_good_expert_percentage = required_good_expert_percentage_env_value ? std::clamp(std::stod(required_good_expert_percentage_env_value), 0.0, 100.0) : 90.0;
+
     bool is_first = true; // for printing
     for (const auto & kv : m_stats) {
         const int n_all = kv.second.counts.size();
@@ -247,8 +269,40 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         }
 
         if (n_zeros > 0) {
-            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
-            continue;
+            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            bool store_it = false;
+            if (kv.second.n_as > 1) {
+                int n_per_expert = n_all / kv.second.n_as;
+                std::vector<int> bad_experts;
+                bad_experts.reserve(kv.second.n_as);
+                for (int i = 0; i < kv.second.n_as; ++i) {
+                    auto counts = kv.second.counts.data() + i*n_per_expert;
+                    int nz_i = 0;
+                    for (int j = 0; j < n_per_expert; ++j) {
+                        if (counts[j] == 0) ++nz_i;
+                    }
+                    if (nz_i > 0) bad_experts.push_back(i);
+                }
+                size_t required_good_experts = round((kv.second.n_as * required_good_expert_percentage) / 100.0);
+                size_t good_experts = kv.second.n_as - bad_experts.size();
+                LOG_WRN("%s: %d out of %d experts are missing data - %ld out of %ld required\n", __func__, int(bad_experts.size()), kv.second.n_as, good_experts, required_good_experts);
+                if (good_experts >= required_good_experts) {
+                    LOG_WRN("%s: %d out of %d experts are missing data - storing but be aware\n", __func__, int(bad_experts.size()), kv.second.n_as);
+                    store_it = true;
+                    for (auto i : bad_experts) {
+                        auto counts = const_cast<int*>(kv.second.counts.data()) + i * n_per_expert;
+                        auto values = const_cast<float*>(kv.second.values.data()) + i * n_per_expert;
+                        for (int j = 0; j < n_per_expert; ++j) {
+                            counts[j] = 1;
+                            values[j] = 1;
+                        }
+                    }
+                }
+            }
+            if (!store_it) {
+                LOG_WRN("%s: Skipping expert with missing data!\n", __func__);
+                continue;
+            }
         }
 
         n_entries++;
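
The new REQUIRED_GOOD_EXPERT_PERCENTAGE knob (default 90, clamped to 0..100) decides whether an expert tensor with partial data is still stored: the entry is kept when at least round(n_as * percentage / 100) experts have complete counts, and the values and counts of the remaining bad experts are filled with 1 so the stored imatrix stays usable. A small sketch of just that threshold arithmetic, with made-up expert counts (and, like the patch, assuming the variable parses as a number):

// Worked example of the expert threshold introduced in save_imatrix (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    // Same default and clamping as the patch: unset -> 90, otherwise clamp to [0, 100].
    const char * env = std::getenv("REQUIRED_GOOD_EXPERT_PERCENTAGE");
    const double pct = env ? std::clamp(std::stod(env), 0.0, 100.0) : 90.0;

    const int n_as        = 64;  // hypothetical number of experts in the tensor
    const int bad_experts = 5;   // hypothetical experts with at least one zero count

    const std::size_t required_good = (std::size_t) std::round(n_as * pct / 100.0); // 64 * 90 / 100 = 57.6 -> 58
    const std::size_t good          = (std::size_t) (n_as - bad_experts);           // 59

    std::printf("%zu good experts, %zu required -> %s\n",
                good, required_good,
                good >= required_good ? "store (fill bad experts with 1)" : "skip");
    return 0;
}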
