
Commit 222c5ad

Cleanup code
1 parent 9818779 commit 222c5ad

8 files changed: +55 -67 lines changed


src/llama-graph.cpp

Lines changed: 0 additions & 1 deletion
```diff
@@ -13,7 +13,6 @@
 #include <cassert>
 #include <cmath>
 #include <cstring>
-#include <iostream>
 
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
```

src/llama-hparams.cpp

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,7 +1,6 @@
 #include "llama-hparams.h"
 
 #include "ggml.h"
-#include <iostream>
 
 void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
     for (uint32_t il = 0; il < n_layer; ++il) {
```

src/llama-kv-cache-unified.cpp

Lines changed: 3 additions & 4 deletions
```diff
@@ -11,7 +11,6 @@
 #include <limits>
 #include <map>
 #include <stdexcept>
-#include <iostream>
 
 //
 // llama_kv_cache_unified
@@ -1744,7 +1743,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
 llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {}
 
 llama_kv_cache_unified_context::llama_kv_cache_unified_context(
-        llama_kv_cache_unified * kv) : kv(kv), status(LLAMA_MEMORY_STATUS_SUCCESS) {
+        llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
     n_kv = kv->get_size();
     head = 0;
 }
@@ -1753,7 +1752,7 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv,
         llama_context * lctx,
         bool do_shift,
-        defrag_info dinfo) : kv(kv), status(LLAMA_MEMORY_STATUS_SUCCESS), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
+        defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
     if (!do_shift && this->dinfo.empty()) {
         status = LLAMA_MEMORY_STATUS_NO_UPDATE;
     }
@@ -1762,7 +1761,7 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
 llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv,
         llama_kv_cache_unified::ubatch_heads heads,
-        std::vector<llama_ubatch> ubatches) : kv(kv), status(LLAMA_MEMORY_STATUS_SUCCESS), heads(std::move(heads)), ubatches(std::move(ubatches)) {
+        std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), heads(std::move(heads)), ubatches(std::move(ubatches)) {
 }
 
 llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
```
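A note on the initializer-list changes above: the constructors now name `status` before `kv`, in lockstep with the reordered member declarations in src/llama-kv-cache-unified.h below. C++ initializes non-static members in declaration order no matter how the mem-initializer list is written, and compilers warn when the list disagrees (GCC's `-Wreorder`, Clang's `-Wreorder-ctor`). A minimal sketch with hypothetical names:

```cpp
struct ctx_sketch {
    int   status; // declared first  -> constructed first
    void *kv;     // declared second -> constructed second

    // Listing kv(...) before status(...) would not change the actual
    // construction order; it would only trigger -Wreorder. Keeping the
    // list in declaration order, as this commit does, avoids the warning.
    ctx_sketch(void *kv_) : status(0), kv(kv_) {}
};
```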

src/llama-kv-cache-unified.h

Lines changed: 5 additions & 4 deletions
```diff
@@ -239,7 +239,7 @@ class llama_kv_cache_unified_context : public llama_memory_context_i {
     // Delete copy constructor and copy assignment to prevent shallow copies
     llama_kv_cache_unified_context(const llama_kv_cache_unified_context&) = delete;
     llama_kv_cache_unified_context& operator=(const llama_kv_cache_unified_context&) = delete;
-
+
     // Delete move constructor and move assignment to prevent issues
     llama_kv_cache_unified_context(llama_kv_cache_unified_context&&) = delete;
     llama_kv_cache_unified_context& operator=(llama_kv_cache_unified_context&&) = delete;
@@ -273,12 +273,10 @@ class llama_kv_cache_unified_context : public llama_memory_context_i {
     void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
 
-    llama_kv_cache_unified * kv;
-    // the beginning of the current slot in which the ubatch will be inserted
-    int32_t head;
 private:
     llama_memory_status status;
 
+    llama_kv_cache_unified * kv;
     llama_context * lctx;
 
     //
@@ -307,4 +305,7 @@ class llama_kv_cache_unified_context : public llama_memory_context_i {
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // as the cache gets filled, the benefit from this heuristic disappears
     int32_t n_kv;
+
+    // the beginning of the current slot in which the ubatch will be inserted
+    int32_t head;
 };
```
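Besides matching the initializer lists, this hunk tightens encapsulation: `kv` and `head` move out of the public section into the private one. Declaration order matters in its own right, because an initializer that reads another member is only well-defined if that member is declared (and therefore constructed) earlier. A hedged sketch of the pitfall, with hypothetical names rather than llama.cpp code:

```cpp
#include <cstdint>

struct order_sketch {
    int32_t n_kv; // constructed first
    int32_t head; // constructed second, so head(n_kv - 1) below is safe

    // If head were declared above n_kv, head(n_kv - 1) would read an
    // uninitialized n_kv, even though the list names n_kv first.
    order_sketch() : n_kv(32), head(n_kv - 1) {}
};
```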

src/llama-memory-hybrid.cpp

Lines changed: 0 additions & 2 deletions
```diff
@@ -4,8 +4,6 @@
 #include "llama-model.h"
 #include "llama-context.h"
 
-#include <iostream>
-
 //
 // llama_memory_hybrid
 //
```

src/llama-model-loader.cpp

Lines changed: 0 additions & 4 deletions
```diff
@@ -1059,10 +1059,6 @@ bool llama_model_loader::load_all_data(
                 mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                 mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
             } else {
-                // Check if tensor has a buffer before calling ggml_backend_tensor_set
-                if (cur->buffer == nullptr) {
-                    throw std::runtime_error(format("tensor '%s' has no buffer allocated", ggml_get_name(cur)));
-                }
                 ggml_backend_tensor_set(cur, data, 0, n_size);
             }
         } else {
```
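The deleted guard threw before handing a buffer-less tensor to `ggml_backend_tensor_set`; presumably it was debugging scaffolding, and recent ggml builds already assert internally when a tensor's buffer is not set (an assumption on my part; the commit message only says "Cleanup code"). If a recoverable error were still wanted at this call site, a hypothetical wrapper, not part of llama.cpp, might look like:

```cpp
#include <cstddef>
#include <stdexcept>
#include <string>

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: preserves the exception-based check without
// cluttering the loader loop. Not part of the actual commit.
static void tensor_set_checked(ggml_tensor * cur, const void * data, size_t n_size) {
    if (cur->buffer == nullptr) {
        throw std::runtime_error(std::string("tensor '") + ggml_get_name(cur) +
                                 "' has no buffer allocated");
    }
    ggml_backend_tensor_set(cur, data, 0, n_size);
}
```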

src/llama-model.cpp

Lines changed: 0 additions & 3 deletions
```diff
@@ -3,7 +3,6 @@
 #include "ggml.h"
 #include "llama-arch.h"
 #include "llama-impl.h"
-#include "llama-mmap.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
@@ -23,9 +22,7 @@
 #include <functional>
 #include <map>
 #include <regex>
-#include <sstream>
 #include <stdexcept>
-#include <iostream>
 
 const char * llm_type_name(llm_type type) {
     switch (type) {
```
