Skip to content

Commit f83e5fc

Browse files
Authored commit — ENH: Update llama.cpp to b7964 (#112)
1 parent 9eadc1f commit f83e5fc

File tree

10 files changed

+300
-97
lines changed

10 files changed

+300
-97
lines changed

src/llama.cpp/include/common.h

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,17 @@ enum common_params_sampling_config : uint64_t {
164164
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
165165
};
166166

167+
enum common_speculative_type {
168+
COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
169+
COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
170+
COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
171+
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
172+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
173+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
174+
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
175+
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
176+
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
177+
};
167178

168179
// sampling parameters
169180
struct common_params_sampling {
@@ -242,25 +253,55 @@ struct common_params_model {
242253
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
243254
};
244255

256+
struct common_ngram_mod;
257+
245258
struct common_params_speculative {
246-
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
259+
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
247260

248-
int32_t n_ctx = 0; // draft context size
249-
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
250-
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
251-
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
252-
float p_split = 0.1f; // speculative decoding split probability
253-
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
254-
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
255-
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
261+
// general-purpose speculative decoding parameters
262+
263+
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
264+
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
265+
float p_split = 0.1f; // speculative decoding split probability
266+
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
267+
268+
// ngram-based speculative decoding
269+
270+
uint16_t ngram_size_n = 12; // ngram size for lookup
271+
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
272+
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
273+
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
274+
275+
std::shared_ptr<common_ngram_mod> ngram_mod;
276+
277+
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
278+
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
279+
280+
// draft-model speculative decoding
281+
282+
struct common_params_model mparams_dft;
283+
284+
llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
285+
286+
llama_context_params cparams_dft; // these are the parameters for the draft llama_context
287+
288+
int32_t n_ctx = 0; // draft context size
289+
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
256290

257291
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
258292
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
259293

260294
struct cpu_params cpuparams;
261295
struct cpu_params cpuparams_batch;
262296

263-
struct common_params_model model;
297+
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
298+
299+
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
300+
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
301+
302+
bool has_dft() const {
303+
return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
304+
}
264305
};
265306

266307
struct common_params_vocoder {
@@ -378,8 +419,6 @@ struct common_params {
378419
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
379420
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
380421
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
381-
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
382-
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
383422
std::string logits_file = ""; // file for saving *all* logits // NOLINT
384423

385424
// llama-debug specific options
@@ -438,7 +477,7 @@ struct common_params {
438477

439478
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
440479
bool use_mmap = true; // enable mmap to use filesystem cache
441-
bool use_direct_io = true; // read from disk without buffering for faster model loading
480+
bool use_direct_io = false; // read from disk without buffering
442481
bool use_mlock = false; // use mlock to keep model in memory
443482
bool verbose_prompt = false; // print prompt tokens before generation
444483
bool display_prompt = true; // print prompt before generation
@@ -575,10 +614,6 @@ struct common_params {
575614
// return false from callback to abort model loading or true to continue
576615
llama_progress_callback load_progress_callback = NULL;
577616
void * load_progress_callback_user_data = NULL;
578-
579-
bool has_speculative() const {
580-
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
581-
}
582617
};
583618

584619
// call once at the start of a program if it uses libcommon
@@ -714,8 +749,6 @@ struct common_init_result {
714749

715750
std::vector<llama_adapter_lora_ptr> & lora();
716751

717-
void free_context();
718-
719752
private:
720753
struct impl;
721754
std::unique_ptr<impl> pimpl;

src/llama.cpp/include/ggml.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
// This documentation is still a work in progress.
77
// If you wish some specific topics to be covered, feel free to drop a comment:
88
//
9-
// https://github.com/ggerganov/whisper.cpp/issues/40
9+
// https://github.com/ggml-org/whisper.cpp/issues/40
1010
//
1111
// ## Overview
1212
//

src/llama.cpp/include/llama.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ extern "C" {
309309
// Keep the booleans together to avoid misalignment during copy-by-value.
310310
bool vocab_only; // only load the vocabulary, no weights
311311
bool use_mmap; // use mmap if possible
312-
bool use_direct_io; // use direct io, takes precedence over use_mmap
312+
bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
313313
bool use_mlock; // force system to keep model in RAM
314314
bool check_tensors; // validate model tensor data
315315
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -1476,12 +1476,12 @@ extern "C" {
14761476
/// @details Build a split GGUF final path for this chunk.
14771477
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
14781478
// Returns the split_path length.
1479-
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
1479+
LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);
14801480

14811481
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
14821482
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
14831483
// Returns the split_prefix length.
1484-
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
1484+
LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
14851485

14861486
// Print system information
14871487
LLAMA_API const char * llama_print_system_info(void);

src/xllamacpp/server.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,4 +632,45 @@ void build_tensor_buffer_overrides(
632632
value = string_join(parts, ",");
633633
}
634634

635+
// Helper function to parse device list
636+
std::vector<ggml_backend_dev_t> parse_device_list(const std::string &value) {
637+
std::vector<ggml_backend_dev_t> devices;
638+
auto dev_names = string_split<std::string>(value, ',');
639+
if (dev_names.empty()) {
640+
throw std::invalid_argument("no devices specified");
641+
}
642+
if (dev_names.size() == 1 && dev_names[0] == "none") {
643+
devices.push_back(nullptr);
644+
} else {
645+
for (const auto &device : dev_names) {
646+
auto *dev = ggml_backend_dev_by_name(device.c_str());
647+
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
648+
throw std::invalid_argument(
649+
string_format("invalid device: %s", device.c_str()));
650+
}
651+
devices.push_back(dev);
652+
}
653+
devices.push_back(nullptr);
654+
}
655+
return devices;
656+
}
657+
658+
// Helper function to build device string from vector of ggml_backend_dev_t
659+
std::string
660+
build_device_string(const std::vector<ggml_backend_dev_t> &devices) {
661+
if (devices.empty()) {
662+
return "";
663+
}
664+
if (devices.size() == 1 && devices[0] == nullptr) {
665+
return "";
666+
}
667+
std::vector<std::string> names;
668+
for (size_t i = 0; i < devices.size() - 1; ++i) { // Skip the trailing nullptr
669+
if (devices[i]) {
670+
names.emplace_back(ggml_backend_dev_name(devices[i]));
671+
}
672+
}
673+
return string_join(names, ",");
674+
}
675+
635676
} // namespace xllamacpp

src/xllamacpp/server.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,6 @@ void parse_tensor_buffer_overrides(
5656
void build_tensor_buffer_overrides(
5757
const std::vector<llama_model_tensor_buft_override> &overrides,
5858
std::string &value);
59+
std::vector<ggml_backend_dev_t> parse_device_list(const std::string &value);
60+
std::string build_device_string(const std::vector<ggml_backend_dev_t> &devices);
5961
} // namespace xllamacpp

src/xllamacpp/server.pxd

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# distutils: language=c++
22

3-
from xllamacpp.xllamacpp cimport common_params, ggml_backend_dev_props, llama_model_tensor_buft_override
3+
from xllamacpp.xllamacpp cimport common_params, ggml_backend_dev_props, llama_model_tensor_buft_override, ggml_backend_dev_t
44
from libcpp cimport bool as c_bool
55
from libcpp.string cimport string as std_string
66
from libcpp.vector cimport vector as std_vector
@@ -41,3 +41,5 @@ cdef extern from "server.h" namespace "xllamacpp" nogil:
4141
const std_string & value, std_vector[llama_model_tensor_buft_override] & overrides) except +
4242
void c_build_tensor_buffer_overrides "xllamacpp::build_tensor_buffer_overrides" (
4343
const std_vector[llama_model_tensor_buft_override] & overrides, std_string & value) except +
44+
std_vector[ggml_backend_dev_t] c_parse_device_list "xllamacpp::parse_device_list" (const std_string & value) except +
45+
std_string c_build_device_string "xllamacpp::build_device_string" (const std_vector[ggml_backend_dev_t] & devices) except +

src/xllamacpp/xllamacpp.pxd

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# distutils: language=c++
22

3-
from libc.stdint cimport int32_t, uint32_t, int64_t, int8_t, uint64_t
3+
from libc.stdint cimport int32_t, uint32_t, int64_t, int8_t, uint64_t, uint16_t
44
from libcpp.string cimport string as std_string
55
from libcpp.vector cimport vector as std_vector
66
from libcpp.set cimport set as std_set
@@ -349,6 +349,18 @@ cdef extern from "common.h":
349349
bint backend_sampling
350350

351351

352+
cpdef enum common_speculative_type:
353+
COMMON_SPECULATIVE_TYPE_NONE # no speculative decoding
354+
COMMON_SPECULATIVE_TYPE_DRAFT # draft model
355+
COMMON_SPECULATIVE_TYPE_EAGLE3 # eagle draft model
356+
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE # simple self-speculative decoding
357+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K # self-speculative decoding with n-gram keys only
358+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V # self-speculative decoding with n-gram keys and 4 m-gram values
359+
COMMON_SPECULATIVE_TYPE_NGRAM_MOD
360+
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE # self-speculative decoding with 3-level n-gram cache
361+
COMMON_SPECULATIVE_TYPE_COUNT # number of types, unknown type
362+
363+
352364
ctypedef struct common_params_model:
353365
std_string path # model local path // NOLINT
354366
std_string url # model url to download // NOLINT
@@ -357,23 +369,41 @@ cdef extern from "common.h":
357369
std_string docker_repo # Docker repo // NOLINT
358370
std_string name # in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
359371

360-
ctypedef struct common_params_speculative:
361-
std_vector[ggml_backend_dev_t] devices # devices to use for offloading
362-
int32_t n_ctx # draft context size
363-
int32_t n_max # maximum number of tokens to draft during speculative decoding
364-
int32_t n_min # minimum number of draft tokens to use for speculative decoding
365-
int32_t n_gpu_layers # number of layers to store in VRAM for the draft model (-1 - use default)
366-
float p_split # speculative decoding split probability
367-
float p_min # minimum speculative decoding probability (greedy)
368-
std_vector[std_pair[std_string, std_string]] replacements # main to speculative model replacements
369-
std_vector[llama_model_tensor_buft_override] tensor_buft_overrides
372+
ctypedef struct common_ngram_mod
373+
ctypedef struct llama_model
370374

375+
ctypedef struct common_params_speculative:
376+
common_speculative_type type # type of speculative decoding
377+
378+
# general-purpose speculative decoding parameters
379+
int32_t n_max # maximum number of tokens to draft during speculative decoding
380+
int32_t n_min # minimum number of draft tokens to use for speculative decoding
381+
float p_split # speculative decoding split probability
382+
float p_min # minimum speculative decoding probability (greedy)
383+
384+
# ngram-based speculative decoding
385+
uint16_t ngram_size_n # ngram size for lookup
386+
uint16_t ngram_size_m # mgram size for speculative tokens
387+
uint16_t ngram_check_rate # check rate for ngram lookup
388+
uint16_t ngram_min_hits # minimum hits at ngram/mgram lookup for mgram to be proposed
389+
# common_ngram_mod * ngram_mod # ngram modification (runtime only, filled according to ngram_size_n, not exposed to Python)
390+
391+
std_string lookup_cache_static # path of static ngram cache file for lookup decoding
392+
std_string lookup_cache_dynamic # path of dynamic ngram cache file for lookup decoding
393+
394+
# draft-model speculative decoding
395+
common_params_model mparams_dft # draft model parameters
396+
# llama_model * model_dft # a llama_model that can be shared by multiple speculative contexts (runtime only, not exposed to Python)
397+
# llama_context_params cparams_dft # parameters for the draft llama_context (runtime only, not exposed to Python)
398+
int32_t n_ctx # draft context size
399+
int32_t n_gpu_layers # number of layers to store in VRAM for the draft model (-1 - use default)
371400
ggml_type cache_type_k # KV cache data type for the K
372401
ggml_type cache_type_v # KV cache data type for the V
373-
374402
cpu_params cpuparams
375403
cpu_params cpuparams_batch
376-
common_params_model model
404+
std_vector[ggml_backend_dev_t] devices # devices to use for offloading
405+
std_vector[std_pair[std_string, std_string]] replacements # main to speculative model replacements
406+
std_vector[llama_model_tensor_buft_override] tensor_buft_overrides
377407

378408

379409
ctypedef struct common_params_vocoder:
@@ -465,8 +495,6 @@ cdef extern from "common.h":
465495
std_string path_prompt_cache # path to file for saving/loading prompt eval state
466496
std_string input_prefix # string to prefix user inputs with
467497
std_string input_suffix # string to suffix user inputs with
468-
std_string lookup_cache_static # path of static ngram cache file for lookup decoding
469-
std_string lookup_cache_dynamic # path of dynamic ngram cache file for lookup decoding
470498
std_string logits_file # file for saving *all* logits
471499

472500
# llama-debug specific options
@@ -525,7 +553,7 @@ cdef extern from "common.h":
525553

526554
bint input_prefix_bos # prefix BOS to user inputs, preceding input_prefix
527555
bint use_mmap # use mmap for faster loads
528-
bint use_direct_io # read from disk without buffering for faster model loading
556+
bint use_direct_io # read from disk without buffering
529557
bint use_mlock # use mlock to keep model in memory
530558
bint verbose_prompt # print prompt tokens before generation
531559
bint display_prompt # print prompt before generation

0 commit comments

Comments (0)