Skip to content

Commit f83e5fc

Browse files
Authored commit — ENH: Update llama.cpp to b7964 (#112)
1 parent 9eadc1f commit f83e5fc

File tree

10 files changed

+300
-97
lines changed

10 files changed

+300
-97
lines changed

src/llama.cpp/include/common.h

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,17 @@ enum common_params_sampling_config : uint64_t {
164164
COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
165165
};
166166

167+
enum common_speculative_type {
168+
COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
169+
COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
170+
COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
171+
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
172+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
173+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
174+
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
175+
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
176+
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
177+
};
167178

168179
// sampling parameters
169180
struct common_params_sampling {
@@ -242,25 +253,55 @@ struct common_params_model {
242253
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
243254
};
244255

256+
struct common_ngram_mod;
257+
245258
struct common_params_speculative {
246-
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
259+
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
247260

248-
int32_t n_ctx = 0; // draft context size
249-
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
250-
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
251-
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
252-
float p_split = 0.1f; // speculative decoding split probability
253-
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
254-
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
255-
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
261+
// general-purpose speculative decoding parameters
262+
263+
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
264+
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
265+
float p_split = 0.1f; // speculative decoding split probability
266+
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
267+
268+
// ngram-based speculative decoding
269+
270+
uint16_t ngram_size_n = 12; // ngram size for lookup
271+
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
272+
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
273+
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
274+
275+
std::shared_ptr<common_ngram_mod> ngram_mod;
276+
277+
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
278+
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
279+
280+
// draft-model speculative decoding
281+
282+
struct common_params_model mparams_dft;
283+
284+
llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
285+
286+
llama_context_params cparams_dft; // these are the parameters for the draft llama_context
287+
288+
int32_t n_ctx = 0; // draft context size
289+
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
256290

257291
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
258292
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
259293

260294
struct cpu_params cpuparams;
261295
struct cpu_params cpuparams_batch;
262296

263-
struct common_params_model model;
297+
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
298+
299+
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
300+
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
301+
302+
bool has_dft() const {
303+
return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
304+
}
264305
};
265306

266307
struct common_params_vocoder {
@@ -378,8 +419,6 @@ struct common_params {
378419
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
379420
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
380421
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
381-
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
382-
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
383422
std::string logits_file = ""; // file for saving *all* logits // NOLINT
384423

385424
// llama-debug specific options
@@ -438,7 +477,7 @@ struct common_params {
438477

439478
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
440479
bool use_mmap = true; // enable mmap to use filesystem cache
441-
bool use_direct_io = true; // read from disk without buffering for faster model loading
480+
bool use_direct_io = false; // read from disk without buffering
442481
bool use_mlock = false; // use mlock to keep model in memory
443482
bool verbose_prompt = false; // print prompt tokens before generation
444483
bool display_prompt = true; // print prompt before generation
@@ -575,10 +614,6 @@ struct common_params {
575614
// return false from callback to abort model loading or true to continue
576615
llama_progress_callback load_progress_callback = NULL;
577616
void * load_progress_callback_user_data = NULL;
578-
579-
bool has_speculative() const {
580-
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
581-
}
582617
};
583618

584619
// call once at the start of a program if it uses libcommon
@@ -714,8 +749,6 @@ struct common_init_result {
714749

715750
std::vector<llama_adapter_lora_ptr> & lora();
716751

717-
void free_context();
718-
719752
private:
720753
struct impl;
721754
std::unique_ptr<impl> pimpl;

src/llama.cpp/include/ggml.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
// This documentation is still a work in progress.
77
// If you wish some specific topics to be covered, feel free to drop a comment:
88
//
9-
// https://github.com/ggerganov/whisper.cpp/issues/40
9+
// https://github.com/ggml-org/whisper.cpp/issues/40
1010
//
1111
// ## Overview
1212
//

src/llama.cpp/include/llama.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ extern "C" {
309309
// Keep the booleans together to avoid misalignment during copy-by-value.
310310
bool vocab_only; // only load the vocabulary, no weights
311311
bool use_mmap; // use mmap if possible
312-
bool use_direct_io; // use direct io, takes precedence over use_mmap
312+
bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
313313
bool use_mlock; // force system to keep model in RAM
314314
bool check_tensors; // validate model tensor data
315315
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -1476,12 +1476,12 @@ extern "C" {
14761476
/// @details Build a split GGUF final path for this chunk.
14771477
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
14781478
// Returns the split_path length.
1479-
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
1479+
LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);
14801480

14811481
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
14821482
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
14831483
// Returns the split_prefix length.
1484-
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
1484+
LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
14851485

14861486
// Print system information
14871487
LLAMA_API const char * llama_print_system_info(void);

src/xllamacpp/server.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,4 +632,45 @@ void build_tensor_buffer_overrides(
632632
value = string_join(parts, ",");
633633
}
634634

635+
// Helper function to parse device list
636+
std::vector<ggml_backend_dev_t> parse_device_list(const std::string &value) {
637+
std::vector<ggml_backend_dev_t> devices;
638+
auto dev_names = string_split<std::string>(value, ',');
639+
if (dev_names.empty()) {
640+
throw std::invalid_argument("no devices specified");
641+
}
642+
if (dev_names.size() == 1 && dev_names[0] == "none") {
643+
devices.push_back(nullptr);
644+
} else {
645+
for (const auto &device : dev_names) {
646+
auto *dev = ggml_backend_dev_by_name(device.c_str());
647+
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
648+
throw std::invalid_argument(
649+
string_format("invalid device: %s", device.c_str()));
650+
}
651+
devices.push_back(dev);
652+
}
653+
devices.push_back(nullptr);
654+
}
655+
return devices;
656+
}
657+
658+
// Helper function to build device string from vector of ggml_backend_dev_t
659+
std::string
660+
build_device_string(const std::vector<ggml_backend_dev_t> &devices) {
661+
if (devices.empty()) {
662+
return "";
663+
}
664+
if (devices.size() == 1 && devices[0] == nullptr) {
665+
return "";
666+
}
667+
std::vector<std::string> names;
668+
for (size_t i = 0; i < devices.size() - 1; ++i) { // Skip the trailing nullptr
669+
if (devices[i]) {
670+
names.emplace_back(ggml_backend_dev_name(devices[i]));
671+
}
672+
}
673+
return string_join(names, ",");
674+
}
675+
635676
} // namespace xllamacpp

src/xllamacpp/server.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,6 @@ void parse_tensor_buffer_overrides(
5656
void build_tensor_buffer_overrides(
5757
const std::vector<llama_model_tensor_buft_override> &overrides,
5858
std::string &value);
59+
std::vector<ggml_backend_dev_t> parse_device_list(const std::string &value);
60+
std::string build_device_string(const std::vector<ggml_backend_dev_t> &devices);
5961
} // namespace xllamacpp

src/xllamacpp/server.pxd

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# distutils: language=c++
22

3-
from xllamacpp.xllamacpp cimport common_params, ggml_backend_dev_props, llama_model_tensor_buft_override
3+
from xllamacpp.xllamacpp cimport common_params, ggml_backend_dev_props, llama_model_tensor_buft_override, ggml_backend_dev_t
44
from libcpp cimport bool as c_bool
55
from libcpp.string cimport string as std_string
66
from libcpp.vector cimport vector as std_vector
@@ -41,3 +41,5 @@ cdef extern from "server.h" namespace "xllamacpp" nogil:
4141
const std_string & value, std_vector[llama_model_tensor_buft_override] & overrides) except +
4242
void c_build_tensor_buffer_overrides "xllamacpp::build_tensor_buffer_overrides" (
4343
const std_vector[llama_model_tensor_buft_override] & overrides, std_string & value) except +
44+
std_vector[ggml_backend_dev_t] c_parse_device_list "xllamacpp::parse_device_list" (const std_string & value) except +
45+
std_string c_build_device_string "xllamacpp::build_device_string" (const std_vector[ggml_backend_dev_t] & devices) except +

src/xllamacpp/xllamacpp.pxd

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# distutils: language=c++
22

3-
from libc.stdint cimport int32_t, uint32_t, int64_t, int8_t, uint64_t
3+
from libc.stdint cimport int32_t, uint32_t, int64_t, int8_t, uint64_t, uint16_t
44
from libcpp.string cimport string as std_string
55
from libcpp.vector cimport vector as std_vector
66
from libcpp.set cimport set as std_set
@@ -349,6 +349,18 @@ cdef extern from "common.h":
349349
bint backend_sampling
350350

351351

352+
cpdef enum common_speculative_type:
353+
COMMON_SPECULATIVE_TYPE_NONE # no speculative decoding
354+
COMMON_SPECULATIVE_TYPE_DRAFT # draft model
355+
COMMON_SPECULATIVE_TYPE_EAGLE3 # eagle draft model
356+
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE # simple self-speculative decoding
357+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K # self-speculative decoding with n-gram keys only
358+
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V # self-speculative decoding with n-gram keys and 4 m-gram values
359+
COMMON_SPECULATIVE_TYPE_NGRAM_MOD
360+
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE # self-speculative decoding with 3-level n-gram cache
361+
COMMON_SPECULATIVE_TYPE_COUNT # number of types, unknown type
362+
363+
352364
ctypedef struct common_params_model:
353365
std_string path # model local path // NOLINT
354366
std_string url # model url to download // NOLINT
@@ -357,23 +369,41 @@ cdef extern from "common.h":
357369
std_string docker_repo # Docker repo // NOLINT
358370
std_string name # in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
359371

360-
ctypedef struct common_params_speculative:
361-
std_vector[ggml_backend_dev_t] devices # devices to use for offloading
362-
int32_t n_ctx # draft context size
363-
int32_t n_max # maximum number of tokens to draft during speculative decoding
364-
int32_t n_min # minimum number of draft tokens to use for speculative decoding
365-
int32_t n_gpu_layers # number of layers to store in VRAM for the draft model (-1 - use default)
366-
float p_split # speculative decoding split probability
367-
float p_min # minimum speculative decoding probability (greedy)
368-
std_vector[std_pair[std_string, std_string]] replacements # main to speculative model replacements
369-
std_vector[llama_model_tensor_buft_override] tensor_buft_overrides
372+
ctypedef struct common_ngram_mod
373+
ctypedef struct llama_model
370374

375+
ctypedef struct common_params_speculative:
376+
common_speculative_type type # type of speculative decoding
377+
378+
# general-purpose speculative decoding parameters
379+
int32_t n_max # maximum number of tokens to draft during speculative decoding
380+
int32_t n_min # minimum number of draft tokens to use for speculative decoding
381+
float p_split # speculative decoding split probability
382+
float p_min # minimum speculative decoding probability (greedy)
383+
384+
# ngram-based speculative decoding
385+
uint16_t ngram_size_n # ngram size for lookup
386+
uint16_t ngram_size_m # mgram size for speculative tokens
387+
uint16_t ngram_check_rate # check rate for ngram lookup
388+
uint16_t ngram_min_hits # minimum hits at ngram/mgram lookup for mgram to be proposed
389+
# common_ngram_mod * ngram_mod # ngram modification (runtime only, filled according to ngram_size_n, not exposed to Python)
390+
391+
std_string lookup_cache_static # path of static ngram cache file for lookup decoding
392+
std_string lookup_cache_dynamic # path of dynamic ngram cache file for lookup decoding
393+
394+
# draft-model speculative decoding
395+
common_params_model mparams_dft # draft model parameters
396+
# llama_model * model_dft # a llama_model that can be shared by multiple speculative contexts (runtime only, not exposed to Python)
397+
# llama_context_params cparams_dft # parameters for the draft llama_context (runtime only, not exposed to Python)
398+
int32_t n_ctx # draft context size
399+
int32_t n_gpu_layers # number of layers to store in VRAM for the draft model (-1 - use default)
371400
ggml_type cache_type_k # KV cache data type for the K
372401
ggml_type cache_type_v # KV cache data type for the V
373-
374402
cpu_params cpuparams
375403
cpu_params cpuparams_batch
376-
common_params_model model
404+
std_vector[ggml_backend_dev_t] devices # devices to use for offloading
405+
std_vector[std_pair[std_string, std_string]] replacements # main to speculative model replacements
406+
std_vector[llama_model_tensor_buft_override] tensor_buft_overrides
377407

378408

379409
ctypedef struct common_params_vocoder:
@@ -465,8 +495,6 @@ cdef extern from "common.h":
465495
std_string path_prompt_cache # path to file for saving/loading prompt eval state
466496
std_string input_prefix # string to prefix user inputs with
467497
std_string input_suffix # string to suffix user inputs with
468-
std_string lookup_cache_static # path of static ngram cache file for lookup decoding
469-
std_string lookup_cache_dynamic # path of dynamic ngram cache file for lookup decoding
470498
std_string logits_file # file for saving *all* logits
471499

472500
# llama-debug specific options
@@ -525,7 +553,7 @@ cdef extern from "common.h":
525553

526554
bint input_prefix_bos # prefix BOS to user inputs, preceding input_prefix
527555
bint use_mmap # use mmap for faster loads
528-
bint use_direct_io # read from disk without buffering for faster model loading
556+
bint use_direct_io # read from disk without buffering
529557
bint use_mlock # use mlock to keep model in memory
530558
bint verbose_prompt # print prompt tokens before generation
531559
bint display_prompt # print prompt before generation

0 commit comments

Comments (0)