@@ -164,6 +164,17 @@ enum common_params_sampling_config : uint64_t {
164164 COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11 ,
165165};
166166
// Selects the speculative-decoding strategy used at generation time.
enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,     // self-speculative decoding backed by common_ngram_mod (presumably a modulo-hashed n-gram table — TODO confirm)
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,   // self-speculative decoding with 3-level n-gram cache
    COMMON_SPECULATIVE_TYPE_COUNT          // number of types, unknown type
};
167178
168179// sampling parameters
169180struct common_params_sampling {
@@ -242,25 +253,55 @@ struct common_params_model {
242253 std::string name = " " ; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
243254};
244255
256+ struct common_ngram_mod ;
257+
245258struct common_params_speculative {
246- std::vector< ggml_backend_dev_t > devices ; // devices to use for offloading
259+ common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE ; // type of speculative decoding
247260
248- int32_t n_ctx = 0 ; // draft context size
249- int32_t n_max = 16 ; // maximum number of tokens to draft during speculative decoding
250- int32_t n_min = 0 ; // minimum number of draft tokens to use for speculative decoding
251- int32_t n_gpu_layers = -1 ; // number of layers to store in VRAM for the draft model (-1 - use default)
252- float p_split = 0 .1f ; // speculative decoding split probability
253- float p_min = 0 .75f ; // minimum speculative decoding probability (greedy)
254- std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
255- std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
261+ // general-purpose speculative decoding parameters
262+
263+ int32_t n_max = 16 ; // maximum number of tokens to draft during speculative decoding
264+ int32_t n_min = 0 ; // minimum number of draft tokens to use for speculative decoding
265+ float p_split = 0 .1f ; // speculative decoding split probability
266+ float p_min = 0 .75f ; // minimum speculative decoding probability (greedy)
267+
268+ // ngram-based speculative decoding
269+
270+ uint16_t ngram_size_n = 12 ; // ngram size for lookup
271+ uint16_t ngram_size_m = 48 ; // mgram size for speculative tokens
272+ uint16_t ngram_check_rate = 1 ; // check rate for ngram lookup
273+ uint16_t ngram_min_hits = 1 ; // minimum hits at ngram/mgram lookup for mgram to be proposed
274+
275+ std::shared_ptr<common_ngram_mod> ngram_mod;
276+
277+ std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
278+ std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
279+
280+ // draft-model speculative decoding
281+
282+ struct common_params_model mparams_dft;
283+
284+ llama_model * model_dft = nullptr ; // a llama_model that can be shared by multiple speculative contexts
285+
286+ llama_context_params cparams_dft; // these are the parameters for the draft llama_context
287+
288+ int32_t n_ctx = 0 ; // draft context size
289+ int32_t n_gpu_layers = -1 ; // number of layers to store in VRAM for the draft model (-1 - use default)
256290
257291 ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
258292 ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
259293
260294 struct cpu_params cpuparams;
261295 struct cpu_params cpuparams_batch;
262296
263- struct common_params_model model;
297+ std::vector<ggml_backend_dev_t > devices; // devices to use for offloading
298+
299+ std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
300+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
301+
302+ bool has_dft () const {
303+ return !mparams_dft.path .empty () || !mparams_dft.hf_repo .empty ();
304+ }
264305};
265306
266307struct common_params_vocoder {
@@ -378,8 +419,6 @@ struct common_params {
378419 std::string path_prompt_cache = " " ; // path to file for saving/loading prompt eval state // NOLINT
379420 std::string input_prefix = " " ; // string to prefix user inputs with // NOLINT
380421 std::string input_suffix = " " ; // string to suffix user inputs with // NOLINT
381- std::string lookup_cache_static = " " ; // path of static ngram cache file for lookup decoding // NOLINT
382- std::string lookup_cache_dynamic = " " ; // path of dynamic ngram cache file for lookup decoding // NOLINT
383422 std::string logits_file = " " ; // file for saving *all* logits // NOLINT
384423
385424 // llama-debug specific options
@@ -438,7 +477,7 @@ struct common_params {
438477
439478 bool input_prefix_bos = false ; // prefix BOS to user inputs, preceding input_prefix
440479 bool use_mmap = true ; // enable mmap to use filesystem cache
441- bool use_direct_io = true ; // read from disk without buffering for faster model loading
480+ bool use_direct_io = false ; // read from disk without buffering
442481 bool use_mlock = false ; // use mlock to keep model in memory
443482 bool verbose_prompt = false ; // print prompt tokens before generation
444483 bool display_prompt = true ; // print prompt before generation
@@ -575,10 +614,6 @@ struct common_params {
575614 // return false from callback to abort model loading or true to continue
576615 llama_progress_callback load_progress_callback = NULL ;
577616 void * load_progress_callback_user_data = NULL ;
578-
579- bool has_speculative () const {
580- return !speculative.model .path .empty () || !speculative.model .hf_repo .empty ();
581- }
582617};
583618
584619// call once at the start of a program if it uses libcommon
@@ -714,8 +749,6 @@ struct common_init_result {
714749
715750 std::vector<llama_adapter_lora_ptr> & lora ();
716751
717- void free_context ();
718-
719752private:
720753 struct impl ;
721754 std::unique_ptr<impl> pimpl;
0 commit comments