@@ -103,8 +103,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
     int32_t n_prev = 64; // number of previous tokens to remember
@@ -155,20 +155,30 @@ struct common_sampler_params {
     std::string print() const;
 };
 
+struct common_params_speculative {
+    int32_t n_ctx = 4096;      // draft context size
+    int32_t n_max = 5;         // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 0;         // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float p_split = 0.1f;      // speculative decoding split probability
+    float p_min = 0.9f;        // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
-    int32_t n_draft_min = 0; // minimum number of draft tokens to use for speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
@@ -185,8 +195,6 @@ struct common_params {
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -198,10 +206,10 @@ struct common_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 
     std::string model = ""; // model path // NOLINT
-    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
     std::string model_alias = "unknown"; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
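
Below is a minimal C++ sketch of how call sites would migrate under this refactor. It is not part of the diff: `print_speculative_config` is a hypothetical helper and the `"common.h"` include path is an assumption; only the struct and field names come from the declarations above.

```cpp
#include "common.h" // assumed path of the header declaring common_params

#include <cstdio>

// Hypothetical helper: the draft-model settings formerly spread across
// common_params (n_draft, n_draft_min, p_split, n_gpu_layers_draft,
// model_draft) are now grouped under params.speculative, and the sampler
// settings move from params.sparams to params.sampling.
static void print_speculative_config(const common_params & params) {
    std::printf("draft model : %s\n", params.speculative.model.c_str());  // was params.model_draft
    std::printf("n_max/n_min : %d/%d\n",
            params.speculative.n_max,                                     // was params.n_draft
            params.speculative.n_min);                                    // was params.n_draft_min
    std::printf("p_split     : %.2f\n", params.speculative.p_split);      // was params.p_split
    std::printf("gpu layers  : %d\n",  params.speculative.n_gpu_layers);  // was params.n_gpu_layers_draft
    std::printf("seed        : %u\n",  params.sampling.seed);             // was params.sparams.seed
}
```

A side benefit of the grouping: the whole speculative-decoding configuration can now be handed to a draft-model loader as a single `common_params_speculative` value instead of five loose members of `common_params`.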