@@ -214,54 +214,55 @@ def __init__(
         model_path: str,
         *,
         # NOTE: These parameters are likely to change in the future.
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
-        n_parts: int = -1,
+        n_batch: int = 512,
         n_gpu_layers: int = 0,
-        seed: int = 1337,
+        main_gpu: int = 0,
+        tensor_split: Optional[List[float]] = None,
+        rope_freq_base: float = 10000.0,
+        rope_freq_scale: float = 1.0,
+        low_vram: bool = False,
+        mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
         embedding: bool = False,
         n_threads: Optional[int] = None,
-        n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
-        low_vram: bool = False,
-        tensor_split: Optional[List[float]] = None,
-        rope_freq_base: float = 10000.0,
-        rope_freq_scale: float = 1.0,
-        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
-        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
-        mul_mat_q: Optional[bool] = None,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
242240 """Load a llama.cpp model from `model_path`.
243241
244242 Args:
245243 model_path: Path to the model.
246- n_ctx: Maximum context size.
247- n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
248244 seed: Random seed. -1 for random.
245+ n_ctx: Maximum context size.
246+ n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
249247 n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
248+ main_gpu: Main GPU to use.
249+ tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split.
250+ rope_freq_base: Base frequency for rope sampling.
251+ rope_freq_scale: Scale factor for rope sampling.
252+ low_vram: Use low VRAM mode.
+            mul_mat_q: If True, use experimental mul_mat_q kernels.
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary, no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split.
-            rope_freq_base: Base frequency for rope sampling.
-            rope_freq_scale: Scale factor for rope sampling.
             verbose: Print verbose output to stderr.
+            kwargs: Unused keyword arguments (for additional backwards compatibility).

         Raises:
             ValueError: If the model path does not exist.
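
For orientation, here is a minimal usage sketch of the reworked signature. It assumes this `__init__` belongs to the `Llama` class exported by `llama_cpp`; the model path and split values below are placeholders, not anything taken from this commit:

from llama_cpp import Llama  # assumed import; the diff only shows __init__

llm = Llama(
    model_path="./models/7B/model.bin",  # placeholder path
    seed=-1,                  # -1 requests a random seed (default is LLAMA_DEFAULT_SEED)
    n_ctx=2048,
    n_batch=512,
    n_gpu_layers=-1,          # -1 offloads all layers
    main_gpu=0,
    tensor_split=[0.6, 0.4],  # optional multi-GPU split; None leaves the model unsplit
    rope_freq_base=10000.0,
    rope_freq_scale=1.0,
)
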
@@ -274,16 +275,20 @@ def __init__(
         self.model_path = model_path

         self.params = llama_cpp.llama_context_default_params()
+        self.params.seed = seed
         self.params.n_ctx = n_ctx
         self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.seed = seed
+        self.params.main_gpu = main_gpu
+        self.params.rope_freq_base = rope_freq_base
+        self.params.rope_freq_scale = rope_freq_scale
+        self.params.low_vram = low_vram
+        self.params.mul_mat_q = mul_mat_q
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
         self.params.vocab_only = vocab_only
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
-        self.params.low_vram = low_vram

         self.tensor_split = tensor_split
         self._p_tensor_split = None
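
The context line above keeps the `-1` sentinel handling for `n_gpu_layers`: it is widened to INT32 max before it reaches the C-side params struct. A small standalone sketch of that conversion follows; the helper name is illustrative, not part of the commit:

INT32_MAX = 0x7FFFFFFF  # llama.cpp reads this as "offload every layer"

def resolve_n_gpu_layers(n_gpu_layers: int) -> int:
    # Hypothetical helper mirroring the inline expression in __init__:
    # -1 means "all layers"; any other value passes through unchanged.
    return INT32_MAX if n_gpu_layers == -1 else n_gpu_layers

assert resolve_n_gpu_layers(-1) == 0x7FFFFFFF
assert resolve_n_gpu_layers(35) == 35
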
@@ -296,12 +301,6 @@ def __init__(
             )  # keep a reference to the array so it is not gc'd
             self.params.tensor_split = self._c_tensor_split

-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-
-
-        if mul_mat_q is not None:
-            self.params.mul_mat_q = mul_mat_q

         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
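
The surviving `tensor_split` context lines keep a Python-side reference to the ctypes array handed to `params.tensor_split`. A minimal sketch of that pattern with plain `ctypes`; the two-way split is a made-up example:

import ctypes

tensor_split = [0.6, 0.4]  # hypothetical per-GPU proportions

# Build a C float array from the Python list. If nothing on the Python side
# keeps a reference, the array can be garbage-collected while the C struct
# still points at its memory; that is why the real code stores it on `self`.
_c_tensor_split = (ctypes.c_float * len(tensor_split))(*tensor_split)

print(list(_c_tensor_split))  # roughly [0.6, 0.4], as 32-bit floats

The same hunk also clamps the batch size with `min(n_ctx, n_batch)`, so a prompt batch can never exceed the context window.
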
@@ -313,10 +312,6 @@ def __init__(
         self.lora_base = lora_base
         self.lora_path = lora_path

-        ### DEPRECATED ###
-        self.n_parts = n_parts
-        ### DEPRECATED ###
-
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")

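
With `n_parts`, `n_gqa`, and `rms_norm_eps` removed, old call sites that still pass them are absorbed by `**kwargs` (documented above as unused, kept for backwards compatibility) rather than raising a `TypeError`. A hedged sketch of that behaviour using a stand-in function; the names and values are illustrative only:

# Stand-in with the same "swallow unknown keywords" shape as the new __init__.
def load_model(model_path: str, *, n_ctx: int = 512, **kwargs):
    if kwargs:
        # The real __init__ simply ignores these; printing just makes the
        # backwards-compatibility behaviour visible in this sketch.
        print("ignoring legacy kwargs:", sorted(kwargs))
    return model_path, n_ctx

# A call written against the old signature keeps working:
load_model("model.bin", n_ctx=2048, n_parts=-1, n_gqa=8, rms_norm_eps=1e-5)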