diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b5175a7f2..e8893d1e1 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -355,7 +355,9 @@ def get_embeddings_seq(self, seq_id: int):
 
     # Sampling functions - deprecated, use LlamaSampler instead
     def set_rng_seed(self, seed: int):
-        raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "set_rng_seed is deprecated, use LlamaSampler instead"
+        )
 
     def sample_repetition_penalties(
         self,
@@ -366,30 +368,44 @@ def sample_repetition_penalties(
         penalty_freq: float,
         penalty_present: float,
     ):
-        raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_repetition_penalties is deprecated, use LlamaSampler instead"
+        )
 
     def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
-        raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_softmax is deprecated, use LlamaSampler instead"
+        )
 
     def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
-        raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_top_k is deprecated, use LlamaSampler instead"
+        )
 
     def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_top_p is deprecated, use LlamaSampler instead"
+        )
 
     def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_min_p is deprecated, use LlamaSampler instead"
+        )
 
     def sample_typical(
         self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
     ):
-        raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_typical is deprecated, use LlamaSampler instead"
+        )
 
     def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
         raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")
 
     def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
-        raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_grammar is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_mirostat(
         self,
@@ -399,7 +415,9 @@ def sample_token_mirostat(
         m: int,
         mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
     ) -> int:
-        raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_mirostat is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_mirostat_v2(
         self,
@@ -408,17 +426,25 @@ def sample_token_mirostat_v2(
         eta: float,
         mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
     ) -> int:
-        raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
-        raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_greedy is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
-        raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token is deprecated, use LlamaSampler instead"
+        )
 
     # Grammar
     def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
-        raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "grammar_accept_token is deprecated, use LlamaSampler instead"
+        )
 
     def reset_timings(self):
         llama_cpp.llama_perf_context_reset(self.ctx)
@@ -529,6 +555,8 @@ def normalize_embedding(embedding):
     norm = float(np.linalg.norm(embedding))
     if norm == 0.0:
         return embedding
+    if isinstance(embedding, np.ndarray):
+        return embedding / norm
     return [v / norm for v in embedding]
 
 
@@ -602,16 +630,16 @@ def sample(
         logits_array: Optional[npt.NDArray[np.single]] = None,
     ):
         # This method is deprecated in favor of using LlamaSampler directly
-        raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead"
+        )
 
     def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
         self.prev.append(id)
 
 
 class CustomSampler:
-    def __init__(
-        self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
-    ):
+    def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]):
         self.apply_func = apply_func
 
     def apply_wrapper(
@@ -723,20 +751,20 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
     def add_grammar_lazy_patterns(
-        self, 
-        model: LlamaModel, 
+        self,
+        model: LlamaModel,
         grammar: LlamaGrammar,
         trigger_patterns: List[str],
-        trigger_tokens: List[int]
+        trigger_tokens: List[int],
     ):
         # Convert patterns to C array
         pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
         for i, pattern in enumerate(trigger_patterns):
             pattern_ptrs[i] = pattern.encode("utf-8")
-        
+
         # Convert tokens to C array
         token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)
-        
+
         sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
             model.vocab,
             grammar._grammar.encode("utf-8"),
@@ -744,7 +772,7 @@
             pattern_ptrs,
             len(trigger_patterns),
             token_array,
-            len(trigger_tokens)
+            len(trigger_tokens),
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
@@ -771,13 +799,13 @@ def add_dry(
         dry_base: float,
         dry_allowed_length: int,
         dry_penalty_last_n: int,
-        seq_breakers: List[str]
+        seq_breakers: List[str],
     ):
         # Convert seq_breakers to C array
         breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
         for i, breaker in enumerate(seq_breakers):
             breaker_ptrs[i] = breaker.encode("utf-8")
-        
+
         sampler = llama_cpp.llama_sampler_init_dry(
             model.vocab,
             n_ctx_train,
@@ -786,25 +814,19 @@
             dry_allowed_length,
             dry_penalty_last_n,
             breaker_ptrs,
-            len(seq_breakers)
+            len(seq_breakers),
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
-    def add_logit_bias(
-        self,
-        n_vocab: int,
-        logit_bias: Dict[int, float]
-    ):
+    def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]):
         # Convert logit_bias dict to C array
         bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
         for i, (token, bias) in enumerate(logit_bias.items()):
             bias_array[i].token = token
             bias_array[i].bias = bias
-        
+
         sampler = llama_cpp.llama_sampler_init_logit_bias(
-            n_vocab,
-            len(logit_bias),
-            bias_array
+            n_vocab, len(logit_bias), bias_array
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
@@ -838,15 +860,17 @@ def reset(self):
     def clone(self):
         # NOTE: Custom samplers cannot be cloned due to Python callback limitations
        if self.custom_samplers:
-            raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")
-        
+            raise NotImplementedError(
+                "Cannot clone LlamaSampler that contains custom samplers"
+            )
+
         cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
         # Create a new wrapper around the cloned sampler
         new_sampler = LlamaSampler.__new__(LlamaSampler)
         new_sampler.sampler = cloned_sampler
         new_sampler.custom_samplers = []
         new_sampler._exit_stack = ExitStack()
-        
+
         def free_sampler():
             if new_sampler.sampler is not None:
                 llama_cpp.llama_sampler_free(new_sampler.sampler)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94ebd8..91a15ac81 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -25,6 +25,8 @@
     Deque,
     Callable,
     Dict,
+    Tuple,
+    overload,
 )
 from collections import deque
 from pathlib import Path
@@ -75,6 +77,7 @@ def __init__(
         n_ctx: int = 512,
         n_batch: int = 512,
         n_ubatch: int = 512,
+        n_seq_max: int = 1,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
@@ -159,6 +162,7 @@ def __init__(
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
             n_ubatch: Physical batch size
+            n_seq_max: Maximum number of sequences (i.e. distinct states for recurrent models or parallel batches)
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -300,6 +304,8 @@ def __init__(
         self.model_params.kv_overrides = self._kv_overrides_array
 
         self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_ubatch = min(self.n_batch, n_ubatch)
+        self.n_seq_max = n_seq_max
         self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
         self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count()
 
@@ -310,7 +316,8 @@ def __init__(
         self.context_params = llama_cpp.llama_context_default_params()
         self.context_params.n_ctx = n_ctx
         self.context_params.n_batch = self.n_batch
-        self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
+        self.context_params.n_ubatch = self.n_ubatch
+        self.context_params.n_seq_max = self.n_seq_max
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
         self.context_params.rope_scaling_type = (
@@ -934,7 +941,8 @@ def generate(
 
                 sample_idx += 1
                 if stopping_criteria is not None and stopping_criteria(
-                    self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+                    self._input_ids[:sample_idx],
+                    self._scores[sample_idx - self.n_tokens, :],
                 ):
                     return
                 tokens_or_none = yield token
@@ -960,7 +968,10 @@ def generate(
                 )
 
     def create_embedding(
-        self, input: Union[str, List[str]], model: Optional[str] = None
+        self,
+        input: Union[str, List[str]],
+        model: Optional[str] = None,
+        return_numpy: bool = False,
     ) -> CreateEmbeddingResponse:
         """Embed a string.
 
@@ -975,9 +986,11 @@ def create_embedding(
         input = input if isinstance(input, list) else [input]
 
         # get numeric embeddings
-        embeds: Union[List[List[float]], List[List[List[float]]]]
+        embeds: Union[List[npt.NDArray[np.single]], List[List[List[float]]]]
         total_tokens: int
-        embeds, total_tokens = self.embed(input, return_count=True)  # type: ignore
+        embeds, total_tokens = self.embed(
+            input, return_count=True, return_numpy=return_numpy
+        )  # type: ignore
 
         # convert to CreateEmbeddingResponse
         data: List[Embedding] = [
@@ -999,13 +1012,103 @@ def create_embedding(
             },
         }
 
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[True] = True,
+    ) -> npt.NDArray[np.single]: ...
+
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[False] = False,
+    ) -> List[List[float]]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[True] = True,
+    ) -> List[npt.NDArray[np.single]]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[False] = False,
+    ) -> List[List[List[float]]]: ...
+
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[True] = True,
+    ) -> Tuple[npt.NDArray[np.single], int]: ...
+
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[False] = False,
+    ) -> Tuple[List[List[float]], int]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[True] = True,
+    ) -> Tuple[List[npt.NDArray[np.single]], int]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[False] = False,
+    ) -> Tuple[List[List[List[float]]], int]: ...
+
     def embed(
         self,
         input: Union[str, List[str]],
         normalize: bool = False,
         truncate: bool = True,
         return_count: bool = False,
-    ):
+        return_numpy: bool = False,
+    ) -> Union[
+        npt.NDArray[np.single],
+        List[List[float]],
+        List[npt.NDArray[np.single]],
+        List[List[List[float]]],
+        Tuple[npt.NDArray[np.single], int],
+        Tuple[List[List[float]], int],
+        Tuple[List[npt.NDArray[np.single]], int],
+        Tuple[List[List[List[float]]], int],
+    ]:
         """Embed a string.
 
         Args:
@@ -1050,20 +1153,21 @@ def decode_batch(seq_sizes: List[int]):
                 pos: int = 0
                 for i, size in enumerate(seq_sizes):
                     ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx)
-                    embedding: List[List[float]] = [
-                        ptr[pos + j * n_embd : pos + (j + 1) * n_embd]
-                        for j in range(size)
-                    ]
+                    # Convert full pointer to numpy array once (zero-copy)
+                    ptr_array = np.ctypeslib.as_array(ptr, shape=(size * n_embd,))
+                    # Reshape to 2D array: (n_tokens, n_embd)
+                    embedding = ptr_array.reshape(size, n_embd)
                     if normalize:
-                        embedding = [
-                            internals.normalize_embedding(e) for e in embedding
-                        ]
+                        # Normalize each token embedding using vectorized operations
+                        norms = np.linalg.norm(embedding, axis=1, keepdims=True)
+                        norms = np.where(norms == 0, 1, norms)  # Avoid division by zero
+                        embedding = embedding / norms
                     data.append(embedding)
                     pos += size
             else:
                 for i in range(len(seq_sizes)):
                     ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i)
-                    embedding: List[float] = ptr[:n_embd]
+                    embedding = np.ctypeslib.as_array(ptr, shape=(n_embd,))
                     if normalize:
                         embedding = internals.normalize_embedding(embedding)
                     data.append(embedding)
@@ -1110,7 +1214,20 @@ def decode_batch(seq_sizes: List[int]):
         if self.verbose:
             llama_cpp.llama_perf_context_print(self._ctx.ctx)
 
-        output = data[0] if isinstance(input, str) else data
+        output: Union[
+            npt.NDArray[np.single],
+            List[List[float]],
+            List[npt.NDArray[np.single]],
+            List[List[List[float]]],
+        ] = data[0] if isinstance(input, str) else data
+
+        if not return_numpy:
+            if isinstance(output, np.ndarray):
+                output = output.tolist()
+            elif isinstance(output, list) and all(
+                isinstance(x, np.ndarray) for x in output
+            ):
+                output = [x.tolist() for x in output]
 
         llama_cpp.llama_kv_self_clear(self._ctx.ctx)
         self.reset()
@@ -1157,9 +1274,9 @@ def _create_completion(
         bos_token_id: int = self.token_bos()
         cls_token_id: int = self._model.token_cls()
         sep_token_id: int = self._model.token_sep()
-        prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix
-        middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix
-        suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix
+        prefix_token_id: int = 0  # self._model.token_prefix() # TODO: Fix
+        middle_token_id: int = 0  # self._model.token_middle() # TODO: Fix
+        suffix_token_id: int = 0  # self._model.token_suffix() # TODO: Fix
         add_space_prefix: bool = (
             self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
         )
@@ -1315,7 +1432,7 @@ def logit_bias_processor(
         if seed is not None:
             self.set_seed(seed)
         else:
-            self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+            self.set_seed(random.Random(self._seed).randint(0, 2**32))
 
         finish_reason = "length"
         multibyte_fix = 0
@@ -2056,7 +2173,10 @@ def create_chat_completion_openai_v1(
             stream = kwargs.get("stream", False)  # type: ignore
             assert isinstance(stream, bool)
             if stream:
-                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+                return (
+                    ChatCompletionChunk(**chunk)
+                    for chunk in self.create_chat_completion(*args, **kwargs)
+                )  # type: ignore
             else:
                 return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
         except ImportError:
@@ -2080,8 +2200,9 @@ def __getstate__(self):
             # Context Params
             seed=self._seed,
             n_ctx=self.context_params.n_ctx,
-            n_batch=self.n_batch,
+            n_batch=self.context_params.n_batch,
             n_ubatch=self.context_params.n_ubatch,
+            n_seq_max=self.context_params.n_seq_max,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
@@ -2318,7 +2439,11 @@ def from_pretrained(
         if additional_files:
             for additonal_file_name in additional_files:
                 # find the additional shard file:
-                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+                matching_additional_files = [
+                    file
+                    for file in file_list
+                    if fnmatch.fnmatch(file, additonal_file_name)
+                ]
 
                 if len(matching_additional_files) == 0:
                     raise ValueError(
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 0a1a9f5ad..53e449da8 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -66,6 +66,7 @@ def llama_cpp_model_path():
 
 def test_real_model(llama_cpp_model_path):
     import os
+
     assert os.path.exists(llama_cpp_model_path)
 
     params = llama_cpp.llama_model_default_params()
@@ -114,6 +115,7 @@ def test_real_model(llama_cpp_model_path):
     output_text = model.detokenize(output, special=True)
     assert output_text == b" over the lazy dog"
 
+
 def test_real_llama(llama_cpp_model_path):
     model = llama_cpp.Llama(
         llama_cpp_model_path,
@@ -132,11 +134,10 @@ def test_real_llama(llama_cpp_model_path):
         top_k=50,
         top_p=0.9,
         temperature=0.8,
-        seed=1337
+        seed=1337,
     )
     assert output["choices"][0]["text"] == " over the lazy dog"
 
-
     output = model.create_completion(
         "The capital of france is paris, 'true' or 'false'?:\n",
         max_tokens=4,
@@ -146,20 +147,19 @@ def test_real_llama(llama_cpp_model_path):
         seed=1337,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "true" | "false"
-""")
+"""),
     )
     assert output["choices"][0]["text"] == "true"
 
     suffix = b"rot"
     tokens = model.tokenize(suffix, add_bos=True, special=True)
+
     def logit_processor_func(input_ids, logits):
         for token in tokens:
             logits[token] *= 1000
         return logits
 
-    logit_processors = llama_cpp.LogitsProcessorList(
-        [logit_processor_func]
-    )
+    logit_processors = llama_cpp.LogitsProcessorList([logit_processor_func])
 
     output = model.create_completion(
         "The capital of france is par",
@@ -168,7 +168,7 @@ def logit_processor_func(input_ids, logits):
         top_p=0.9,
         temperature=0.8,
         seed=1337,
-        logits_processor=logit_processors
+        logits_processor=logit_processors,
     )
     assert output["choices"][0]["text"].lower().startswith("rot")
 
@@ -184,7 +184,7 @@ def logit_processor_func(input_ids, logits):
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
     )
     number_1 = output["choices"][0]["text"]
 
@@ -196,7 +196,7 @@
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
    )
     number_2 = output["choices"][0]["text"]
 
@@ -210,7 +210,7 @@
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
     )
     number_3 = output["choices"][0]["text"]
 
@@ -228,7 +228,32 @@ def test_real_llama_embeddings(llama_cpp_model_path):
         n_threads_batch=multiprocessing.cpu_count(),
         logits_all=False,
         flash_attn=True,
-        embedding=True
+        embedding=True,
     )
     # Smoke test for now
     model.embed("Hello World")
+
+
+def test_embed_numpy(llama_cpp_model_path: str):
+    model = llama_cpp.Llama(
+        llama_cpp_model_path,
+        embedding=True,
+        verbose=False,
+        n_seq_max=16,  # Enable batch embeddings
+    )
+    # Test single input
+    embedding_numpy = model.embed("Hello, world!", return_numpy=True)
+    assert isinstance(embedding_numpy, np.ndarray)
+    embedding_list = model.embed("Hello, world!", return_numpy=False)
+    assert isinstance(embedding_list, list)
+    # Test batch input
+    embeddings_numpy = model.embed(
+        ["Hello, world!", "Goodbye, world!"], return_numpy=True
+    )
+    assert isinstance(embeddings_numpy, list)
+    assert all(isinstance(e, np.ndarray) for e in embeddings_numpy)
+    embeddings_list = model.embed(
+        ["Hello, world!", "Goodbye, world!"], return_numpy=False
+    )
+    assert isinstance(embeddings_list, list)
+    assert all(isinstance(e, list) for e in embeddings_list)