 
 from . import llama_cpp
 from .llama_types import *
+from .llama_grammar import LlamaGrammar
 
 import numpy as np
 import numpy.typing as npt
 
+from .utils import suppress_stdout_stderr
 
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
@@ -231,7 +233,8 @@ def __init__(
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
         n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
-        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
+        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
+        mul_mat_q: Optional[bool] = None,  # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -241,6 +244,7 @@ def __init__(
             n_ctx: Maximum context size.
             n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
             seed: Random seed. -1 for random.
+            n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary, no weights.
@@ -269,7 +273,7 @@ def __init__(
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
-        self.params.n_gpu_layers = n_gpu_layers
+        self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -280,7 +284,7 @@ def __init__(
         self.params.low_vram = low_vram
 
         self.tensor_split = tensor_split
-        self._c_tensor_split = None
+        self._p_tensor_split = None
 
         if self.tensor_split is not None:
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
@@ -299,6 +303,9 @@ def __init__(
         if rms_norm_eps is not None:
             self.params.rms_norm_eps = rms_norm_eps
 
+        if mul_mat_q is not None:
+            self.params.mul_mat_q = mul_mat_q
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -316,12 +323,25 @@ def __init__(
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")
 
-        self.model = llama_cpp.llama_load_model_from_file(
-            self.model_path.encode("utf-8"), self.params
-        )
+        if verbose:
+            self.model = llama_cpp.llama_load_model_from_file(
+                self.model_path.encode("utf-8"), self.params
+            )
+        else:
+            with suppress_stdout_stderr():
+                self.model = llama_cpp.llama_load_model_from_file(
+                    self.model_path.encode("utf-8"), self.params
+                )
         assert self.model is not None
 
-        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+        if verbose:
+            self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
+        else:
+            with suppress_stdout_stderr():
+                print("here")
+                self.ctx = llama_cpp.llama_new_context_with_model(
+                    self.model, self.params
+                )
 
         assert self.ctx is not None
 
@@ -358,8 +378,8 @@ def __init__(
             sorted=sorted,
         )
         self._candidates = candidates
-        self._token_nl = Llama.token_nl()
-        self._token_eos = Llama.token_eos()
+        self._token_nl = self.token_nl()
+        self._token_eos = self.token_eos()
         self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc)  # type: ignore
         self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single)
 
@@ -437,10 +457,14 @@ def detokenize(self, tokens: List[int]) -> bytes:
         """
         assert self.ctx is not None
         output = b""
+        buffer_size = 32
+        buffer = (ctypes.c_char * buffer_size)()
         for token in tokens:
-            output += llama_cpp.llama_token_to_str(
-                self.ctx, llama_cpp.llama_token(token)
+            n = llama_cpp.llama_token_to_str(
+                self.ctx, llama_cpp.llama_token(token), buffer, buffer_size
             )
+            assert n <= buffer_size
+            output += bytes(buffer[:n])
         return output
 
     def set_cache(self, cache: Optional[BaseLlamaCache]):
@@ -506,6 +530,7 @@ def _sample(
         mirostat_eta: llama_cpp.c_float,
         penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
     ):
         assert self.ctx is not None
         assert self.n_tokens > 0
@@ -548,8 +573,16 @@ def _sample(
         )
         if not penalize_nl:
             candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit)
+
+        if grammar is not None:
+            llama_cpp.llama_sample_grammar(
+                ctx=self.ctx,
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
+                grammar=grammar.grammar,
+            )
+
         if temp.value == 0.0:
-            return llama_cpp.llama_sample_token_greedy(
+            id = llama_cpp.llama_sample_token_greedy(
                 ctx=self.ctx,
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
             )
@@ -561,7 +594,7 @@ def _sample(
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                 temp=temp,
             )
-            return llama_cpp.llama_sample_token_mirostat(
+            id = llama_cpp.llama_sample_token_mirostat(
                 ctx=self.ctx,
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                 tau=mirostat_tau,
@@ -576,7 +609,7 @@ def _sample(
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                 temp=temp,
             )
-            return llama_cpp.llama_sample_token_mirostat_v2(
+            id = llama_cpp.llama_sample_token_mirostat_v2(
                 ctx=self.ctx,
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                 tau=mirostat_tau,
@@ -613,10 +646,17 @@ def _sample(
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                 temp=temp,
             )
-            return llama_cpp.llama_sample_token(
+            id = llama_cpp.llama_sample_token(
                 ctx=self.ctx,
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
             )
+        if grammar is not None:
+            llama_cpp.llama_grammar_accept_token(
+                ctx=self.ctx,
+                grammar=grammar.grammar,
+                token=llama_cpp.ctypes.c_int(id),
+            )
+        return id
 
     def sample(
         self,
@@ -632,6 +672,7 @@ def sample(
         mirostat_tau: float = 5.0,
         penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
     ):
         """Sample a token from the model.
 
@@ -665,6 +706,7 @@ def sample(
             mirostat_eta=llama_cpp.c_float(mirostat_eta),
             penalize_nl=penalize_nl,
             logits_processor=logits_processor,
+            grammar=grammar,
         )
 
     def generate(
@@ -683,6 +725,7 @@ def generate(
         mirostat_eta: float = 0.1,
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
+        grammar: Optional[LlamaGrammar] = None,
     ) -> Generator[int, Optional[Sequence[int]], None]:
         """Create a generator of tokens from a prompt.
 
@@ -704,7 +747,6 @@ def generate(
             The generated tokens.
         """
         assert self.ctx is not None
-
         if reset and len(self._input_ids) > 0:
             longest_prefix = 0
             for a, b in zip(self._input_ids, tokens[:-1]):
@@ -722,6 +764,9 @@ def generate(
         if reset:
             self.reset()
 
+        if grammar is not None:
+            grammar.reset()
+
         while True:
             self.eval(tokens)
             token = self.sample(
@@ -736,6 +781,7 @@ def generate(
                 mirostat_tau=mirostat_tau,
                 mirostat_eta=mirostat_eta,
                 logits_processor=logits_processor,
+                grammar=grammar,
             )
             if stopping_criteria is not None and stopping_criteria(
                 self._input_ids, self._scores[-1, :]
@@ -838,6 +884,7 @@ def _create_completion(
         model: Optional[str] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
     ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
         assert self.ctx is not None
 
@@ -915,6 +962,7 @@ def _create_completion(
             repeat_penalty=repeat_penalty,
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
+            grammar=grammar,
         ):
             if token == self._token_eos:
                 text = self.detokenize(completion_tokens)
@@ -965,9 +1013,7 @@ def _create_completion(
                 for token in remaining_tokens:
                     token_end_position += len(self.detokenize([token]))
                     # Check if stop sequence is in the token
-                    if token_end_position >= (
-                        remaining_length - first_stop_position
-                    ):
+                    if token_end_position >= (remaining_length - first_stop_position):
                         break
                     logprobs_or_none: Optional[CompletionLogprobs] = None
                     if logprobs is not None:
@@ -1261,6 +1307,7 @@ def create_completion(
         model: Optional[str] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
 
@@ -1305,6 +1352,7 @@ def create_completion(
             model=model,
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
+            grammar=grammar
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks
@@ -1334,6 +1382,7 @@ def __call__(
         model: Optional[str] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
 
@@ -1378,6 +1427,7 @@ def __call__(
             model=model,
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
+            grammar=grammar,
         )
 
     def _convert_text_completion_to_chat(
@@ -1460,6 +1510,7 @@ def create_chat_completion(
         mirostat_eta: float = 0.1,
         model: Optional[str] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         """Generate a chat completion from a list of messages.
 
@@ -1502,6 +1553,7 @@ def create_chat_completion(
             mirostat_eta=mirostat_eta,
             model=model,
             logits_processor=logits_processor,
+            grammar=grammar,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
@@ -1511,10 +1563,10 @@
         return self._convert_text_completion_to_chat(completion)
 
     def __del__(self):
-        if self.model is not None:
+        if hasattr(self, "model") and self.model is not None:
             llama_cpp.llama_free_model(self.model)
             self.model = None
-        if self.ctx is not None:
+        if hasattr(self, "ctx") and self.ctx is not None:
             llama_cpp.llama_free(self.ctx)
             self.ctx = None
 
@@ -1638,20 +1690,20 @@ def tokenizer(self) -> "LlamaTokenizer":
         assert self.ctx is not None
         return LlamaTokenizer(self)
 
-    @staticmethod
-    def token_eos() -> int:
+    def token_eos(self) -> int:
         """Return the end-of-sequence token."""
-        return llama_cpp.llama_token_eos()
+        assert self.ctx is not None
+        return llama_cpp.llama_token_eos(self.ctx)
 
-    @staticmethod
-    def token_bos() -> int:
+    def token_bos(self) -> int:
         """Return the beginning-of-sequence token."""
-        return llama_cpp.llama_token_bos()
+        assert self.ctx is not None
+        return llama_cpp.llama_token_bos(self.ctx)
 
-    @staticmethod
-    def token_nl() -> int:
+    def token_nl(self) -> int:
         """Return the newline token."""
-        return llama_cpp.llama_token_nl()
+        assert self.ctx is not None
+        return llama_cpp.llama_token_nl(self.ctx)
 
     @staticmethod
     def logits_to_logprobs(logits: List[float]) -> List[float]:
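
Usage sketch (not part of the commit): a minimal example of how the `grammar` parameter wired through above might be called, assuming `LlamaGrammar.from_string` parses a GBNF grammar string; the model path and grammar text are placeholders.

    from llama_cpp import Llama, LlamaGrammar

    # Hypothetical GBNF grammar that only allows the answers "yes" or "no".
    grammar_text = 'root ::= "yes" | "no"'

    llm = Llama(model_path="./models/llama-2-7b.ggmlv3.q4_0.bin")  # placeholder path
    grammar = LlamaGrammar.from_string(grammar_text)  # assumed constructor

    # llama_sample_grammar filters candidate tokens before sampling and
    # llama_grammar_accept_token advances the grammar state after each token
    # (see _sample above), so the completion is constrained to the grammar.
    output = llm("Answer yes or no: is water wet?", grammar=grammar, max_tokens=8)
    print(output["choices"][0]["text"])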