diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index 255ee67d..1910864b 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -48,7 +48,7 @@ jobs: # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$' - name: Upload artifact - uses: actions/upload-pages-artifact@v3 + uses: actions/upload-pages-artifact@v4 with: # Upload entire repository path: 'index' diff --git a/.gitmodules b/.gitmodules index 7edf0975..5cc3e080 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = http://github.com/inference-sh/llama.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b..97b46852 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21) project(llama_cpp) option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) -option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON) +option(MTMD_BUILD "Build multimodal (mtmd) shared library and install alongside python package" ON) function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) @@ -143,7 +143,7 @@ if (LLAMA_BUILD) ) endif() - if (LLAVA_BUILD) + if (MTMD_BUILD) if (LLAMA_CUBLAS OR LLAMA_CUDA) add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_USE_CUDA) @@ -153,7 +153,7 @@ if (LLAMA_BUILD) add_compile_definitions(GGML_USE_METAL) endif() - # Building llava + # Building multimodal support using mtmd add_subdirectory(vendor/llama.cpp/tools/mtmd) if (WIN32) diff --git a/examples/notebooks/Batching.ipynb b/examples/notebooks/Batching.ipynb index be7fe9b5..b1992e9d 100644 --- a/examples/notebooks/Batching.ipynb +++ b/examples/notebooks/Batching.ipynb @@ -230,7 +230,7 @@ "outputs": [], "source": [ "for i in range(n_parallel):\n", - " llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)" + " llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)" ] }, { diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index e88ed387..032e9835 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -128,4 +128,4 @@ def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCD ... -byref = _byref if TYPE_CHECKING else ctypes.byref +byref = _byref if TYPE_CHECKING else ctypes.byref \ No newline at end of file diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 18d73348..3dc7a67f 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -2,6 +2,7 @@ import os import ctypes +from enum import Enum from typing import ( Dict, @@ -26,7 +27,13 @@ # Python wrappers over llama.h structs - +class LlamaBackendDev(Enum): + # CPU device using system memory + CPU = 0 + # GPU device using dedicated memory + GPU = 1 + # accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) + ACCEL = 2 class LlamaModel: """Intermediate Python wrapper for a llama.cpp llama_model. @@ -95,7 +102,13 @@ def n_ctx_train(self) -> int: return llama_cpp.llama_model_n_ctx_train(self.model) def n_embd(self) -> int: - return llama_cpp.llama_model_n_embd(self.model) + return llama_cpp.llama_n_embd(self.model) + + def n_layer(self) -> int: + return llama_cpp.llama_n_layer(self.model) + + def dev_layer(self, il: int) -> LlamaBackendDev: + return LlamaBackendDev(llama_cpp.llama_model_dev_layer(self.model, il)) def rope_freq_scale_train(self) -> float: return llama_cpp.llama_model_rope_freq_scale_train(self.model) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cdc05c7a..fb9df956 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -299,65 +299,9 @@ def __init__( ].key = b"\0" # ensure sentinel element is zeroed self.model_params.kv_overrides = self._kv_overrides_array - self.n_batch = min(n_ctx, n_batch) # ??? - self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) - self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() - # Used by the sampler self._seed = seed or llama_cpp.LLAMA_DEFAULT_SEED - # Context Params - self.context_params = llama_cpp.llama_context_default_params() - self.context_params.n_ctx = n_ctx - self.context_params.n_batch = self.n_batch - self.context_params.n_ubatch = min(self.n_batch, n_ubatch) - self.context_params.n_threads = self.n_threads - self.context_params.n_threads_batch = self.n_threads_batch - self.context_params.rope_scaling_type = ( - rope_scaling_type - if rope_scaling_type is not None - else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED - ) - self.context_params.pooling_type = pooling_type - self.context_params.rope_freq_base = ( - rope_freq_base if rope_freq_base != 0.0 else 0 - ) - self.context_params.rope_freq_scale = ( - rope_freq_scale if rope_freq_scale != 0.0 else 0 - ) - self.context_params.yarn_ext_factor = ( - yarn_ext_factor if yarn_ext_factor != 0.0 else 0 - ) - self.context_params.yarn_attn_factor = ( - yarn_attn_factor if yarn_attn_factor != 0.0 else 0 - ) - self.context_params.yarn_beta_fast = ( - yarn_beta_fast if yarn_beta_fast != 0.0 else 0 - ) - self.context_params.yarn_beta_slow = ( - yarn_beta_slow if yarn_beta_slow != 0.0 else 0 - ) - self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - self._logits_all = logits_all if draft_model is None else True - self.context_params.embeddings = embedding # TODO: Rename to embeddings - self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn - - if op_offloat is not None: - self.context_params.op_offloat = op_offloat - - if swa_full is not None: - self.context_params.swa_full = swa_full - - # KV cache quantization - if type_k is not None: - self.context_params.type_k = type_k - if type_v is not None: - self.context_params.type_v = type_v - # Sampling Params - self.context_params.no_perf = no_perf - self.last_n_tokens_size = last_n_tokens_size - self.cache: Optional[BaseLlamaCache] = None self.lora_base = lora_base @@ -378,39 +322,45 @@ def __init__( ) ) ) - - # Override tokenizer + + self.draft_model = draft_model + + # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) + + self._n_vocab = self.n_vocab() - # Set the default value for the context and correct the batch - if n_ctx == 0: - n_ctx = self._model.n_ctx_train() - self.n_batch = min(n_ctx, n_batch) - self.context_params.n_ctx = self._model.n_ctx_train() - self.context_params.n_batch = self.n_batch - self.context_params.n_ubatch = min(self.n_batch, n_ubatch) - - self._ctx = self._stack.enter_context( - contextlib.closing( - internals.LlamaContext( - model=self._model, - params=self.context_params, - verbose=self.verbose, - ) - ) - ) + self._token_nl = self.token_nl() + self._token_eos = self.token_eos() - self._batch = self._stack.enter_context( - contextlib.closing( - internals.LlamaBatch( - n_tokens=self.n_batch, - embd=0, - n_seq_max=self.context_params.n_ctx, - verbose=self.verbose, - ) - ) + self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) + # Context Params + self._create_context( + n_ctx=n_ctx, + n_batch=n_batch, + n_ubatch=min(n_batch, n_ubatch), + n_threads=n_threads, + n_threads_batch=n_threads_batch, + rope_scaling_type=rope_scaling_type, + pooling_type=pooling_type, + rope_freq_base=rope_freq_base, + rope_freq_scale=rope_freq_scale, + yarn_ext_factor=yarn_ext_factor, + yarn_attn_factor=yarn_attn_factor, + yarn_beta_fast=yarn_beta_fast, + yarn_beta_slow=yarn_beta_slow, + yarn_orig_ctx=yarn_orig_ctx, + logits_all=logits_all, + embedding=embedding, + offload_kqv=offload_kqv, + flash_attn=flash_attn, + no_perf=no_perf, + last_n_tokens_size=last_n_tokens_size, + type_k=type_k, + type_v=type_v, ) + self._lora_adapter: Optional[llama_cpp.llama_adapter_lora_p] = None if self.lora_path: @@ -447,22 +397,6 @@ def free_lora_adapter(): str, llama_chat_format.LlamaChatCompletionHandler ] = {} - self.draft_model = draft_model - - self._n_vocab = self.n_vocab() - self._n_ctx = self.n_ctx() - - self._token_nl = self.token_nl() - self._token_eos = self.token_eos() - - self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) - - self.n_tokens = 0 - self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) - self.scores: npt.NDArray[np.single] = np.ndarray( - (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single - ) - self._mirostat_mu = ctypes.c_float( 2.0 * 5.0 ) # TODO: Move this to sampling context @@ -543,7 +477,7 @@ def free_lora_adapter(): print( f"Using fallback chat format: {self.chat_format}", file=sys.stderr ) - + self._sampler = None @property @@ -553,6 +487,13 @@ def ctx(self) -> llama_cpp.llama_context_p: @property def model(self) -> llama_cpp.llama_model_p: return self._model.model + + @property + def n_layer(self) -> int: + return self._model.n_layer() + + def dev_layer(self, il: int) -> internals.LlamaBackendDev: + return self._model.dev_layer(il) @property def _input_ids(self) -> npt.NDArray[np.intc]: @@ -1120,6 +1061,50 @@ def decode_batch(seq_sizes: List[int]): else: return output + def _create_chunk( + self, + completion_id: str, + created: int, + model_name: str, + text: str, + logprobs_or_none: Union[Optional[CompletionLogprobs], None], + index: int, + finish_reason: Union[str, None], + usage: Optional[Dict[str, Any]] = None, + ) -> CreateCompletionStreamResponse: + """Create chunks for streaming API, depending on whether usage is requested or not.""" + if usage is not None: + return { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": text, + "index": index, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason, + } + ], + "usage": usage, + } + else: + return { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": text, + "index": index, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason, + } + ], + } + def _create_completion( self, prompt: Union[str, List[int]], @@ -1446,24 +1431,20 @@ def logit_bias_processor( "top_logprobs": [top_logprob], } returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": self.detokenize( - [token], - prev_tokens=prompt_tokens - + completion_tokens[:returned_tokens], - ).decode("utf-8", errors="ignore"), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": None, - } - ], - } + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=self.detokenize( + [token], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], + ).decode("utf-8", errors="ignore"), + logprobs_or_none=logprobs_or_none, + index=0, + finish_reason=None, + usage=None, + ) else: while len(remaining_tokens) > 0: decode_success = False @@ -1492,20 +1473,16 @@ def logit_bias_processor( remaining_tokens = remaining_tokens[i:] returned_tokens += i - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": ts, - "index": 0, - "logprobs": None, - "finish_reason": None, - } - ], - } + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=ts, + logprobs_or_none=None, + index=0, + finish_reason=None, + usage=None, + ) if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) @@ -1584,54 +1561,51 @@ def logit_bias_processor( if token_end_position == end - 1: break returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": None, - } - ], - } + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=last_text[ + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), + logprobs_or_none=logprobs_or_none, + index=0, + finish_reason=None, + usage=None, + ) break returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" - ), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": None, - } - ], - } - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": "", - "index": 0, - "logprobs": None, - "finish_reason": finish_reason, - } - ], + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + logprobs_or_none=logprobs_or_none, + index=0, + finish_reason=None, + usage=None, + ) + + # Final streaming chunk with both finish_reason and usage + usage = { + "prompt_tokens": len(prompt_tokens), + "completion_tokens": returned_tokens, + "total_tokens": len(prompt_tokens) + returned_tokens, } + + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text="", + logprobs_or_none=None, + index=0, + finish_reason=finish_reason, + usage=usage, + ) + if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) @@ -2363,6 +2337,266 @@ def from_pretrained( **kwargs, ) + def _create_context( + self, + *, + n_ctx: int = 512, + n_batch: int = 512, + n_ubatch: int = 512, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[ + int + ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + rope_freq_base: float = 0.0, + rope_freq_scale: float = 0.0, + yarn_ext_factor: float = -1.0, + yarn_attn_factor: float = 1.0, + yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, + logits_all: bool = False, + embedding: bool = False, + offload_kqv: bool = True, + flash_attn: bool = False, + # Sampling Params + no_perf: bool = False, + last_n_tokens_size: int = 64, + type_k: Optional[int] = None, + type_v: Optional[int] = None, + state: Optional[LlamaState] = None, + ) -> None: + """Free the existing context and create a new one with specified parameters. + + Args: + n_ctx: Text context size. If 0, value from model is used. + n_batch: Maximum batch size for llama_decode. + n_ubatch: Maximum physical batch size. + n_seq_max: Maximum number of sequences (distinct states for recurrent models). + n_threads: Number of threads to use for generation. + n_threads_batch: Number of threads to use for batch processing. + rope_scaling_type: RoPE scaling type from llama_rope_scaling_type enum. + pooling_type: Whether to pool embedding results by sequence id. + attention_type: Attention type to use for embeddings. + rope_freq_base: RoPE base frequency, 0 = from model. + rope_freq_scale: RoPE frequency scaling factor, 0 = from model. + yarn_ext_factor: YaRN extrapolation mix factor, negative = from model. + yarn_attn_factor: YaRN magnitude scaling factor. + yarn_beta_fast: YaRN low correction dim. + yarn_beta_slow: YaRN high correction dim. + yarn_orig_ctx: YaRN original context size. + defrag_thold: Defragment KV cache if holes/size > thold, < 0 disabled. + type_k: Data type for K cache. + type_v: Data type for V cache. + logits_all: Compute all logits in llama_decode (deprecated). + embeddings: Extract embeddings with logits. + offload_kqv: Offload KQV ops (including KV cache) to GPU. + flash_attn: Use flash attention. + no_perf: Disable performance timings. + last_n_tokens_size: Size of the last n tokens. + type_k: Data type for K cache. + type_v: Data type for V cache. + """ + # Create new context params with provided values + self.n_batch = min(n_ctx, n_batch) # ??? + self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) + self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() + + # Context Params + self.context_params = llama_cpp.llama_context_default_params() + self.context_params.n_ctx = n_ctx + self.context_params.n_batch = self.n_batch + self.context_params.n_ubatch = min(self.n_batch, n_ubatch) + self.context_params.n_threads = self.n_threads + self.context_params.n_threads_batch = self.n_threads_batch + self.context_params.rope_scaling_type = ( + rope_scaling_type + if rope_scaling_type is not None + else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + ) + self.context_params.pooling_type = pooling_type + self.context_params.rope_freq_base = ( + rope_freq_base if rope_freq_base != 0.0 else 0 + ) + self.context_params.rope_freq_scale = ( + rope_freq_scale if rope_freq_scale != 0.0 else 0 + ) + self.context_params.yarn_ext_factor = ( + yarn_ext_factor if yarn_ext_factor != 0.0 else 0 + ) + self.context_params.yarn_attn_factor = ( + yarn_attn_factor if yarn_attn_factor != 0.0 else 0 + ) + self.context_params.yarn_beta_fast = ( + yarn_beta_fast if yarn_beta_fast != 0.0 else 0 + ) + self.context_params.yarn_beta_slow = ( + yarn_beta_slow if yarn_beta_slow != 0.0 else 0 + ) + self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 + self.context_params.logits_all = ( + logits_all if self.draft_model is None else True + ) # Must be set to True for speculative decoding + self.context_params.embeddings = embedding # TODO: Rename to embeddings + self.context_params.offload_kqv = offload_kqv + self.context_params.flash_attn = flash_attn + # KV cache quantization + if type_k is not None: + self.context_params.type_k = type_k + if type_v is not None: + self.context_params.type_v = type_v + + self.context_params.no_perf = no_perf + self.last_n_tokens_size = last_n_tokens_size + + # Store logits_all as instance attribute + self._logits_all = self.context_params.logits_all + + # Set the default value for the context and correct the batch + if n_ctx == 0: + n_ctx = self._model.n_ctx_train() + self.n_batch = min(n_ctx, n_batch) + self.context_params.n_ctx = self._model.n_ctx_train() + self.context_params.n_batch = self.n_batch + self.context_params.n_ubatch = min(self.n_batch, n_ubatch) + + self._ctx = self._stack.enter_context( + contextlib.closing( + internals.LlamaContext( + model=self._model, + params=self.context_params, + verbose=self.verbose, + ) + ) + ) + + if state is not None: + self.load_state(state) + + self._n_ctx = self.n_ctx() + self.n_tokens = 0 + + self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) + self.scores: npt.NDArray[np.single] = np.ndarray( + (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single + ) + + self._batch = self._stack.enter_context( + contextlib.closing( + internals.LlamaBatch( + n_tokens=self.n_batch, + embd=0, + n_seq_max=self.context_params.n_ctx, + verbose=self.verbose, + ) + ) + ) + + + + if self._ctx is None: + raise RuntimeError("Failed to create new context") + + def recreate_context( + self, + *, + n_ctx: int = 512, + n_batch: int = 512, + n_ubatch: int = 512, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[ + int + ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + rope_freq_base: float = 0.0, + rope_freq_scale: float = 0.0, + yarn_ext_factor: float = -1.0, + yarn_attn_factor: float = 1.0, + yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, + logits_all: bool = False, + embedding: bool = False, + offload_kqv: bool = True, + flash_attn: bool = False, + # Sampling Params + no_perf: bool = False, + last_n_tokens_size: int = 64, + type_k: Optional[int] = None, + type_v: Optional[int] = None, + ) -> None: + """Free the existing context and create a new one with specified parameters. + + Args: + n_ctx: Text context size. If 0, value from model is used. + n_batch: Maximum batch size for llama_decode. + n_ubatch: Maximum physical batch size. + n_seq_max: Maximum number of sequences (distinct states for recurrent models). + n_threads: Number of threads to use for generation. + n_threads_batch: Number of threads to use for batch processing. + rope_scaling_type: RoPE scaling type from llama_rope_scaling_type enum. + pooling_type: Whether to pool embedding results by sequence id. + attention_type: Attention type to use for embeddings. + rope_freq_base: RoPE base frequency, 0 = from model. + rope_freq_scale: RoPE frequency scaling factor, 0 = from model. + yarn_ext_factor: YaRN extrapolation mix factor, negative = from model. + yarn_attn_factor: YaRN magnitude scaling factor. + yarn_beta_fast: YaRN low correction dim. + yarn_beta_slow: YaRN high correction dim. + yarn_orig_ctx: YaRN original context size. + defrag_thold: Defragment KV cache if holes/size > thold, < 0 disabled. + type_k: Data type for K cache. + type_v: Data type for V cache. + logits_all: Compute all logits in llama_decode (deprecated). + embeddings: Extract embeddings with logits. + offload_kqv: Offload KQV ops (including KV cache) to GPU. + flash_attn: Use flash attention. + no_perf: Disable performance timings. + last_n_tokens_size: Size of the last n tokens. + type_k: Data type for K cache. + type_v: Data type for V cache. + """ + + current_state = None + + if self._ctx is not None: + current_state = self.save_state() + self._ctx.close() + self._ctx = None + + # Free existing context if it exists + self._create_context( + n_ctx=n_ctx, + n_batch=n_batch, + n_ubatch=min(n_batch, n_ubatch), + n_threads=n_threads, + n_threads_batch=n_threads_batch, + rope_scaling_type=rope_scaling_type, + pooling_type=pooling_type, + rope_freq_base=rope_freq_base, + rope_freq_scale=rope_freq_scale, + yarn_ext_factor=yarn_ext_factor, + yarn_attn_factor=yarn_attn_factor, + yarn_beta_fast=yarn_beta_fast, + yarn_beta_slow=yarn_beta_slow, + yarn_orig_ctx=yarn_orig_ctx, + logits_all=logits_all, + embedding=embedding, + offload_kqv=offload_kqv, + flash_attn=flash_attn, + no_perf=no_perf, + last_n_tokens_size=last_n_tokens_size, + type_k=type_k, + type_v=type_v, + state=current_state, + ) + + # Reapply any LoRA adapter if it exists + if self._lora_adapter is not None: + llama_cpp.llama_set_adapter_lora(self._ctx, self._lora_adapter, self.lora_scale) + class LlamaState: def __init__( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a288db7b..76c8ea66 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -7,6 +7,7 @@ import dataclasses import random import string +import warnings from contextlib import ExitStack from typing import ( @@ -349,6 +350,7 @@ def _convert_text_completion_chunks_to_chat( "finish_reason": chunk["choices"][0]["finish_reason"], } ], + "usage": chunk.get("usage") if "usage" in chunk else None, } @@ -433,7 +435,7 @@ def _stream_response_to_function_stream( created = chunk["created"] model = chunk["model"] tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] - yield { + response = { "id": id_, "object": "chat.completion.chunk", "created": created, @@ -452,7 +454,11 @@ def _stream_response_to_function_stream( } ], } - yield { + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response + + response = { "id": "chat" + chunk["id"], "object": "chat.completion.chunk", "created": chunk["created"], @@ -486,10 +492,14 @@ def _stream_response_to_function_stream( } ], } + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response first = False continue + assert tool_id is not None - yield { + response = { "id": "chat" + chunk["id"], "object": "chat.completion.chunk", "created": chunk["created"], @@ -521,9 +531,12 @@ def _stream_response_to_function_stream( } ], } + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response if id_ is not None and created is not None and model is not None: - yield { + response = { "id": id_, "object": "chat.completion.chunk", "created": created, @@ -542,6 +555,9 @@ def _stream_response_to_function_stream( } ], } + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response return _stream_response_to_function_stream(chunks) @@ -2122,6 +2138,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) first = False if tools is not None: @@ -2162,6 +2179,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # Yield tool_call/function_call stop message yield llama_types.CreateChatCompletionStreamResponse( @@ -2184,6 +2202,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # If "auto" or no tool_choice/function_call elif isinstance(function_call, str) and function_call == "auto": @@ -2219,6 +2238,7 @@ def generate_streaming(tools, functions, function_call, prompt): "finish_reason": None, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) else: prompt += f"{function_name}\n<|content|>" @@ -2264,6 +2284,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # Generate content stops = [RECIPIENT_TOKEN, STOP_TOKEN] @@ -2301,6 +2322,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) is_end = False elif chunk["choices"][0]["text"] == "\n": @@ -2330,6 +2352,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # Check whether the model wants to generate another turn if ( @@ -2362,6 +2385,7 @@ def generate_streaming(tools, functions, function_call, prompt): "finish_reason": "stop", } ], + usage=chunk["usage"] if "usage" in chunk else None, ) break else: @@ -2411,6 +2435,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) prompt += completion_text.strip() grammar = None @@ -2450,6 +2475,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) break @@ -2649,7 +2675,6 @@ def generate_streaming(tools, functions, function_call, prompt): usage=completion["usage"], ) - class Llava15ChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." @@ -3028,11 +3053,18 @@ def _load_image(image_url: str) -> bytes: import base64 image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes - else: + elif image_url.startswith("http") or image_url.startswith("https"): import urllib.request with urllib.request.urlopen(image_url) as f: image_bytes = f.read() return image_bytes + else: + import os + if os.path.exists(image_url): + with open(image_url, "rb") as f: + image_bytes = f.read() + return image_bytes + raise ValueError(f"Image file does not exist: {image_url}") @staticmethod def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @@ -3510,6 +3542,229 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class Gemma3ChatHandler(Llava15ChatHandler): + # Chat Format: + # 'user\n{system_prompt}\n\n{prompt}\nmodel\n' + + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = ( + "{{ '' }}" + "{%- if messages[0]['role'] == 'system' -%}" + "{%- if messages[0]['content'] is string -%}" + "{%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}" + "{%- else -%}" + "{%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}" + "{%- endif -%}" + "{%- set loop_messages = messages[1:] -%}" + "{%- else -%}" + "{%- set first_user_prefix = \"\" -%}" + "{%- set loop_messages = messages -%}" + "{%- endif -%}" + "{%- for message in loop_messages -%}" + "{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}" + "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + "{%- endif -%}" + "{%- if (message['role'] == 'assistant') -%}" + "{%- set role = \"model\" -%}" + "{%- else -%}" + "{%- set role = message['role'] -%}" + "{%- endif -%}" + "{{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] | trim }}" + "{%- elif message['content'] is iterable -%}" + "{%- for item in message['content'] -%}" + "{%- if item['type'] == 'image_url' -%}" + "{{ '' }}" + "{%- elif item['type'] == 'text' -%}" + "{{ item['text'] | trim }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- else -%}" + "{{ raise_exception(\"Invalid content type\") }}" + "{%- endif -%}" + "{{ '\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ 'model\n' }}" + "{%- endif -%}" + ) + + @staticmethod + def split_text_on_image_urls(text: str, image_urls: List[str]): + split_text: List[Tuple[Literal["text", "image_url"], str]] = [] + copied_urls = image_urls[:] + remaining = text + image_placeholder = "" + + while remaining: + # Find placeholder + pos = remaining.find(image_placeholder) + if pos != -1: + assert len(copied_urls) > 0 + if pos > 0: + split_text.append(("text", remaining[:pos])) + split_text.append(("text", "\n\n")) + split_text.append(("image_url", copied_urls.pop(0))) + split_text.append(("text", "\n\n")) + remaining = remaining[pos + len(image_placeholder):] + else: + assert len(copied_urls) == 0 + split_text.append(("text", remaining)) + remaining = "" + return split_text + + +def _accumulate_chunks( + chunks_iterator: Iterator[llama_types.CreateCompletionStreamResponse], + chunks_list: List[llama_types.CreateCompletionStreamResponse], +) -> Iterator[llama_types.CreateCompletionStreamResponse]: + for chunk in chunks_iterator: + chunks_list.append(chunk) + yield chunk + + +def _convert_chunks_to_completion( + chunks: List[llama_types.CreateCompletionStreamResponse], +) -> llama_types.CreateCompletionResponse: + """Convert a list of completion chunks to a completion.""" + # Accumulate completion response values + text: str = "" + finish_reason: Optional[str] = None + logprobs: Optional[llama_types.CompletionLogprobs] = None + prompt_tokens = 0 + completion_tokens = 0 + total_tokens = 0 + completion_id: Optional[str] = None + completion_model: Optional[str] = None + completion_created: Optional[int] = None + for chunk in chunks: + # Extract the id, model, and created values from the first chunk + if completion_id is None: + completion_id = chunk["id"] + completion_model = chunk["model"] + completion_created = chunk["created"] + # Extract the usage if present in the chunk + usage = chunk.get("usage") + if usage: + prompt_tokens += usage.get("prompt_tokens", 0) + completion_tokens += usage.get("completion_tokens", 0) + total_tokens += usage.get("total_tokens", 0) + # Accumulate the chunk text + choice = chunk["choices"][0] + text += choice.get("text", "") + # Extract the finish_reason and logprobs if present in the chunk + if choice.get("finish_reason"): + finish_reason = choice["finish_reason"] + if choice.get("logprobs"): + logprobs = choice["logprobs"] + # Create the completion response + completion: llama_types.CreateCompletionResponse = { + "id": completion_id or "unknown_id", + "object": "text_completion", + "created": completion_created or 0, + "model": completion_model or "unknown_model", + "choices": [ + { + "text": text, + "index": 0, + "logprobs": logprobs, # TODO: Improve accumulation of logprobs + "finish_reason": finish_reason, # type: ignore[typeddict-item] + } + ], + } + # Add usage section if present in the chunks + if (prompt_tokens + completion_tokens + total_tokens) > 0: + completion["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + } + return completion + + +def _stream_tool_calls( + llama: llama.Llama, + prompt: str, + tools: List[llama_types.ChatCompletionTool], + tool_name: str, + completion_kwargs: dict[str, Any], + follow_up_gbnf_tool_grammar: str, +) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: + # Generate a tool call completions + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + completions: List[llama_types.CreateCompletionResponse] = [] + completions_tool_name: List[str] = [] + finish_reason_chat_chunk = None + while tool is not None and len(completions) <= 16: + # Generate the parameter values for the selected tool + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + ) + except Exception as e: + warnings.warn( + f"Failed to parse function body as JSON schema, falling back to default grammar\n\n{e}", + category=RuntimeWarning, + stacklevel=2, + ) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + completion_or_chunks = llama.create_completion( + prompt=prompt, + **{ + **completion_kwargs, + "max_tokens": None, + "grammar": grammar, + }, + ) + chunks: List[llama_types.CreateCompletionResponse] = [] + chat_chunks = _convert_completion_to_chat_function( + tool_name, + _accumulate_chunks(completion_or_chunks, chunks), # type: ignore[arg-type] + stream=True, + ) + for chat_chunk in chat_chunks: + # Don't return the finish_reason chunk + if chat_chunk["choices"] and chat_chunk["choices"][0].get("finish_reason"): + finish_reason_chat_chunk = chat_chunk + break + # Update this tool call's index + if chat_chunk["choices"] and chat_chunk["choices"][0]["delta"].get("tool_calls"): + chat_chunk["choices"][0]["delta"]["tool_calls"][0]["index"] = len(completions) + yield chat_chunk + completion = _convert_chunks_to_completion(chunks) + completions.append(completion) + completions_tool_name.append(tool_name) + prompt += completion["choices"][0]["text"] + prompt += "\n" + # Determine whether to call another tool or stop + response = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [*completion_kwargs["stop"], ":", ""], + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), + ) + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + # Yield the finish_reason chunk + if finish_reason_chat_chunk is not None: + yield finish_reason_chat_chunk + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, @@ -3539,7 +3794,7 @@ def chatml_function_calling( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, - **kwargs, # type: ignore + **kwargs: Any, ) -> Union[ llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], @@ -3553,18 +3808,21 @@ def chatml_function_calling( "{% if tool_calls %}" "\n\nYou have access to the following functions:\n" "{% for tool in tools %}" + '\n{% if tool.function.get("description") %}/* {{ tool.function.description | trim }} */{% endif %}' "\nfunctions.{{ tool.function.name }}:\n" "{{ tool.function.parameters | tojson }}" "\n{% endfor %}" - "\n\nYou can respond to users messages with either a single message or one or more function calls." - "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\nYou must respond to user messages with either a single message or with one or more function calls." + "\n\nTo respond with a message use the following format:" "\n\nmessage:" "\n" - "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" - "\n\nfunctions.:" + "\n\nTo respond with one or more function calls use the following format:" + "\n\n" + "\nfunctions.:" '\n{ "arg1": "value1", "arg2": "value2" }' "\nfunctions.:" '\n{ "arg1": "value1", "arg2": "value2" }' + "\n" "{% endif %}" "<|im_end|>\n" "{% endif %}" @@ -3575,7 +3833,7 @@ def chatml_function_calling( "{% endif %}" # Assistant message "{% if message.role == 'assistant' %}" - ## Reglar message + ## Regular message "{% if message.content and message.content | length > 0 %}" "{% if tool_calls %}" "message:\n" @@ -3602,35 +3860,55 @@ def chatml_function_calling( # Convert legacy functions to tools if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] + tools = [{"type": "function", "function": function} for function in functions] # Convert legacy function_call to tool_choice if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): + if isinstance(function_call, str) and (function_call in ("none", "auto")): tool_choice = function_call if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } + tool_choice = {"type": "function", "function": {"name": function_call["name"]}} + # Collect the llama.create_completion keyword arguments so we don't have to repeat these with + # each completion call stop = ( [stop, "<|im_end|>"] if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + else [*stop, "<|im_end|>"] + if stop + else ["<|im_end|>"] ) + grammar = ( # It is assumed the grammar applies to messages only, not tool calls + grammar + if grammar is not None + else ( + _grammar_for_response_format(response_format) + if response_format is not None and response_format["type"] == "json_object" + else None + ) + ) + completion_kwargs = { + "temperature": temperature, + "top_p": top_p, + "top_k": top_k, + "min_p": min_p, + "typical_p": typical_p, + "stream": stream, + "stop": stop, + "max_tokens": max_tokens, + "presence_penalty": presence_penalty, + "frequency_penalty": frequency_penalty, + "repeat_penalty": repeat_penalty, + "tfs_z": tfs_z, + "mirostat_mode": mirostat_mode, + "mirostat_tau": mirostat_tau, + "mirostat_eta": mirostat_eta, + "model": model, + "logits_processor": logits_processor, + "grammar": grammar, + } - # Case 1: No tool choice by user + # Case 1: No tool use if ( tool_choice is None or (isinstance(tool_choice, str) and tool_choice == "none") @@ -3638,316 +3916,526 @@ def chatml_function_calling( or len(tools) == 0 ): prompt = template_renderer.render( - messages=messages, - tools=[], - tool_calls=None, - add_generation_prompt=True, + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True ) - - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - return _convert_completion_to_chat( llama.create_completion( prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, + **completion_kwargs, # type: ignore[arg-type] logprobs=top_logprobs if logprobs else None, ), stream=stream, ) - # Case 2: Tool choice by user + # Ensure there is a system prompt to attach the tool metadata to + if not any(message["role"] == "system" for message in messages): + messages = [*messages, {"role": "system", "content": ""}] + + # Case 2: Automatic or fixed tool choice + # Case 2 step 1: Determine whether to respond with a message or a tool call + assert (isinstance(tool_choice, str) and tool_choice == "auto") or isinstance(tool_choice, dict) if isinstance(tool_choice, dict): - tool_name = tool_choice["function"]["name"] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None + tools = [t for t in tools if t["function"]["name"] == tool_choice["function"]["name"]] + assert tools + function_names = " | ".join([f'''"functions.{t['function']['name']}:"''' for t in tools]) + prompt = template_renderer.render( + messages=messages, tools=tools, tool_calls=True, add_generation_prompt=True + ) + initial_gbnf_tool_grammar = ( + ( + 'root ::= "" "\\n" functions | "message:"\n' + f"functions ::= {function_names}\n" ) - if tool is None: - raise ValueError(f"Tool with name '{tool_name}' not found in tools") + if tool_choice == "auto" + else f'root ::= "" "\\n" functions\nfunctions ::= {function_names}\n' + ) + completion = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [":"], + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + initial_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), + ) + text = completion["choices"][0]["text"] + tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :] + + # Case 2 step 2A: Respond with a message + if tool_name is None: prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True + ) + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + **completion_kwargs, # type: ignore[arg-type] + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, + ) + + # Case 2 step 2B: One or more function calls + follow_up_gbnf_tool_grammar = ( + 'root ::= functions | "" | "<|im_end|>"\n' + f"functions ::= {function_names}\n" + ) + prompt += "\n" + if stream: + return _stream_tool_calls( + llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar ) + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + completions: List[llama_types.CreateCompletionResponse] = [] + completions_tool_name: List[str] = [] + while tool is not None and len(completions) <= 16: + # Generate the parameter values for the selected tool prompt += f"functions.{tool_name}:\n" try: grammar = llama_grammar.LlamaGrammar.from_json_schema( json.dumps(tool["function"]["parameters"]), verbose=llama.verbose ) except Exception as e: + warnings.warn( + f"Failed to parse function body as JSON schema, falling back to default grammar\n\n{e}", + category=RuntimeWarning, + stacklevel=2, + ) grammar = llama_grammar.LlamaGrammar.from_string( llama_grammar.JSON_GBNF, verbose=llama.verbose ) - if llama.verbose: - print( - "Failed to parse function body as JSON schema, falling back to default grammar" - ) - print(e) completion_or_chunks = llama.create_completion( prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, + **{ # type: ignore[arg-type] + **completion_kwargs, + "max_tokens": None, + "grammar": grammar, + }, ) - return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream + completion = cast(llama_types.CreateCompletionResponse, completion_or_chunks) + completions.append(completion) + completions_tool_name.append(tool_name) + prompt += completion["choices"][0]["text"] + prompt += "\n" + # Determine whether to call another tool or stop + response = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [*completion_kwargs["stop"], ":", ""], # type: ignore[misc] + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), ) + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + # Merge the completions into a single chat completion + chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"], + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + for i, (tool_name, completion) in enumerate( + zip(completions_tool_name, completions) + ) + ], + }, + } + ], + "usage": { + "completion_tokens": sum( + (completion["usage"]["completion_tokens"] if "usage" in completion else 0) + for completion in completions + ), + "prompt_tokens": sum( + completion["usage"]["prompt_tokens"] if "usage" in completion else 0 + for completion in completions + ), + "total_tokens": sum( + completion["usage"]["total_tokens"] if "usage" in completion else 0 + for completion in completions + ), + }, + } + if len(completions) == 1: + single_function_call: llama_types.ChatCompletionResponseFunctionCall = { + "name": tool_name, + "arguments": completions[0]["choices"][0]["text"], + } + chat_completion["choices"][0]["message"]["function_call"] = single_function_call + return chat_completion + + +@register_chat_completion_handler("gguf-function-calling") +def gguf_function_calling( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs: Any, +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + + function_calling_template = None + if hasattr(llama, 'model_path'): + metadata = llama.metadata + if metadata and "tokenizer.chat_template" in metadata: + function_calling_template = metadata["tokenizer.chat_template"] + - # Case 3: Automatic tool choice - assert isinstance(tool_choice, str) and tool_choice == "auto" - function_names = " | ".join( - [f'''"functions.{tool['function']['name']}:"''' for tool in tools] + function_calling_template = ( + "{% for message in messages %}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + '\n{% if tool.function.get("description") %}/* {{ tool.function.description | trim }} */{% endif %}' + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\nYou must respond to user messages with either a single message or with one or more function calls." + "\n\nTo respond with a message use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls use the following format:" + "\n\n" + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\n" + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Regular message + "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" + "message:\n" + "{% endif %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if 'tool_calls' in message %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" + "{% endfor %}" + "<|im_end|>\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) - initial_gbnf_tool_grammar = ( - """root ::= functions | "message:"\n""" - f"""functions ::= {function_names}\n""" + template_renderer = ImmutableSandboxedEnvironment( + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) + + # Convert legacy functions to tools + if functions is not None: + tools = [{"type": "function", "function": function} for function in functions] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and (function_call in ("none", "auto")): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = {"type": "function", "function": {"name": function_call["name"]}} + + # Collect the llama.create_completion keyword arguments so we don't have to repeat these with + # each completion call + stop = ( + [stop, "<|im_end|>"] + if isinstance(stop, str) + else [*stop, "<|im_end|>"] + if stop + else ["<|im_end|>"] ) - follow_up_gbnf_tool_grammar = ( - """root ::= functions | "<|im_end|>"\n""" - f"""functions ::= {function_names}\n""" + grammar = ( # It is assumed the grammar applies to messages only, not tool calls + grammar + if grammar is not None + else ( + _grammar_for_response_format(response_format) + if response_format is not None and response_format["type"] == "json_object" + else None + ) ) + completion_kwargs = { + "temperature": temperature, + "top_p": top_p, + "top_k": top_k, + "min_p": min_p, + "typical_p": typical_p, + "stream": stream, + "stop": stop, + "max_tokens": max_tokens, + "presence_penalty": presence_penalty, + "frequency_penalty": frequency_penalty, + "repeat_penalty": repeat_penalty, + "tfs_z": tfs_z, + "mirostat_mode": mirostat_mode, + "mirostat_tau": mirostat_tau, + "mirostat_eta": mirostat_eta, + "model": model, + "logits_processor": logits_processor, + "grammar": grammar, + } + + # Case 1: No tool use + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True + ) + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + **completion_kwargs, # type: ignore[arg-type] + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, + ) + + # Ensure there is a system prompt to attach the tool metadata to + if not any(message["role"] == "system" for message in messages): + messages = [*messages, {"role": "system", "content": ""}] + + # Case 2: Automatic or fixed tool choice + # Case 2 step 1: Determine whether to respond with a message or a tool call + assert (isinstance(tool_choice, str) and tool_choice == "auto") or isinstance(tool_choice, dict) + if isinstance(tool_choice, dict): + tools = [t for t in tools if t["function"]["name"] == tool_choice["function"]["name"]] + assert tools + function_names = " | ".join([f'''"functions.{t['function']['name']}:"''' for t in tools]) prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, + messages=messages, tools=tools, tool_calls=True, add_generation_prompt=True ) - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=0, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=False, - stop=[":"], - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=llama_grammar.LlamaGrammar.from_string( - initial_gbnf_tool_grammar, verbose=llama.verbose + initial_gbnf_tool_grammar = ( + ( + 'root ::= "" "\\n" functions | "message:"\n' + f"functions ::= {function_names}\n" + ) + if tool_choice == "auto" + else f'root ::= "" "\\n" functions\nfunctions ::= {function_names}\n' + ) + completion = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [":"], + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + initial_gbnf_tool_grammar, verbose=llama.verbose + ), + }, ), ) - completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore text = completion["choices"][0]["text"] - if "message" in text: + tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :] + + # Case 2 step 2A: Respond with a message + if tool_name is None: + prompt = template_renderer.render( + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True + ) return _convert_completion_to_chat( llama.create_completion( - prompt=prompt + "message:\n", - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=["<|im_end|>"], + prompt=prompt, + **completion_kwargs, # type: ignore[arg-type] logprobs=top_logprobs if logprobs else None, - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=llama_grammar.LlamaGrammar.from_string( - follow_up_gbnf_tool_grammar, verbose=llama.verbose - ), ), stream=stream, ) - # One or more function calls - tool_name = text[len("functions.") :] + # Case 2 step 2B: One or more function calls + follow_up_gbnf_tool_grammar = ( + 'root ::= functions | "" | "<|im_end|>"\n' + f"functions ::= {function_names}\n" + ) + prompt += "\n" + if stream: + return _stream_tool_calls( + llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar + ) tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) - if not stream: - completions: List[llama_types.CreateCompletionResponse] = [] - completions_tool_name: List[str] = [] - while tool is not None: - prompt += f"functions.{tool_name}:\n" - try: - grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(tool["function"]["parameters"]), verbose=llama.verbose - ) - except Exception as e: - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose - ) - if llama.verbose: - print( - "Failed to parse function body as JSON schema, falling back to default grammar" - ) - print(e) - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=False, - stop=stop, - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, - ) - completion_or_chunks = cast( - llama_types.CreateCompletionResponse, completion_or_chunks + completions: List[llama_types.CreateCompletionResponse] = [] + completions_tool_name: List[str] = [] + while tool is not None and len(completions) <= 16: + # Generate the parameter values for the selected tool + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose ) - completions.append(completion_or_chunks) - completions_tool_name.append(tool_name) - prompt += completion_or_chunks["choices"][0]["text"] - prompt += "\n" - - response = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=False, - stop=stop, - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=llama_grammar.LlamaGrammar.from_string( - follow_up_gbnf_tool_grammar, verbose=llama.verbose - ), + except Exception as e: + warnings.warn( + f"Failed to parse function body as JSON schema, falling back to default grammar\n\n{e}", + category=RuntimeWarning, + stacklevel=2, ) - response = cast(llama_types.CreateCompletionResponse, response) - - tool_name = response["choices"][0]["text"][len("functions.") :] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose ) - - # Merge completions - function_call_dict: Union[ - Dict[str, str], - Dict[ - Literal["function_call"], - llama_types.ChatCompletionRequestAssistantMessageFunctionCall, - ], - ] = ( - { - "function_call": { - "name": tool_name, - "arguments": completions[0]["choices"][0]["text"], - } - } - if len(completions) == 1 - else {} + completion_or_chunks = llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "max_tokens": None, + "grammar": grammar, + }, ) - return { - "id": "chat" + completion["id"], - "object": "chat.completion", - "created": completion["created"], - "model": completion["model"], - "choices": [ - { - "finish_reason": "tool_calls", - "index": 0, - "logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), - "message": { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_" - + f"_{i}_" - + tool_name - + "_" - + completion["id"], - "type": "function", - "function": { - "name": tool_name, - "arguments": completion["choices"][0]["text"], - }, - } - for i, (tool_name, completion) in enumerate( - zip(completions_tool_name, completions) - ) - ], - **function_call_dict, - }, - } - ], - "usage": { - "completion_tokens": sum( - ( - completion["usage"]["completion_tokens"] - if "usage" in completion - else 0 - ) - for completion in completions - ), - "prompt_tokens": sum( - completion["usage"]["prompt_tokens"] if "usage" in completion else 0 - for completion in completions - ), - "total_tokens": sum( - completion["usage"]["total_tokens"] if "usage" in completion else 0 - for completion in completions + completion = cast(llama_types.CreateCompletionResponse, completion_or_chunks) + completions.append(completion) + completions_tool_name.append(tool_name) + prompt += completion["choices"][0]["text"] + prompt += "\n" + # Determine whether to call another tool or stop + response = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [*completion_kwargs["stop"], ":", ""], # type: ignore[misc] + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), + ) + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + # Merge the completions into a single chat completion + chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] ), - }, + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"], + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + for i, (tool_name, completion) in enumerate( + zip(completions_tool_name, completions) + ) + ], + }, + } + ], + "usage": { + "completion_tokens": sum( + (completion["usage"]["completion_tokens"] if "usage" in completion else 0) + for completion in completions + ), + "prompt_tokens": sum( + completion["usage"]["prompt_tokens"] if "usage" in completion else 0 + for completion in completions + ), + "total_tokens": sum( + completion["usage"]["total_tokens"] if "usage" in completion else 0 + for completion in completions + ), + }, + } + if len(completions) == 1: + single_function_call: llama_types.ChatCompletionResponseFunctionCall = { + "name": tool_name, + "arguments": completions[0]["choices"][0]["text"], } - - raise ValueError("Automatic streaming tool choice is not supported") + chat_completion["choices"][0]["message"]["function_call"] = single_function_call + return chat_completion diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d13d6045..2c625505 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -10,6 +10,7 @@ NewType, Optional, TYPE_CHECKING, + List, ) from llama_cpp._ctypes_extensions import ( @@ -654,6 +655,12 @@ class llama_model_kv_override(ctypes.Structure): # ggml_backend_buffer_type_t buft; # }; +class llama_model_tensor_buft_override(ctypes.Structure): + _fields_ = [ + ("pattern", ctypes.c_char_p), + ("buft", ctypes.c_int), + ] + # struct llama_model_params { # // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) @@ -785,7 +792,8 @@ class llama_model_params(ctypes.Structure): # // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 # }; class llama_context_params(ctypes.Structure): - """Parameters for llama_context + """Parameters for llama_context. NOTE: changing the default values of parameters marked as [EXPERIMENTAL] + may cause crashes or incorrect results in certain configurations. Attributes: n_ctx (int): text context, 0 = from model @@ -795,7 +803,7 @@ class llama_context_params(ctypes.Structure): n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` - pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) + pooling_type (int): whether to pool (sum) embedding results by sequence id attention_type (int): attention type to use for embeddings rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model @@ -1428,6 +1436,11 @@ def llama_model_n_embd(model: llama_model_p, /) -> int: def llama_model_n_layer(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_dev_layer (const struct llama_model * model, int32_t il); +@ctypes_function("llama_model_dev_layer", [llama_model_p_ctypes, ctypes.c_int32], ctypes.c_int32) +def llama_model_dev_layer(model: llama_model_p, il: Union[ctypes.c_int32, int], /) -> int: + ... + # LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index f647822f..67772b8e 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -154,13 +154,13 @@ class ChatCompletionStreamResponseChoice(TypedDict): finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] logprobs: NotRequired[Optional[ChatCompletionLogprobs]] - class CreateChatCompletionStreamResponse(TypedDict): id: str model: str object: Literal["chat.completion.chunk"] created: int choices: List[ChatCompletionStreamResponseChoice] + usage: NotRequired[CompletionUsage] class ChatCompletionFunctions(TypedDict): diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py deleted file mode 100644 index d9dfaf5f..00000000 --- a/llama_cpp/llava_cpp.py +++ /dev/null @@ -1,158 +0,0 @@ -from __future__ import annotations - -import os -from ctypes import ( - c_bool, - c_char_p, - c_int, - c_uint8, - c_float, - c_void_p, - POINTER, - _Pointer, # type: ignore - Structure, -) -import pathlib -from typing import ( - Union, - NewType, - Optional, - TYPE_CHECKING, -) - -import llama_cpp.llama_cpp as llama_cpp - -from llama_cpp._ctypes_extensions import ( - load_shared_library, - ctypes_function_for_shared_library, -) - -if TYPE_CHECKING: - from llama_cpp._ctypes_extensions import ( - CtypesArray, - ) - - -# Specify the base name of the shared library to load -_libllava_base_name = "llava" -_libllava_override_path = os.environ.get("LLAVA_CPP_LIB") -_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path() - -# Load the library -_libllava = load_shared_library(_libllava_base_name, _libllava_base_path) - -ctypes_function = ctypes_function_for_shared_library(_libllava) - - -################################################ -# llava.h -################################################ - -# struct clip_ctx; -clip_ctx_p = NewType("clip_ctx_p", int) -clip_ctx_p_ctypes = c_void_p - - -# struct llava_image_embed { -# float * embed; -# int n_image_pos; -# }; -class llava_image_embed(Structure): - _fields_ = [ - ("embed", POINTER(c_float)), - ("n_image_pos", c_int), - ] - - -# /** sanity check for clip <-> llava embed size match */ -# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); -@ctypes_function( - "llava_validate_embed_size", - [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], - c_bool, -) -def llava_validate_embed_size( - ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / -) -> bool: - ... - - -# /** build an image embed from image file bytes */ -# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); -@ctypes_function( - "llava_image_embed_make_with_bytes", - [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], - POINTER(llava_image_embed), -) -def llava_image_embed_make_with_bytes( - ctx_clip: clip_ctx_p, - n_threads: Union[c_int, int], - image_bytes: CtypesArray[c_uint8], - image_bytes_length: Union[c_int, int], - /, -) -> "_Pointer[llava_image_embed]": - ... - - -# /** build an image embed from a path to an image filename */ -# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -@ctypes_function( - "llava_image_embed_make_with_filename", - [clip_ctx_p_ctypes, c_int, c_char_p], - POINTER(llava_image_embed), -) -def llava_image_embed_make_with_filename( - ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / -) -> "_Pointer[llava_image_embed]": - ... - - -# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); -# /** free an embedding made with llava_image_embed_make_* */ -@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) -def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): - ... - - -# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ -# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); -@ctypes_function( - "llava_eval_image_embed", - [ - llama_cpp.llama_context_p_ctypes, - POINTER(llava_image_embed), - c_int, - POINTER(c_int), - ], - c_bool, -) -def llava_eval_image_embed( - ctx_llama: llama_cpp.llama_context_p, - embed: "_Pointer[llava_image_embed]", - n_batch: Union[c_int, int], - n_past: "_Pointer[c_int]", - /, -) -> bool: - ... - - -################################################ -# clip.h -################################################ - - -# /** load mmproj model */ -# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); -@ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) -def clip_model_load( - fname: bytes, verbosity: Union[c_int, int], / -) -> Optional[clip_ctx_p]: - ... - - -# /** free mmproj model */ -# CLIP_API void clip_free(struct clip_ctx * ctx); -@ctypes_function("clip_free", [clip_ctx_p_ctypes], None) -def clip_free(ctx: clip_ctx_p, /): - ... - diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index a45f8f40..0c641ad4 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -1,10 +1,12 @@ from __future__ import annotations import os +import ctypes from ctypes import ( c_bool, c_char_p, c_int, + c_int32, c_uint8, c_uint32, c_float, @@ -17,6 +19,7 @@ ) import pathlib from typing import ( + List, Union, NewType, Optional, @@ -31,19 +34,161 @@ ) if TYPE_CHECKING: + from llama_cpp.llama_types import ( + llama_token, + llama_pos, + ) from llama_cpp._ctypes_extensions import ( CtypesArray, + CtypesPointer, ) +# Define input text structure +class mtmd_input_text(Structure): + _fields_ = [ + ("text", c_char_p), + ("add_special", c_bool), + ("parse_special", c_bool), + ] + +# Define context parameters structure +class mtmd_context_params(Structure): + _fields_ = [ + ("use_gpu", c_bool), + ("print_timings", c_bool), + ("n_threads", c_int), + ("verbosity", c_int), + ("image_marker", c_char_p), # const char* + ("media_marker", c_char_p), # const char* + ] + +# Define input chunk type enum +mtmd_input_chunk_type = c_int +( + MTMD_INPUT_CHUNK_TYPE_TEXT, + MTMD_INPUT_CHUNK_TYPE_IMAGE, + MTMD_INPUT_CHUNK_TYPE_AUDIO, +) = (0, 1, 2) + +# Define slice template enum +mtmd_slice_tmpl = c_int +( + MTMD_SLICE_TMPL_NONE, + MTMD_SLICE_TMPL_MINICPMV_2_5, + MTMD_SLICE_TMPL_MINICPMV_2_6, + MTMD_SLICE_TMPL_LLAMA4, +) = (0, 1, 2, 3) + +# Define whisper filters structure +class whisper_filters(Structure): + _fields_ = [ + ("n_mel", c_int), + ] + +# Define mtmd_context structure +class mtmd_context(Structure): + _fields_ = [ + ("ctx_v", c_void_p), # clip_ctx* + ("ctx_a", c_void_p), # clip_ctx* + ("text_model", c_void_p), # const llama_model* + ("image_embd_v", POINTER(c_float)), # std::vector + ("print_timings", c_bool), + ("n_threads", c_int), + ("media_marker", c_char_p), # std::string + ("n_embd_text", c_int), + ("img_beg", c_char_p), # std::string + ("img_end", c_char_p), # std::string + ("aud_beg", c_char_p), # std::string + ("aud_end", c_char_p), # std::string + ("slice_tmpl", c_int), # mtmd_slice_tmpl + ("tok_ov_img_start", llama_cpp.llama_token), + ("tok_ov_img_end", llama_cpp.llama_token), + ("tok_slices_start", llama_cpp.llama_token), + ("tok_slices_end", llama_cpp.llama_token), + ("tok_sli_img_start", llama_cpp.llama_token), + ("tok_sli_img_end", llama_cpp.llama_token), + ("tok_sli_img_mid", llama_cpp.llama_token), + ("tok_row_end", llama_cpp.llama_token), + ("tok_row_end_trail", c_bool), + ("ov_img_first", c_bool), + ("use_mrope", c_bool), + ("w_filters", whisper_filters), + ] + +# Define bitmap structure +class mtmd_bitmap(Structure): + _fields_ = [ + ("nx", c_uint32), + ("ny", c_uint32), + ("data", POINTER(c_uint8)), # Vector represented as pointer + ("id", c_char_p), + ("is_audio", c_bool), + ] + +# Define image tokens structure +class mtmd_image_tokens(Structure): + _fields_ = [ + ("nx", c_uint32), + ("ny", c_uint32), + ("use_mrope_pos", c_bool), + ("batch_f32", c_void_p), # clip_image_f32_batch + ("id", c_char_p), + ] -# Specify the base name of the shared library to load +# Define audio tokens structure +class mtmd_audio_tokens(Structure): + _fields_ = [ + ("n_tokens", c_uint32), + ("batch_f32", c_void_p), # clip_image_f32_batch + ("id", c_char_p), + ] + +# Define input chunk structure +class mtmd_input_chunk(Structure): + _fields_ = [ + ("type", mtmd_input_chunk_type), + ("tokens_text", POINTER(llama_cpp.llama_token)), # Vector represented as pointer + ("tokens_image", c_void_p), # mtmd_image_tokens_ptr + ("tokens_audio", c_void_p), # mtmd_audio_tokens_ptr + ] + +# Define input chunks structure +class mtmd_input_chunks(Structure): + _fields_ = [ + ("entries", POINTER(mtmd_input_chunk)), # Vector represented as pointer + ] + +# Define context pointer type +mtmd_context_p = NewType("mtmd_context_p", int) +mtmd_context_p_ctypes = c_void_p + +# Define bitmap pointer type +mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) +mtmd_bitmap_p_ctypes = c_void_p + +# Define input chunks pointer type +mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) +mtmd_input_chunks_p_ctypes = c_void_p + +# Define input chunk pointer type +mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int) +mtmd_input_chunk_p_ctypes = c_void_p + +# Define image tokens pointer type +mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int) +mtmd_image_tokens_p_ctypes = c_void_p + +# Define audio tokens pointer type +mtmd_audio_tokens_p = NewType("mtmd_audio_tokens_p", int) +mtmd_audio_tokens_p_ctypes = c_void_p + +# Load the library _libmtmd_base_name = "mtmd" _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") _libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() # Load the library _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) - ctypes_function = ctypes_function_for_shared_library(_libmtmd) ################################################ diff --git a/pyproject.toml b/pyproject.toml index 9983ef77..1f0aab57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,8 @@ test = [ "sse-starlette>=1.6.1", "starlette-context>=0.3.6,<0.4", "pydantic-settings>=2.0.1", - "huggingface-hub>=0.23.0" + "huggingface-hub>=0.23.0", + "typeguard>=4.2.1", ] dev = [ "black>=23.3.0", diff --git a/tests/monalisa.jpg b/tests/monalisa.jpg new file mode 100644 index 00000000..782ee4f9 Binary files /dev/null and b/tests/monalisa.jpg differ diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72..42bbac1f 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -1,14 +1,29 @@ import json +import os +import platform +from collections.abc import Iterator +from typing import cast +import pytest import jinja2 +from typeguard import ForwardRefPolicy, check_type from llama_cpp import ( ChatCompletionRequestUserMessage, + Llama, + llama_chat_format, + llama_supports_gpu_offload, + llama_types ) -import llama_cpp.llama_types as llama_types -import llama_cpp.llama_chat_format as llama_chat_format - from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter +from llama_cpp.llama_types import ( + ChatCompletionRequestMessage, + ChatCompletionTool, + ChatCompletionToolChoiceOption, + CreateChatCompletionResponse, + CreateChatCompletionStreamResponse, +) + def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" @@ -87,3 +102,118 @@ def test_hf_tokenizer_config_str_to_chat_formatter(): ) assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]" "") + + +def is_accelerator_available() -> bool: + """Check if an accelerator is available.""" + return llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8 + + +@pytest.mark.parametrize( + "stream", + [ + pytest.param(True, id="stream=True"), + pytest.param(False, id="stream=False"), + ], +) +@pytest.mark.parametrize( + "tool_choice", + [ + pytest.param("none", id="tool_choice=none"), + pytest.param("auto", id="tool_choice=auto"), + pytest.param( + {"type": "function", "function": {"name": "get_weather"}}, id="tool_choice=fixed" + ), + ], +) +@pytest.mark.parametrize( + "user_prompt_expected_tool_calls", + [ + pytest.param( + ("Is 7 a prime number?", 0), + id="expected_tool_calls=0", + ), + pytest.param( + ("What's the weather like in Paris today?", 1), + id="expected_tool_calls=1", + ), + pytest.param( + ("What's the weather like in Paris today? What about New York?", 2), + id="expected_tool_calls=2", + ), + ], +) +@pytest.mark.parametrize( + "llm_repo_id", + [ + pytest.param("bartowski/Llama-3.2-3B-Instruct-GGUF", id="llama_3.2_3B"), + pytest.param( + "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", + id="llama_3.1_8B", + marks=pytest.mark.skipif( + not is_accelerator_available(), reason="Accelerator not available" + ), + ), + ], +) +@pytest.mark.skipif( + platform.system() == "Darwin" and (os.cpu_count() or 1) < 8, + reason="Insufficient resources on macOS", +) +def test_llama_cpp_python_tool_use( + llm_repo_id: str, + user_prompt_expected_tool_calls: tuple[str, int], + tool_choice: ChatCompletionToolChoiceOption, + stream: bool, +) -> None: + """Test the upgraded chatml-function-calling llama-cpp-python chat handler.""" + user_prompt, expected_tool_calls = user_prompt_expected_tool_calls + if isinstance(tool_choice, dict) and expected_tool_calls == 0: + pytest.skip("Nonsensical") + llm = Llama.from_pretrained( + repo_id=llm_repo_id, + filename="*Q4_K_M.gguf", + n_ctx=4096, + n_gpu_layers=-1, + verbose=False, + chat_format="chatml-function-calling", + ) + messages: list[ChatCompletionRequestMessage] = [{"role": "user", "content": user_prompt}] + tools: list[ChatCompletionTool] = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather for a location.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "A city name."}}, + }, + }, + } + ] + response = llm.create_chat_completion( + messages=messages, tools=tools, tool_choice=tool_choice, stream=stream + ) + if stream: + response = cast(Iterator[CreateChatCompletionStreamResponse], response) + num_tool_calls = 0 + for chunk in response: + check_type(chunk, CreateChatCompletionStreamResponse) + tool_calls = chunk["choices"][0]["delta"].get("tool_calls") + if isinstance(tool_calls, list): + num_tool_calls = max(tool_call["index"] for tool_call in tool_calls) + 1 + assert num_tool_calls == (expected_tool_calls if tool_choice != "none" else 0) + else: + response = cast(CreateChatCompletionResponse, response) + check_type( + response, CreateChatCompletionResponse, forward_ref_policy=ForwardRefPolicy.IGNORE + ) + if expected_tool_calls == 0 or tool_choice == "none": + assert response["choices"][0]["message"].get("tool_calls") is None + else: + assert len(response["choices"][0]["message"]["tool_calls"]) == expected_tool_calls + assert all( + tool_call["function"]["name"] == tools[0]["function"]["name"] + for tool_call in response["choices"][0]["message"]["tool_calls"] + ) diff --git a/tests/test_llava.py b/tests/test_llava.py new file mode 100644 index 00000000..2be60171 --- /dev/null +++ b/tests/test_llava.py @@ -0,0 +1,80 @@ +import multiprocessing +import ctypes + +from huggingface_hub import hf_hub_download + +import pytest + +import llama_cpp + +@pytest.fixture +def mmproj_model_path(): + repo_id = "second-state/Llava-v1.5-7B-GGUF" + filename = "llava-v1.5-7b-mmproj-model-f16.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + +@pytest.fixture +def llava_cpp_model_path(): + repo_id = "second-state/Llava-v1.5-7B-GGUF" + filename = "llava-v1.5-7b-Q8_0.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + +def test_real_llava(llava_cpp_model_path, mmproj_model_path): + print("initializing model") + model = llama_cpp.Llama( + llava_cpp_model_path, + n_ctx=2048, + n_batch=512, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + verbose=False, + ) + + # Initialize the LLaVA chat handler + from llama_cpp.llama_chat_format import Llava15ChatHandler + print("initializing chat handler") + chat_handler = Llava15ChatHandler(clip_model_path=mmproj_model_path, llama_model=model) + + # Create a chat message with the image + print("creating chat message") + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": "./tests/monalisa.jpg" + }, + { + "type": "text", + "text": "Do you know who drew this painting?" + } + ] + } + ] + + # Generate response + print("generating response") + response = chat_handler( + llama=model, + messages=messages, + max_tokens=200, + temperature=0.2, + top_p=0.95, + stream=False + ) + + print("response", response) + # Check that we got a response + assert response is not None + assert "choices" in response + assert len(response["choices"]) > 0 + assert "message" in response["choices"][0] + assert "content" in response["choices"][0]["message"] + + # The response should mention Leonardo da Vinci + content = response["choices"][0]["message"]["content"].lower() + assert "leonardo" in content and "vinci" in content # Artist name should be in response diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8846aace..f13fa9b2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8846aace4934ad29651ea61b8c7e3f6b0556e3d2 +Subproject commit f13fa9b2b523e22ba58fcf4c468f670d8c98d912