diff --git a/.gitmodules b/.gitmodules index 7edf0975..5cc3e080 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git + url = http://github.com/inference-sh/llama.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b06d98b..97b46852 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21) project(llama_cpp) option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON) -option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON) +option(MTMD_BUILD "Build multimodal (mtmd) shared library and install alongside python package" ON) function(llama_cpp_python_install_target target) if(NOT TARGET ${target}) @@ -143,7 +143,7 @@ if (LLAMA_BUILD) ) endif() - if (LLAVA_BUILD) + if (MTMD_BUILD) if (LLAMA_CUBLAS OR LLAMA_CUDA) add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_USE_CUDA) @@ -153,7 +153,7 @@ if (LLAMA_BUILD) add_compile_definitions(GGML_USE_METAL) endif() - # Building llava + # Building multimodal support using mtmd add_subdirectory(vendor/llama.cpp/tools/mtmd) if (WIN32) diff --git a/examples/notebooks/Batching.ipynb b/examples/notebooks/Batching.ipynb index be7fe9b5..b1992e9d 100644 --- a/examples/notebooks/Batching.ipynb +++ b/examples/notebooks/Batching.ipynb @@ -230,7 +230,7 @@ "outputs": [], "source": [ "for i in range(n_parallel):\n", - " llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)" + " llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)" ] }, { diff --git a/llama_cpp/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py index e88ed387..032e9835 100644 --- a/llama_cpp/_ctypes_extensions.py +++ b/llama_cpp/_ctypes_extensions.py @@ -128,4 +128,4 @@ def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCD ... -byref = _byref if TYPE_CHECKING else ctypes.byref +byref = _byref if TYPE_CHECKING else ctypes.byref \ No newline at end of file diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 18d73348..3dc7a67f 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -2,6 +2,7 @@ import os import ctypes +from enum import Enum from typing import ( Dict, @@ -26,7 +27,13 @@ # Python wrappers over llama.h structs - +class LlamaBackendDev(Enum): + # CPU device using system memory + CPU = 0 + # GPU device using dedicated memory + GPU = 1 + # accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) + ACCEL = 2 class LlamaModel: """Intermediate Python wrapper for a llama.cpp llama_model. @@ -95,7 +102,13 @@ def n_ctx_train(self) -> int: return llama_cpp.llama_model_n_ctx_train(self.model) def n_embd(self) -> int: - return llama_cpp.llama_model_n_embd(self.model) + return llama_cpp.llama_n_embd(self.model) + + def n_layer(self) -> int: + return llama_cpp.llama_n_layer(self.model) + + def dev_layer(self, il: int) -> LlamaBackendDev: + return LlamaBackendDev(llama_cpp.llama_model_dev_layer(self.model, il)) def rope_freq_scale_train(self) -> float: return llama_cpp.llama_model_rope_freq_scale_train(self.model) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cdc05c7a..d6fd4862 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -299,65 +299,9 @@ def __init__( ].key = b"\0" # ensure sentinel element is zeroed self.model_params.kv_overrides = self._kv_overrides_array - self.n_batch = min(n_ctx, n_batch) # ??? 
- self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) - self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() - # Used by the sampler self._seed = seed or llama_cpp.LLAMA_DEFAULT_SEED - # Context Params - self.context_params = llama_cpp.llama_context_default_params() - self.context_params.n_ctx = n_ctx - self.context_params.n_batch = self.n_batch - self.context_params.n_ubatch = min(self.n_batch, n_ubatch) - self.context_params.n_threads = self.n_threads - self.context_params.n_threads_batch = self.n_threads_batch - self.context_params.rope_scaling_type = ( - rope_scaling_type - if rope_scaling_type is not None - else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED - ) - self.context_params.pooling_type = pooling_type - self.context_params.rope_freq_base = ( - rope_freq_base if rope_freq_base != 0.0 else 0 - ) - self.context_params.rope_freq_scale = ( - rope_freq_scale if rope_freq_scale != 0.0 else 0 - ) - self.context_params.yarn_ext_factor = ( - yarn_ext_factor if yarn_ext_factor != 0.0 else 0 - ) - self.context_params.yarn_attn_factor = ( - yarn_attn_factor if yarn_attn_factor != 0.0 else 0 - ) - self.context_params.yarn_beta_fast = ( - yarn_beta_fast if yarn_beta_fast != 0.0 else 0 - ) - self.context_params.yarn_beta_slow = ( - yarn_beta_slow if yarn_beta_slow != 0.0 else 0 - ) - self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - self._logits_all = logits_all if draft_model is None else True - self.context_params.embeddings = embedding # TODO: Rename to embeddings - self.context_params.offload_kqv = offload_kqv - self.context_params.flash_attn = flash_attn - - if op_offloat is not None: - self.context_params.op_offloat = op_offloat - - if swa_full is not None: - self.context_params.swa_full = swa_full - - # KV cache quantization - if type_k is not None: - self.context_params.type_k = type_k - if type_v is not None: - self.context_params.type_v = type_v - # Sampling Params - self.context_params.no_perf = no_perf - self.last_n_tokens_size = last_n_tokens_size - self.cache: Optional[BaseLlamaCache] = None self.lora_base = lora_base @@ -378,39 +322,45 @@ def __init__( ) ) ) - - # Override tokenizer + + self.draft_model = draft_model + + # Override tokenizer self.tokenizer_ = tokenizer or LlamaTokenizer(self) + + self._n_vocab = self.n_vocab() - # Set the default value for the context and correct the batch - if n_ctx == 0: - n_ctx = self._model.n_ctx_train() - self.n_batch = min(n_ctx, n_batch) - self.context_params.n_ctx = self._model.n_ctx_train() - self.context_params.n_batch = self.n_batch - self.context_params.n_ubatch = min(self.n_batch, n_ubatch) - - self._ctx = self._stack.enter_context( - contextlib.closing( - internals.LlamaContext( - model=self._model, - params=self.context_params, - verbose=self.verbose, - ) - ) - ) + self._token_nl = self.token_nl() + self._token_eos = self.token_eos() - self._batch = self._stack.enter_context( - contextlib.closing( - internals.LlamaBatch( - n_tokens=self.n_batch, - embd=0, - n_seq_max=self.context_params.n_ctx, - verbose=self.verbose, - ) - ) + self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) + # Context Params + self._create_context( + n_ctx=n_ctx, + n_batch=n_batch, + n_ubatch=min(n_batch, n_ubatch), + n_threads=n_threads, + n_threads_batch=n_threads_batch, + rope_scaling_type=rope_scaling_type, + pooling_type=pooling_type, + rope_freq_base=rope_freq_base, + rope_freq_scale=rope_freq_scale, + yarn_ext_factor=yarn_ext_factor, + 
yarn_attn_factor=yarn_attn_factor, + yarn_beta_fast=yarn_beta_fast, + yarn_beta_slow=yarn_beta_slow, + yarn_orig_ctx=yarn_orig_ctx, + logits_all=logits_all, + embedding=embedding, + offload_kqv=offload_kqv, + flash_attn=flash_attn, + no_perf=no_perf, + last_n_tokens_size=last_n_tokens_size, + type_k=type_k, + type_v=type_v, ) + self._lora_adapter: Optional[llama_cpp.llama_adapter_lora_p] = None if self.lora_path: @@ -447,22 +397,6 @@ def free_lora_adapter(): str, llama_chat_format.LlamaChatCompletionHandler ] = {} - self.draft_model = draft_model - - self._n_vocab = self.n_vocab() - self._n_ctx = self.n_ctx() - - self._token_nl = self.token_nl() - self._token_eos = self.token_eos() - - self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) - - self.n_tokens = 0 - self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) - self.scores: npt.NDArray[np.single] = np.ndarray( - (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single - ) - self._mirostat_mu = ctypes.c_float( 2.0 * 5.0 ) # TODO: Move this to sampling context @@ -543,7 +477,7 @@ def free_lora_adapter(): print( f"Using fallback chat format: {self.chat_format}", file=sys.stderr ) - + self._sampler = None @property @@ -553,6 +487,13 @@ def ctx(self) -> llama_cpp.llama_context_p: @property def model(self) -> llama_cpp.llama_model_p: return self._model.model + + @property + def n_layer(self) -> int: + return self._model.n_layer() + + def dev_layer(self, il: int) -> internals.LlamaBackendDev: + return self._model.dev_layer(il) @property def _input_ids(self) -> npt.NDArray[np.intc]: @@ -1041,7 +982,7 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + llama_cpp.llama_kv_self_clear(self._ctx.ctx) self._ctx.decode(self._batch) self._batch.reset() @@ -1112,7 +1053,7 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + llama_cpp.llama_kv_self_clear(self._ctx.ctx) self.reset() if return_count: @@ -1120,6 +1061,50 @@ def decode_batch(seq_sizes: List[int]): else: return output + def _create_chunk( + self, + completion_id: str, + created: int, + model_name: str, + text: str, + logprobs_or_none: Union[Optional[CompletionLogprobs], None], + index: int, + finish_reason: Union[str, None], + usage: Optional[Dict[str, Any]] = None, + ) -> CreateCompletionStreamResponse: + """Create chunks for streaming API, depending on whether usage is requested or not.""" + if usage is not None: + return { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": text, + "index": index, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason, + } + ], + "usage": usage, + } + else: + return { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": text, + "index": index, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason, + } + ], + } + def _create_completion( self, prompt: Union[str, List[int]], @@ -1446,24 +1431,20 @@ def logit_bias_processor( "top_logprobs": [top_logprob], } returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": self.detokenize( - [token], - prev_tokens=prompt_tokens - + completion_tokens[:returned_tokens], - ).decode("utf-8", 
errors="ignore"), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": None, - } - ], - } + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=self.detokenize( + [token], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], + ).decode("utf-8", errors="ignore"), + logprobs_or_none=logprobs_or_none, + index=0, + finish_reason=None, + usage=None, + ) else: while len(remaining_tokens) > 0: decode_success = False @@ -1492,20 +1473,16 @@ def logit_bias_processor( remaining_tokens = remaining_tokens[i:] returned_tokens += i - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": ts, - "index": 0, - "logprobs": None, - "finish_reason": None, - } - ], - } + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=ts, + logprobs_or_none=None, + index=0, + finish_reason=None, + usage=None, + ) if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) @@ -1584,54 +1561,51 @@ def logit_bias_processor( if token_end_position == end - 1: break returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": None, - } - ], - } + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=last_text[ + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), + logprobs_or_none=logprobs_or_none, + index=0, + finish_reason=None, + usage=None, + ) break returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" - ), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": None, - } - ], - } - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": "", - "index": 0, - "logprobs": None, - "finish_reason": finish_reason, - } - ], + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text=self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + logprobs_or_none=logprobs_or_none, + index=0, + finish_reason=None, + usage=None, + ) + + # Final streaming chunk with both finish_reason and usage + usage = { + "prompt_tokens": len(prompt_tokens), + "completion_tokens": returned_tokens, + "total_tokens": len(prompt_tokens) + returned_tokens, } + + yield self._create_chunk( + completion_id=completion_id, + created=created, + model_name=model_name, + text="", + logprobs_or_none=None, + index=0, + finish_reason=finish_reason, + usage=usage, + ) + if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) @@ -2363,6 +2337,266 @@ def from_pretrained( **kwargs, ) + def _create_context( + self, + *, + n_ctx: int = 512, + n_batch: int = 512, + n_ubatch: int = 512, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[ + int + ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + 
rope_freq_base: float = 0.0, + rope_freq_scale: float = 0.0, + yarn_ext_factor: float = -1.0, + yarn_attn_factor: float = 1.0, + yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, + logits_all: bool = False, + embedding: bool = False, + offload_kqv: bool = True, + flash_attn: bool = False, + # Sampling Params + no_perf: bool = False, + last_n_tokens_size: int = 64, + type_k: Optional[int] = None, + type_v: Optional[int] = None, + state: Optional[LlamaState] = None, + ) -> None: + """Free the existing context and create a new one with specified parameters. + + Args: + n_ctx: Text context size. If 0, value from model is used. + n_batch: Maximum batch size for llama_decode. + n_ubatch: Maximum physical batch size. + n_seq_max: Maximum number of sequences (distinct states for recurrent models). + n_threads: Number of threads to use for generation. + n_threads_batch: Number of threads to use for batch processing. + rope_scaling_type: RoPE scaling type from llama_rope_scaling_type enum. + pooling_type: Whether to pool embedding results by sequence id. + attention_type: Attention type to use for embeddings. + rope_freq_base: RoPE base frequency, 0 = from model. + rope_freq_scale: RoPE frequency scaling factor, 0 = from model. + yarn_ext_factor: YaRN extrapolation mix factor, negative = from model. + yarn_attn_factor: YaRN magnitude scaling factor. + yarn_beta_fast: YaRN low correction dim. + yarn_beta_slow: YaRN high correction dim. + yarn_orig_ctx: YaRN original context size. + defrag_thold: Defragment KV cache if holes/size > thold, < 0 disabled. + type_k: Data type for K cache. + type_v: Data type for V cache. + logits_all: Compute all logits in llama_decode (deprecated). + embeddings: Extract embeddings with logits. + offload_kqv: Offload KQV ops (including KV cache) to GPU. + flash_attn: Use flash attention. + no_perf: Disable performance timings. + last_n_tokens_size: Size of the last n tokens. + type_k: Data type for K cache. + type_v: Data type for V cache. + """ + # Create new context params with provided values + self.n_batch = min(n_ctx, n_batch) # ??? 
+ self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) + self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() + + # Context Params + self.context_params = llama_cpp.llama_context_default_params() + self.context_params.n_ctx = n_ctx + self.context_params.n_batch = self.n_batch + self.context_params.n_ubatch = min(self.n_batch, n_ubatch) + self.context_params.n_threads = self.n_threads + self.context_params.n_threads_batch = self.n_threads_batch + self.context_params.rope_scaling_type = ( + rope_scaling_type + if rope_scaling_type is not None + else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + ) + self.context_params.pooling_type = pooling_type + self.context_params.rope_freq_base = ( + rope_freq_base if rope_freq_base != 0.0 else 0 + ) + self.context_params.rope_freq_scale = ( + rope_freq_scale if rope_freq_scale != 0.0 else 0 + ) + self.context_params.yarn_ext_factor = ( + yarn_ext_factor if yarn_ext_factor != 0.0 else 0 + ) + self.context_params.yarn_attn_factor = ( + yarn_attn_factor if yarn_attn_factor != 0.0 else 0 + ) + self.context_params.yarn_beta_fast = ( + yarn_beta_fast if yarn_beta_fast != 0.0 else 0 + ) + self.context_params.yarn_beta_slow = ( + yarn_beta_slow if yarn_beta_slow != 0.0 else 0 + ) + self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 + self.context_params.logits_all = ( + logits_all if self.draft_model is None else True + ) # Must be set to True for speculative decoding + self.context_params.embeddings = embedding # TODO: Rename to embeddings + self.context_params.offload_kqv = offload_kqv + self.context_params.flash_attn = flash_attn + # KV cache quantization + if type_k is not None: + self.context_params.type_k = type_k + if type_v is not None: + self.context_params.type_v = type_v + + self.context_params.no_perf = no_perf + self.last_n_tokens_size = last_n_tokens_size + + # Store logits_all as instance attribute + self._logits_all = self.context_params.logits_all + + # Set the default value for the context and correct the batch + if n_ctx == 0: + n_ctx = self._model.n_ctx_train() + self.n_batch = min(n_ctx, n_batch) + self.context_params.n_ctx = self._model.n_ctx_train() + self.context_params.n_batch = self.n_batch + self.context_params.n_ubatch = min(self.n_batch, n_ubatch) + + self._ctx = self._stack.enter_context( + contextlib.closing( + internals.LlamaContext( + model=self._model, + params=self.context_params, + verbose=self.verbose, + ) + ) + ) + + if state is not None: + self.load_state(state) + + self._n_ctx = self.n_ctx() + self.n_tokens = 0 + + self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) + self.scores: npt.NDArray[np.single] = np.ndarray( + (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single + ) + + self._batch = self._stack.enter_context( + contextlib.closing( + internals.LlamaBatch( + n_tokens=self.n_batch, + embd=0, + n_seq_max=self.context_params.n_ctx, + verbose=self.verbose, + ) + ) + ) + + + + if self._ctx is None: + raise RuntimeError("Failed to create new context") + + def recreate_context( + self, + *, + n_ctx: int = 512, + n_batch: int = 512, + n_ubatch: int = 512, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[ + int + ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + rope_freq_base: float = 0.0, + rope_freq_scale: float = 0.0, + yarn_ext_factor: float = -1.0, + yarn_attn_factor: float = 1.0, + 
yarn_beta_fast: float = 32.0, + yarn_beta_slow: float = 1.0, + yarn_orig_ctx: int = 0, + logits_all: bool = False, + embedding: bool = False, + offload_kqv: bool = True, + flash_attn: bool = False, + # Sampling Params + no_perf: bool = False, + last_n_tokens_size: int = 64, + type_k: Optional[int] = None, + type_v: Optional[int] = None, + ) -> None: + """Free the existing context and create a new one with specified parameters. + + Args: + n_ctx: Text context size. If 0, value from model is used. + n_batch: Maximum batch size for llama_decode. + n_ubatch: Maximum physical batch size. + n_seq_max: Maximum number of sequences (distinct states for recurrent models). + n_threads: Number of threads to use for generation. + n_threads_batch: Number of threads to use for batch processing. + rope_scaling_type: RoPE scaling type from llama_rope_scaling_type enum. + pooling_type: Whether to pool embedding results by sequence id. + attention_type: Attention type to use for embeddings. + rope_freq_base: RoPE base frequency, 0 = from model. + rope_freq_scale: RoPE frequency scaling factor, 0 = from model. + yarn_ext_factor: YaRN extrapolation mix factor, negative = from model. + yarn_attn_factor: YaRN magnitude scaling factor. + yarn_beta_fast: YaRN low correction dim. + yarn_beta_slow: YaRN high correction dim. + yarn_orig_ctx: YaRN original context size. + defrag_thold: Defragment KV cache if holes/size > thold, < 0 disabled. + type_k: Data type for K cache. + type_v: Data type for V cache. + logits_all: Compute all logits in llama_decode (deprecated). + embeddings: Extract embeddings with logits. + offload_kqv: Offload KQV ops (including KV cache) to GPU. + flash_attn: Use flash attention. + no_perf: Disable performance timings. + last_n_tokens_size: Size of the last n tokens. + type_k: Data type for K cache. + type_v: Data type for V cache. 
+ """ + + current_state = None + + if self._ctx is not None: + current_state = self.save_state() + self._ctx.close() + self._ctx = None + + # Free existing context if it exists + self._create_context( + n_ctx=n_ctx, + n_batch=n_batch, + n_ubatch=min(n_batch, n_ubatch), + n_threads=n_threads, + n_threads_batch=n_threads_batch, + rope_scaling_type=rope_scaling_type, + pooling_type=pooling_type, + rope_freq_base=rope_freq_base, + rope_freq_scale=rope_freq_scale, + yarn_ext_factor=yarn_ext_factor, + yarn_attn_factor=yarn_attn_factor, + yarn_beta_fast=yarn_beta_fast, + yarn_beta_slow=yarn_beta_slow, + yarn_orig_ctx=yarn_orig_ctx, + logits_all=logits_all, + embedding=embedding, + offload_kqv=offload_kqv, + flash_attn=flash_attn, + no_perf=no_perf, + last_n_tokens_size=last_n_tokens_size, + type_k=type_k, + type_v=type_v, + state=current_state, + ) + + # Reapply any LoRA adapter if it exists + if self._lora_adapter is not None: + llama_cpp.llama_set_adapter_lora(self._ctx, self._lora_adapter, self.lora_scale) + class LlamaState: def __init__( diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a288db7b..76c8ea66 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -7,6 +7,7 @@ import dataclasses import random import string +import warnings from contextlib import ExitStack from typing import ( @@ -349,6 +350,7 @@ def _convert_text_completion_chunks_to_chat( "finish_reason": chunk["choices"][0]["finish_reason"], } ], + "usage": chunk.get("usage") if "usage" in chunk else None, } @@ -433,7 +435,7 @@ def _stream_response_to_function_stream( created = chunk["created"] model = chunk["model"] tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] - yield { + response = { "id": id_, "object": "chat.completion.chunk", "created": created, @@ -452,7 +454,11 @@ def _stream_response_to_function_stream( } ], } - yield { + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response + + response = { "id": "chat" + chunk["id"], "object": "chat.completion.chunk", "created": chunk["created"], @@ -486,10 +492,14 @@ def _stream_response_to_function_stream( } ], } + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response first = False continue + assert tool_id is not None - yield { + response = { "id": "chat" + chunk["id"], "object": "chat.completion.chunk", "created": chunk["created"], @@ -521,9 +531,12 @@ def _stream_response_to_function_stream( } ], } + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response if id_ is not None and created is not None and model is not None: - yield { + response = { "id": id_, "object": "chat.completion.chunk", "created": created, @@ -542,6 +555,9 @@ def _stream_response_to_function_stream( } ], } + if "usage" in chunk: + response["usage"] = chunk["usage"] + yield response return _stream_response_to_function_stream(chunks) @@ -2122,6 +2138,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) first = False if tools is not None: @@ -2162,6 +2179,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # Yield tool_call/function_call stop message yield llama_types.CreateChatCompletionStreamResponse( @@ -2184,6 +2202,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # If "auto" or no tool_choice/function_call elif 
isinstance(function_call, str) and function_call == "auto": @@ -2219,6 +2238,7 @@ def generate_streaming(tools, functions, function_call, prompt): "finish_reason": None, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) else: prompt += f"{function_name}\n<|content|>" @@ -2264,6 +2284,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # Generate content stops = [RECIPIENT_TOKEN, STOP_TOKEN] @@ -2301,6 +2322,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) is_end = False elif chunk["choices"][0]["text"] == "\n": @@ -2330,6 +2352,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) # Check whether the model wants to generate another turn if ( @@ -2362,6 +2385,7 @@ def generate_streaming(tools, functions, function_call, prompt): "finish_reason": "stop", } ], + usage=chunk["usage"] if "usage" in chunk else None, ) break else: @@ -2411,6 +2435,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) prompt += completion_text.strip() grammar = None @@ -2450,6 +2475,7 @@ def generate_streaming(tools, functions, function_call, prompt): }, } ], + usage=chunk["usage"] if "usage" in chunk else None, ) break @@ -2649,7 +2675,6 @@ def generate_streaming(tools, functions, function_call, prompt): usage=completion["usage"], ) - class Llava15ChatHandler: DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." 
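# --- Reviewer note (not part of the patch) ---
# The hunks above thread a "usage" payload through the streaming chat-completion
# chunks. A minimal sketch of how a caller could pick that up, assuming the
# high-level llama_cpp.Llama API; the model path below is hypothetical, and
# whether a usage block appears on the final chunk depends on the chat handler.

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # hypothetical path

usage = None
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
):
    delta = chunk["choices"][0].get("delta", {})
    if delta.get("content"):
        print(delta["content"], end="", flush=True)
    # With this patch the final streamed chunk may carry token counts.
    if chunk.get("usage"):
        usage = chunk["usage"]

if usage:
    print(f"\nprompt={usage['prompt_tokens']} completion={usage['completion_tokens']}")
# --- end reviewer note ---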
@@ -3028,11 +3053,18 @@ def _load_image(image_url: str) -> bytes: import base64 image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes - else: + elif image_url.startswith("http") or image_url.startswith("https"): import urllib.request with urllib.request.urlopen(image_url) as f: image_bytes = f.read() return image_bytes + else: + import os + if os.path.exists(image_url): + with open(image_url, "rb") as f: + image_bytes = f.read() + return image_bytes + raise ValueError(f"Image file does not exist: {image_url}") @staticmethod def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): @@ -3510,6 +3542,229 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class Gemma3ChatHandler(Llava15ChatHandler): + # Chat Format: + # 'user\n{system_prompt}\n\n{prompt}\nmodel\n' + + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = ( + "{{ '' }}" + "{%- if messages[0]['role'] == 'system' -%}" + "{%- if messages[0]['content'] is string -%}" + "{%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}" + "{%- else -%}" + "{%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}" + "{%- endif -%}" + "{%- set loop_messages = messages[1:] -%}" + "{%- else -%}" + "{%- set first_user_prefix = \"\" -%}" + "{%- set loop_messages = messages -%}" + "{%- endif -%}" + "{%- for message in loop_messages -%}" + "{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}" + "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + "{%- endif -%}" + "{%- if (message['role'] == 'assistant') -%}" + "{%- set role = \"model\" -%}" + "{%- else -%}" + "{%- set role = message['role'] -%}" + "{%- endif -%}" + "{{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] | trim }}" + "{%- elif message['content'] is iterable -%}" + "{%- for item in message['content'] -%}" + "{%- if item['type'] == 'image_url' -%}" + "{{ '' }}" + "{%- elif item['type'] == 'text' -%}" + "{{ item['text'] | trim }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- else -%}" + "{{ raise_exception(\"Invalid content type\") }}" + "{%- endif -%}" + "{{ '\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ 'model\n' }}" + "{%- endif -%}" + ) + + @staticmethod + def split_text_on_image_urls(text: str, image_urls: List[str]): + split_text: List[Tuple[Literal["text", "image_url"], str]] = [] + copied_urls = image_urls[:] + remaining = text + image_placeholder = "" + + while remaining: + # Find placeholder + pos = remaining.find(image_placeholder) + if pos != -1: + assert len(copied_urls) > 0 + if pos > 0: + split_text.append(("text", remaining[:pos])) + split_text.append(("text", "\n\n")) + split_text.append(("image_url", copied_urls.pop(0))) + split_text.append(("text", "\n\n")) + remaining = remaining[pos + len(image_placeholder):] + else: + assert len(copied_urls) == 0 + split_text.append(("text", remaining)) + remaining = "" + return split_text + + +def _accumulate_chunks( + chunks_iterator: Iterator[llama_types.CreateCompletionStreamResponse], + chunks_list: List[llama_types.CreateCompletionStreamResponse], +) -> Iterator[llama_types.CreateCompletionStreamResponse]: + for chunk in chunks_iterator: + chunks_list.append(chunk) + yield chunk + + +def _convert_chunks_to_completion( + chunks: List[llama_types.CreateCompletionStreamResponse], +) -> llama_types.CreateCompletionResponse: + """Convert a list of completion chunks to a completion.""" + # 
Accumulate completion response values + text: str = "" + finish_reason: Optional[str] = None + logprobs: Optional[llama_types.CompletionLogprobs] = None + prompt_tokens = 0 + completion_tokens = 0 + total_tokens = 0 + completion_id: Optional[str] = None + completion_model: Optional[str] = None + completion_created: Optional[int] = None + for chunk in chunks: + # Extract the id, model, and created values from the first chunk + if completion_id is None: + completion_id = chunk["id"] + completion_model = chunk["model"] + completion_created = chunk["created"] + # Extract the usage if present in the chunk + usage = chunk.get("usage") + if usage: + prompt_tokens += usage.get("prompt_tokens", 0) + completion_tokens += usage.get("completion_tokens", 0) + total_tokens += usage.get("total_tokens", 0) + # Accumulate the chunk text + choice = chunk["choices"][0] + text += choice.get("text", "") + # Extract the finish_reason and logprobs if present in the chunk + if choice.get("finish_reason"): + finish_reason = choice["finish_reason"] + if choice.get("logprobs"): + logprobs = choice["logprobs"] + # Create the completion response + completion: llama_types.CreateCompletionResponse = { + "id": completion_id or "unknown_id", + "object": "text_completion", + "created": completion_created or 0, + "model": completion_model or "unknown_model", + "choices": [ + { + "text": text, + "index": 0, + "logprobs": logprobs, # TODO: Improve accumulation of logprobs + "finish_reason": finish_reason, # type: ignore[typeddict-item] + } + ], + } + # Add usage section if present in the chunks + if (prompt_tokens + completion_tokens + total_tokens) > 0: + completion["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + } + return completion + + +def _stream_tool_calls( + llama: llama.Llama, + prompt: str, + tools: List[llama_types.ChatCompletionTool], + tool_name: str, + completion_kwargs: dict[str, Any], + follow_up_gbnf_tool_grammar: str, +) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: + # Generate a tool call completions + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + completions: List[llama_types.CreateCompletionResponse] = [] + completions_tool_name: List[str] = [] + finish_reason_chat_chunk = None + while tool is not None and len(completions) <= 16: + # Generate the parameter values for the selected tool + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + ) + except Exception as e: + warnings.warn( + f"Failed to parse function body as JSON schema, falling back to default grammar\n\n{e}", + category=RuntimeWarning, + stacklevel=2, + ) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + completion_or_chunks = llama.create_completion( + prompt=prompt, + **{ + **completion_kwargs, + "max_tokens": None, + "grammar": grammar, + }, + ) + chunks: List[llama_types.CreateCompletionResponse] = [] + chat_chunks = _convert_completion_to_chat_function( + tool_name, + _accumulate_chunks(completion_or_chunks, chunks), # type: ignore[arg-type] + stream=True, + ) + for chat_chunk in chat_chunks: + # Don't return the finish_reason chunk + if chat_chunk["choices"] and chat_chunk["choices"][0].get("finish_reason"): + finish_reason_chat_chunk = chat_chunk + break + # Update this tool call's index + if chat_chunk["choices"] and 
chat_chunk["choices"][0]["delta"].get("tool_calls"): + chat_chunk["choices"][0]["delta"]["tool_calls"][0]["index"] = len(completions) + yield chat_chunk + completion = _convert_chunks_to_completion(chunks) + completions.append(completion) + completions_tool_name.append(tool_name) + prompt += completion["choices"][0]["text"] + prompt += "\n" + # Determine whether to call another tool or stop + response = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [*completion_kwargs["stop"], ":", ""], + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), + ) + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + # Yield the finish_reason chunk + if finish_reason_chat_chunk is not None: + yield finish_reason_chat_chunk + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, @@ -3539,7 +3794,7 @@ def chatml_function_calling( grammar: Optional[llama.LlamaGrammar] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, - **kwargs, # type: ignore + **kwargs: Any, ) -> Union[ llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], @@ -3553,18 +3808,21 @@ def chatml_function_calling( "{% if tool_calls %}" "\n\nYou have access to the following functions:\n" "{% for tool in tools %}" + '\n{% if tool.function.get("description") %}/* {{ tool.function.description | trim }} */{% endif %}' "\nfunctions.{{ tool.function.name }}:\n" "{{ tool.function.parameters | tojson }}" "\n{% endfor %}" - "\n\nYou can respond to users messages with either a single message or one or more function calls." - "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\nYou must respond to user messages with either a single message or with one or more function calls." 
+ "\n\nTo respond with a message use the following format:" "\n\nmessage:" "\n" - "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" - "\n\nfunctions.:" + "\n\nTo respond with one or more function calls use the following format:" + "\n\n" + "\nfunctions.:" '\n{ "arg1": "value1", "arg2": "value2" }' "\nfunctions.:" '\n{ "arg1": "value1", "arg2": "value2" }' + "\n" "{% endif %}" "<|im_end|>\n" "{% endif %}" @@ -3575,7 +3833,7 @@ def chatml_function_calling( "{% endif %}" # Assistant message "{% if message.role == 'assistant' %}" - ## Reglar message + ## Regular message "{% if message.content and message.content | length > 0 %}" "{% if tool_calls %}" "message:\n" @@ -3602,35 +3860,55 @@ def chatml_function_calling( # Convert legacy functions to tools if functions is not None: - tools = [ - { - "type": "function", - "function": function, - } - for function in functions - ] + tools = [{"type": "function", "function": function} for function in functions] # Convert legacy function_call to tool_choice if function_call is not None: - if isinstance(function_call, str) and ( - function_call == "none" or function_call == "auto" - ): + if isinstance(function_call, str) and (function_call in ("none", "auto")): tool_choice = function_call if isinstance(function_call, dict) and "name" in function_call: - tool_choice = { - "type": "function", - "function": { - "name": function_call["name"], - }, - } + tool_choice = {"type": "function", "function": {"name": function_call["name"]}} + # Collect the llama.create_completion keyword arguments so we don't have to repeat these with + # each completion call stop = ( [stop, "<|im_end|>"] if isinstance(stop, str) - else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + else [*stop, "<|im_end|>"] + if stop + else ["<|im_end|>"] ) + grammar = ( # It is assumed the grammar applies to messages only, not tool calls + grammar + if grammar is not None + else ( + _grammar_for_response_format(response_format) + if response_format is not None and response_format["type"] == "json_object" + else None + ) + ) + completion_kwargs = { + "temperature": temperature, + "top_p": top_p, + "top_k": top_k, + "min_p": min_p, + "typical_p": typical_p, + "stream": stream, + "stop": stop, + "max_tokens": max_tokens, + "presence_penalty": presence_penalty, + "frequency_penalty": frequency_penalty, + "repeat_penalty": repeat_penalty, + "tfs_z": tfs_z, + "mirostat_mode": mirostat_mode, + "mirostat_tau": mirostat_tau, + "mirostat_eta": mirostat_eta, + "model": model, + "logits_processor": logits_processor, + "grammar": grammar, + } - # Case 1: No tool choice by user + # Case 1: No tool use if ( tool_choice is None or (isinstance(tool_choice, str) and tool_choice == "none") @@ -3638,316 +3916,526 @@ def chatml_function_calling( or len(tools) == 0 ): prompt = template_renderer.render( - messages=messages, - tools=[], - tool_calls=None, - add_generation_prompt=True, + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True ) - - if response_format is not None and response_format["type"] == "json_object": - grammar = _grammar_for_response_format(response_format) - return _convert_completion_to_chat( llama.create_completion( prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - 
mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, + **completion_kwargs, # type: ignore[arg-type] logprobs=top_logprobs if logprobs else None, ), stream=stream, ) - # Case 2: Tool choice by user + # Ensure there is a system prompt to attach the tool metadata to + if not any(message["role"] == "system" for message in messages): + messages = [*messages, {"role": "system", "content": ""}] + + # Case 2: Automatic or fixed tool choice + # Case 2 step 1: Determine whether to respond with a message or a tool call + assert (isinstance(tool_choice, str) and tool_choice == "auto") or isinstance(tool_choice, dict) if isinstance(tool_choice, dict): - tool_name = tool_choice["function"]["name"] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None + tools = [t for t in tools if t["function"]["name"] == tool_choice["function"]["name"]] + assert tools + function_names = " | ".join([f'''"functions.{t['function']['name']}:"''' for t in tools]) + prompt = template_renderer.render( + messages=messages, tools=tools, tool_calls=True, add_generation_prompt=True + ) + initial_gbnf_tool_grammar = ( + ( + 'root ::= "" "\\n" functions | "message:"\n' + f"functions ::= {function_names}\n" ) - if tool is None: - raise ValueError(f"Tool with name '{tool_name}' not found in tools") + if tool_choice == "auto" + else f'root ::= "" "\\n" functions\nfunctions ::= {function_names}\n' + ) + completion = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [":"], + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + initial_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), + ) + text = completion["choices"][0]["text"] + tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :] + + # Case 2 step 2A: Respond with a message + if tool_name is None: prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True + ) + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + **completion_kwargs, # type: ignore[arg-type] + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, + ) + + # Case 2 step 2B: One or more function calls + follow_up_gbnf_tool_grammar = ( + 'root ::= functions | "" | "<|im_end|>"\n' + f"functions ::= {function_names}\n" + ) + prompt += "\n" + if stream: + return _stream_tool_calls( + llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar ) + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + completions: List[llama_types.CreateCompletionResponse] = [] + completions_tool_name: List[str] = [] + while tool is not None and len(completions) <= 16: + # Generate the parameter values for the selected tool prompt += f"functions.{tool_name}:\n" try: grammar = llama_grammar.LlamaGrammar.from_json_schema( json.dumps(tool["function"]["parameters"]), verbose=llama.verbose ) except Exception as e: + warnings.warn( + f"Failed to parse function body as JSON schema, falling back to default grammar\n\n{e}", + category=RuntimeWarning, + stacklevel=2, + ) grammar = llama_grammar.LlamaGrammar.from_string( llama_grammar.JSON_GBNF, verbose=llama.verbose ) - if 
llama.verbose: - print( - "Failed to parse function body as JSON schema, falling back to default grammar" - ) - print(e) completion_or_chunks = llama.create_completion( prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=stop, - max_tokens=max_tokens, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, + **{ # type: ignore[arg-type] + **completion_kwargs, + "max_tokens": None, + "grammar": grammar, + }, ) - return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream + completion = cast(llama_types.CreateCompletionResponse, completion_or_chunks) + completions.append(completion) + completions_tool_name.append(tool_name) + prompt += completion["choices"][0]["text"] + prompt += "\n" + # Determine whether to call another tool or stop + response = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [*completion_kwargs["stop"], ":", ""], # type: ignore[misc] + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), ) + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + # Merge the completions into a single chat completion + chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] + ), + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"], + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + for i, (tool_name, completion) in enumerate( + zip(completions_tool_name, completions) + ) + ], + }, + } + ], + "usage": { + "completion_tokens": sum( + (completion["usage"]["completion_tokens"] if "usage" in completion else 0) + for completion in completions + ), + "prompt_tokens": sum( + completion["usage"]["prompt_tokens"] if "usage" in completion else 0 + for completion in completions + ), + "total_tokens": sum( + completion["usage"]["total_tokens"] if "usage" in completion else 0 + for completion in completions + ), + }, + } + if len(completions) == 1: + single_function_call: llama_types.ChatCompletionResponseFunctionCall = { + "name": tool_name, + "arguments": completions[0]["choices"][0]["text"], + } + chat_completion["choices"][0]["message"]["function_call"] = single_function_call + return chat_completion + + +@register_chat_completion_handler("gguf-function-calling") +def gguf_function_calling( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + 
tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + **kwargs: Any, +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + + function_calling_template = None + if hasattr(llama, 'model_path'): + metadata = llama.metadata + if metadata and "tokenizer.chat_template" in metadata: + function_calling_template = metadata["tokenizer.chat_template"] + - # Case 3: Automatic tool choice - assert isinstance(tool_choice, str) and tool_choice == "auto" - function_names = " | ".join( - [f'''"functions.{tool['function']['name']}:"''' for tool in tools] + function_calling_template = ( + "{% for message in messages %}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + '\n{% if tool.function.get("description") %}/* {{ tool.function.description | trim }} */{% endif %}' + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\nYou must respond to user messages with either a single message or with one or more function calls." 
+ "\n\nTo respond with a message use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls use the following format:" + "\n\n" + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\n" + "{% endif %}" + "<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Regular message + "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" + "message:\n" + "{% endif %}" + "{{ message.content }}" + "<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if 'tool_calls' in message %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" + "{% endfor %}" + "<|im_end|>\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) - initial_gbnf_tool_grammar = ( - """root ::= functions | "message:"\n""" - f"""functions ::= {function_names}\n""" + template_renderer = ImmutableSandboxedEnvironment( + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) + + # Convert legacy functions to tools + if functions is not None: + tools = [{"type": "function", "function": function} for function in functions] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and (function_call in ("none", "auto")): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = {"type": "function", "function": {"name": function_call["name"]}} + + # Collect the llama.create_completion keyword arguments so we don't have to repeat these with + # each completion call + stop = ( + [stop, "<|im_end|>"] + if isinstance(stop, str) + else [*stop, "<|im_end|>"] + if stop + else ["<|im_end|>"] ) - follow_up_gbnf_tool_grammar = ( - """root ::= functions | "<|im_end|>"\n""" - f"""functions ::= {function_names}\n""" + grammar = ( # It is assumed the grammar applies to messages only, not tool calls + grammar + if grammar is not None + else ( + _grammar_for_response_format(response_format) + if response_format is not None and response_format["type"] == "json_object" + else None + ) ) + completion_kwargs = { + "temperature": temperature, + "top_p": top_p, + "top_k": top_k, + "min_p": min_p, + "typical_p": typical_p, + "stream": stream, + "stop": stop, + "max_tokens": max_tokens, + "presence_penalty": presence_penalty, + "frequency_penalty": frequency_penalty, + "repeat_penalty": repeat_penalty, + "tfs_z": tfs_z, + "mirostat_mode": mirostat_mode, + "mirostat_tau": mirostat_tau, + "mirostat_eta": mirostat_eta, + "model": model, + "logits_processor": logits_processor, + "grammar": grammar, + } + + # Case 1: No tool use + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True + ) + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + **completion_kwargs, # type: ignore[arg-type] + logprobs=top_logprobs if logprobs else None, + ), + stream=stream, + ) + + # Ensure there is a system prompt to attach the tool 
metadata to + if not any(message["role"] == "system" for message in messages): + messages = [*messages, {"role": "system", "content": ""}] + + # Case 2: Automatic or fixed tool choice + # Case 2 step 1: Determine whether to respond with a message or a tool call + assert (isinstance(tool_choice, str) and tool_choice == "auto") or isinstance(tool_choice, dict) + if isinstance(tool_choice, dict): + tools = [t for t in tools if t["function"]["name"] == tool_choice["function"]["name"]] + assert tools + function_names = " | ".join([f'''"functions.{t['function']['name']}:"''' for t in tools]) prompt = template_renderer.render( - messages=messages, - tools=tools, - tool_calls=True, - add_generation_prompt=True, + messages=messages, tools=tools, tool_calls=True, add_generation_prompt=True ) - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=0, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=False, - stop=[":"], - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=llama_grammar.LlamaGrammar.from_string( - initial_gbnf_tool_grammar, verbose=llama.verbose + initial_gbnf_tool_grammar = ( + ( + 'root ::= "" "\\n" functions | "message:"\n' + f"functions ::= {function_names}\n" + ) + if tool_choice == "auto" + else f'root ::= "" "\\n" functions\nfunctions ::= {function_names}\n' + ) + completion = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [":"], + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + initial_gbnf_tool_grammar, verbose=llama.verbose + ), + }, ), ) - completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore text = completion["choices"][0]["text"] - if "message" in text: + tool_name = None if text.startswith("message") else text.split("\n")[-1][len("functions.") :] + + # Case 2 step 2A: Respond with a message + if tool_name is None: + prompt = template_renderer.render( + messages=messages, tools=[], tool_calls=None, add_generation_prompt=True + ) return _convert_completion_to_chat( llama.create_completion( - prompt=prompt + "message:\n", - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=stream, - stop=["<|im_end|>"], + prompt=prompt, + **completion_kwargs, # type: ignore[arg-type] logprobs=top_logprobs if logprobs else None, - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=llama_grammar.LlamaGrammar.from_string( - follow_up_gbnf_tool_grammar, verbose=llama.verbose - ), ), stream=stream, ) - # One or more function calls - tool_name = text[len("functions.") :] + # Case 2 step 2B: One or more function calls + follow_up_gbnf_tool_grammar = ( + 'root ::= functions | "" | "<|im_end|>"\n' + f"functions ::= {function_names}\n" + ) + prompt += "\n" + if stream: + return _stream_tool_calls( + llama, prompt, tools, tool_name, completion_kwargs, follow_up_gbnf_tool_grammar + ) tool = next((tool for tool in tools if 
tool["function"]["name"] == tool_name), None) - if not stream: - completions: List[llama_types.CreateCompletionResponse] = [] - completions_tool_name: List[str] = [] - while tool is not None: - prompt += f"functions.{tool_name}:\n" - try: - grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(tool["function"]["parameters"]), verbose=llama.verbose - ) - except Exception as e: - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose - ) - if llama.verbose: - print( - "Failed to parse function body as JSON schema, falling back to default grammar" - ) - print(e) - completion_or_chunks = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=False, - stop=stop, - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=grammar, - ) - completion_or_chunks = cast( - llama_types.CreateCompletionResponse, completion_or_chunks + completions: List[llama_types.CreateCompletionResponse] = [] + completions_tool_name: List[str] = [] + while tool is not None and len(completions) <= 16: + # Generate the parameter values for the selected tool + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose ) - completions.append(completion_or_chunks) - completions_tool_name.append(tool_name) - prompt += completion_or_chunks["choices"][0]["text"] - prompt += "\n" - - response = llama.create_completion( - prompt=prompt, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - typical_p=typical_p, - stream=False, - stop=stop, - max_tokens=None, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - repeat_penalty=repeat_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - grammar=llama_grammar.LlamaGrammar.from_string( - follow_up_gbnf_tool_grammar, verbose=llama.verbose - ), + except Exception as e: + warnings.warn( + f"Failed to parse function body as JSON schema, falling back to default grammar\n\n{e}", + category=RuntimeWarning, + stacklevel=2, ) - response = cast(llama_types.CreateCompletionResponse, response) - - tool_name = response["choices"][0]["text"][len("functions.") :] - tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose ) - - # Merge completions - function_call_dict: Union[ - Dict[str, str], - Dict[ - Literal["function_call"], - llama_types.ChatCompletionRequestAssistantMessageFunctionCall, - ], - ] = ( - { - "function_call": { - "name": tool_name, - "arguments": completions[0]["choices"][0]["text"], - } - } - if len(completions) == 1 - else {} + completion_or_chunks = llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "max_tokens": None, + "grammar": grammar, + }, ) - return { - "id": "chat" + completion["id"], - "object": "chat.completion", - "created": completion["created"], - "model": completion["model"], - "choices": [ - { - "finish_reason": "tool_calls", - "index": 0, - 
"logprobs": _convert_text_completion_logprobs_to_chat(completion["choices"][0]["logprobs"]), - "message": { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": "call_" - + f"_{i}_" - + tool_name - + "_" - + completion["id"], - "type": "function", - "function": { - "name": tool_name, - "arguments": completion["choices"][0]["text"], - }, - } - for i, (tool_name, completion) in enumerate( - zip(completions_tool_name, completions) - ) - ], - **function_call_dict, - }, - } - ], - "usage": { - "completion_tokens": sum( - ( - completion["usage"]["completion_tokens"] - if "usage" in completion - else 0 - ) - for completion in completions - ), - "prompt_tokens": sum( - completion["usage"]["prompt_tokens"] if "usage" in completion else 0 - for completion in completions - ), - "total_tokens": sum( - completion["usage"]["total_tokens"] if "usage" in completion else 0 - for completion in completions + completion = cast(llama_types.CreateCompletionResponse, completion_or_chunks) + completions.append(completion) + completions_tool_name.append(tool_name) + prompt += completion["choices"][0]["text"] + prompt += "\n" + # Determine whether to call another tool or stop + response = cast( + llama_types.CreateCompletionResponse, + llama.create_completion( + prompt=prompt, + **{ # type: ignore[arg-type] + **completion_kwargs, + "temperature": 0, + "stream": False, + "stop": [*completion_kwargs["stop"], ":", ""], # type: ignore[misc] + "max_tokens": None, + "grammar": llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + }, + ), + ) + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + # Merge the completions into a single chat completion + chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "logprobs": _convert_text_completion_logprobs_to_chat( + completion["choices"][0]["logprobs"] ), - }, + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_" + f"_{i}_" + tool_name + "_" + completion["id"], + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + for i, (tool_name, completion) in enumerate( + zip(completions_tool_name, completions) + ) + ], + }, + } + ], + "usage": { + "completion_tokens": sum( + (completion["usage"]["completion_tokens"] if "usage" in completion else 0) + for completion in completions + ), + "prompt_tokens": sum( + completion["usage"]["prompt_tokens"] if "usage" in completion else 0 + for completion in completions + ), + "total_tokens": sum( + completion["usage"]["total_tokens"] if "usage" in completion else 0 + for completion in completions + ), + }, + } + if len(completions) == 1: + single_function_call: llama_types.ChatCompletionResponseFunctionCall = { + "name": tool_name, + "arguments": completions[0]["choices"][0]["text"], } - - raise ValueError("Automatic streaming tool choice is not supported") + chat_completion["choices"][0]["message"]["function_call"] = single_function_call + return chat_completion diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d13d6045..2c625505 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -10,6 +10,7 @@ NewType, Optional, TYPE_CHECKING, + List, ) from 
llama_cpp._ctypes_extensions import (
@@ -654,6 +655,12 @@ class llama_model_kv_override(ctypes.Structure):
#     ggml_backend_buffer_type_t buft;
# };
+class llama_model_tensor_buft_override(ctypes.Structure):
+    _fields_ = [
+        ("pattern", ctypes.c_char_p),
+        ("buft", ctypes.c_void_p),  # ggml_backend_buffer_type_t is an opaque pointer, not an int
+    ]
+

# struct llama_model_params {
#     // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
@@ -785,7 +792,8 @@ class llama_model_params(ctypes.Structure):
#     // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
# };
class llama_context_params(ctypes.Structure):
-    """Parameters for llama_context
+    """Parameters for llama_context. NOTE: changing the default values of parameters marked as [EXPERIMENTAL]
+    may cause crashes or incorrect results in certain configurations.

    Attributes:
        n_ctx (int): text context, 0 = from model
@@ -795,7 +803,7 @@ class llama_context_params(ctypes.Structure):
        n_threads (int): number of threads to use for generation
        n_threads_batch (int): number of threads to use for batch processing
        rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
-        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        pooling_type (int): whether to pool (sum) embedding results by sequence id
        attention_type (int): attention type to use for embeddings
        rope_freq_base (float): RoPE base frequency, 0 = from model
        rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
@@ -1428,6 +1436,11 @@ def llama_model_n_embd(model: llama_model_p, /) -> int:
def llama_model_n_layer(model: llama_model_p, /) -> int:
    ...

+# LLAMA_API int32_t llama_model_dev_layer (const struct llama_model * model, int32_t il);
+@ctypes_function("llama_model_dev_layer", [llama_model_p_ctypes, ctypes.c_int32], ctypes.c_int32)
+def llama_model_dev_layer(model: llama_model_p, il: Union[ctypes.c_int32, int], /) -> int:
+    ...
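# Illustrative sketch (not part of the patch): the llama_model_dev_layer binding
# added above can be combined with llama_model_n_layer to inspect which backend
# device each layer ended up on after loading. The model path and the
# n_gpu_layers value are placeholders, and a GPU-enabled build is assumed for
# any non-CPU placement.
import llama_cpp

llama_cpp.llama_backend_init()
params = llama_cpp.llama_model_default_params()
params.n_gpu_layers = 16  # assumption: offload the first 16 layers when a GPU backend is present
model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", params)
assert model is not None  # the placeholder path must point at a real GGUF file
try:
    for il in range(llama_cpp.llama_model_n_layer(model)):
        dev = llama_cpp.llama_model_dev_layer(model, il)  # integer backend device type for layer il
        print(f"layer {il}: device {dev}")
finally:
    llama_cpp.llama_free_model(model)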
+ # LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index f647822f..67772b8e 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -154,13 +154,13 @@ class ChatCompletionStreamResponseChoice(TypedDict): finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] logprobs: NotRequired[Optional[ChatCompletionLogprobs]] - class CreateChatCompletionStreamResponse(TypedDict): id: str model: str object: Literal["chat.completion.chunk"] created: int choices: List[ChatCompletionStreamResponseChoice] + usage: NotRequired[CompletionUsage] class ChatCompletionFunctions(TypedDict): diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py deleted file mode 100644 index d9dfaf5f..00000000 --- a/llama_cpp/llava_cpp.py +++ /dev/null @@ -1,158 +0,0 @@ -from __future__ import annotations - -import os -from ctypes import ( - c_bool, - c_char_p, - c_int, - c_uint8, - c_float, - c_void_p, - POINTER, - _Pointer, # type: ignore - Structure, -) -import pathlib -from typing import ( - Union, - NewType, - Optional, - TYPE_CHECKING, -) - -import llama_cpp.llama_cpp as llama_cpp - -from llama_cpp._ctypes_extensions import ( - load_shared_library, - ctypes_function_for_shared_library, -) - -if TYPE_CHECKING: - from llama_cpp._ctypes_extensions import ( - CtypesArray, - ) - - -# Specify the base name of the shared library to load -_libllava_base_name = "llava" -_libllava_override_path = os.environ.get("LLAVA_CPP_LIB") -_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path() - -# Load the library -_libllava = load_shared_library(_libllava_base_name, _libllava_base_path) - -ctypes_function = ctypes_function_for_shared_library(_libllava) - - -################################################ -# llava.h -################################################ - -# struct clip_ctx; -clip_ctx_p = NewType("clip_ctx_p", int) -clip_ctx_p_ctypes = c_void_p - - -# struct llava_image_embed { -# float * embed; -# int n_image_pos; -# }; -class llava_image_embed(Structure): - _fields_ = [ - ("embed", POINTER(c_float)), - ("n_image_pos", c_int), - ] - - -# /** sanity check for clip <-> llava embed size match */ -# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); -@ctypes_function( - "llava_validate_embed_size", - [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], - c_bool, -) -def llava_validate_embed_size( - ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / -) -> bool: - ... - - -# /** build an image embed from image file bytes */ -# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); -@ctypes_function( - "llava_image_embed_make_with_bytes", - [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], - POINTER(llava_image_embed), -) -def llava_image_embed_make_with_bytes( - ctx_clip: clip_ctx_p, - n_threads: Union[c_int, int], - image_bytes: CtypesArray[c_uint8], - image_bytes_length: Union[c_int, int], - /, -) -> "_Pointer[llava_image_embed]": - ... 
- - -# /** build an image embed from a path to an image filename */ -# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -@ctypes_function( - "llava_image_embed_make_with_filename", - [clip_ctx_p_ctypes, c_int, c_char_p], - POINTER(llava_image_embed), -) -def llava_image_embed_make_with_filename( - ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / -) -> "_Pointer[llava_image_embed]": - ... - - -# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); -# /** free an embedding made with llava_image_embed_make_* */ -@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) -def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): - ... - - -# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ -# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); -@ctypes_function( - "llava_eval_image_embed", - [ - llama_cpp.llama_context_p_ctypes, - POINTER(llava_image_embed), - c_int, - POINTER(c_int), - ], - c_bool, -) -def llava_eval_image_embed( - ctx_llama: llama_cpp.llama_context_p, - embed: "_Pointer[llava_image_embed]", - n_batch: Union[c_int, int], - n_past: "_Pointer[c_int]", - /, -) -> bool: - ... - - -################################################ -# clip.h -################################################ - - -# /** load mmproj model */ -# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); -@ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) -def clip_model_load( - fname: bytes, verbosity: Union[c_int, int], / -) -> Optional[clip_ctx_p]: - ... - - -# /** free mmproj model */ -# CLIP_API void clip_free(struct clip_ctx * ctx); -@ctypes_function("clip_free", [clip_ctx_p_ctypes], None) -def clip_free(ctx: clip_ctx_p, /): - ... 
- diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index a45f8f40..0c641ad4 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -1,10 +1,12 @@ from __future__ import annotations import os +import ctypes from ctypes import ( c_bool, c_char_p, c_int, + c_int32, c_uint8, c_uint32, c_float, @@ -17,6 +19,7 @@ ) import pathlib from typing import ( + List, Union, NewType, Optional, @@ -31,19 +34,161 @@ ) if TYPE_CHECKING: + from llama_cpp.llama_types import ( + llama_token, + llama_pos, + ) from llama_cpp._ctypes_extensions import ( CtypesArray, + CtypesPointer, ) +# Define input text structure +class mtmd_input_text(Structure): + _fields_ = [ + ("text", c_char_p), + ("add_special", c_bool), + ("parse_special", c_bool), + ] + +# Define context parameters structure +class mtmd_context_params(Structure): + _fields_ = [ + ("use_gpu", c_bool), + ("print_timings", c_bool), + ("n_threads", c_int), + ("verbosity", c_int), + ("image_marker", c_char_p), # const char* + ("media_marker", c_char_p), # const char* + ] + +# Define input chunk type enum +mtmd_input_chunk_type = c_int +( + MTMD_INPUT_CHUNK_TYPE_TEXT, + MTMD_INPUT_CHUNK_TYPE_IMAGE, + MTMD_INPUT_CHUNK_TYPE_AUDIO, +) = (0, 1, 2) + +# Define slice template enum +mtmd_slice_tmpl = c_int +( + MTMD_SLICE_TMPL_NONE, + MTMD_SLICE_TMPL_MINICPMV_2_5, + MTMD_SLICE_TMPL_MINICPMV_2_6, + MTMD_SLICE_TMPL_LLAMA4, +) = (0, 1, 2, 3) + +# Define whisper filters structure +class whisper_filters(Structure): + _fields_ = [ + ("n_mel", c_int), + ] + +# Define mtmd_context structure +class mtmd_context(Structure): + _fields_ = [ + ("ctx_v", c_void_p), # clip_ctx* + ("ctx_a", c_void_p), # clip_ctx* + ("text_model", c_void_p), # const llama_model* + ("image_embd_v", POINTER(c_float)), # std::vector + ("print_timings", c_bool), + ("n_threads", c_int), + ("media_marker", c_char_p), # std::string + ("n_embd_text", c_int), + ("img_beg", c_char_p), # std::string + ("img_end", c_char_p), # std::string + ("aud_beg", c_char_p), # std::string + ("aud_end", c_char_p), # std::string + ("slice_tmpl", c_int), # mtmd_slice_tmpl + ("tok_ov_img_start", llama_cpp.llama_token), + ("tok_ov_img_end", llama_cpp.llama_token), + ("tok_slices_start", llama_cpp.llama_token), + ("tok_slices_end", llama_cpp.llama_token), + ("tok_sli_img_start", llama_cpp.llama_token), + ("tok_sli_img_end", llama_cpp.llama_token), + ("tok_sli_img_mid", llama_cpp.llama_token), + ("tok_row_end", llama_cpp.llama_token), + ("tok_row_end_trail", c_bool), + ("ov_img_first", c_bool), + ("use_mrope", c_bool), + ("w_filters", whisper_filters), + ] + +# Define bitmap structure +class mtmd_bitmap(Structure): + _fields_ = [ + ("nx", c_uint32), + ("ny", c_uint32), + ("data", POINTER(c_uint8)), # Vector represented as pointer + ("id", c_char_p), + ("is_audio", c_bool), + ] + +# Define image tokens structure +class mtmd_image_tokens(Structure): + _fields_ = [ + ("nx", c_uint32), + ("ny", c_uint32), + ("use_mrope_pos", c_bool), + ("batch_f32", c_void_p), # clip_image_f32_batch + ("id", c_char_p), + ] -# Specify the base name of the shared library to load +# Define audio tokens structure +class mtmd_audio_tokens(Structure): + _fields_ = [ + ("n_tokens", c_uint32), + ("batch_f32", c_void_p), # clip_image_f32_batch + ("id", c_char_p), + ] + +# Define input chunk structure +class mtmd_input_chunk(Structure): + _fields_ = [ + ("type", mtmd_input_chunk_type), + ("tokens_text", POINTER(llama_cpp.llama_token)), # Vector represented as pointer + ("tokens_image", c_void_p), # mtmd_image_tokens_ptr + ("tokens_audio", 
c_void_p), # mtmd_audio_tokens_ptr + ] + +# Define input chunks structure +class mtmd_input_chunks(Structure): + _fields_ = [ + ("entries", POINTER(mtmd_input_chunk)), # Vector represented as pointer + ] + +# Define context pointer type +mtmd_context_p = NewType("mtmd_context_p", int) +mtmd_context_p_ctypes = c_void_p + +# Define bitmap pointer type +mtmd_bitmap_p = NewType("mtmd_bitmap_p", int) +mtmd_bitmap_p_ctypes = c_void_p + +# Define input chunks pointer type +mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int) +mtmd_input_chunks_p_ctypes = c_void_p + +# Define input chunk pointer type +mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int) +mtmd_input_chunk_p_ctypes = c_void_p + +# Define image tokens pointer type +mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int) +mtmd_image_tokens_p_ctypes = c_void_p + +# Define audio tokens pointer type +mtmd_audio_tokens_p = NewType("mtmd_audio_tokens_p", int) +mtmd_audio_tokens_p_ctypes = c_void_p + +# Load the library _libmtmd_base_name = "mtmd" _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB") _libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path() # Load the library _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path) - ctypes_function = ctypes_function_for_shared_library(_libmtmd) ################################################ diff --git a/pyproject.toml b/pyproject.toml index 9983ef77..1f0aab57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,8 @@ test = [ "sse-starlette>=1.6.1", "starlette-context>=0.3.6,<0.4", "pydantic-settings>=2.0.1", - "huggingface-hub>=0.23.0" + "huggingface-hub>=0.23.0", + "typeguard>=4.2.1", ] dev = [ "black>=23.3.0", diff --git a/tests/monalisa.jpg b/tests/monalisa.jpg new file mode 100644 index 00000000..782ee4f9 Binary files /dev/null and b/tests/monalisa.jpg differ diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72..42bbac1f 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -1,14 +1,29 @@ import json +import os +import platform +from collections.abc import Iterator +from typing import cast +import pytest import jinja2 +from typeguard import ForwardRefPolicy, check_type from llama_cpp import ( ChatCompletionRequestUserMessage, + Llama, + llama_chat_format, + llama_supports_gpu_offload, + llama_types ) -import llama_cpp.llama_types as llama_types -import llama_cpp.llama_chat_format as llama_chat_format - from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter +from llama_cpp.llama_types import ( + ChatCompletionRequestMessage, + ChatCompletionTool, + ChatCompletionToolChoiceOption, + CreateChatCompletionResponse, + CreateChatCompletionStreamResponse, +) + def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" @@ -87,3 +102,118 @@ def test_hf_tokenizer_config_str_to_chat_formatter(): ) assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! 
[/INST]" "") + + +def is_accelerator_available() -> bool: + """Check if an accelerator is available.""" + return llama_supports_gpu_offload() or (os.cpu_count() or 1) >= 8 + + +@pytest.mark.parametrize( + "stream", + [ + pytest.param(True, id="stream=True"), + pytest.param(False, id="stream=False"), + ], +) +@pytest.mark.parametrize( + "tool_choice", + [ + pytest.param("none", id="tool_choice=none"), + pytest.param("auto", id="tool_choice=auto"), + pytest.param( + {"type": "function", "function": {"name": "get_weather"}}, id="tool_choice=fixed" + ), + ], +) +@pytest.mark.parametrize( + "user_prompt_expected_tool_calls", + [ + pytest.param( + ("Is 7 a prime number?", 0), + id="expected_tool_calls=0", + ), + pytest.param( + ("What's the weather like in Paris today?", 1), + id="expected_tool_calls=1", + ), + pytest.param( + ("What's the weather like in Paris today? What about New York?", 2), + id="expected_tool_calls=2", + ), + ], +) +@pytest.mark.parametrize( + "llm_repo_id", + [ + pytest.param("bartowski/Llama-3.2-3B-Instruct-GGUF", id="llama_3.2_3B"), + pytest.param( + "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", + id="llama_3.1_8B", + marks=pytest.mark.skipif( + not is_accelerator_available(), reason="Accelerator not available" + ), + ), + ], +) +@pytest.mark.skipif( + platform.system() == "Darwin" and (os.cpu_count() or 1) < 8, + reason="Insufficient resources on macOS", +) +def test_llama_cpp_python_tool_use( + llm_repo_id: str, + user_prompt_expected_tool_calls: tuple[str, int], + tool_choice: ChatCompletionToolChoiceOption, + stream: bool, +) -> None: + """Test the upgraded chatml-function-calling llama-cpp-python chat handler.""" + user_prompt, expected_tool_calls = user_prompt_expected_tool_calls + if isinstance(tool_choice, dict) and expected_tool_calls == 0: + pytest.skip("Nonsensical") + llm = Llama.from_pretrained( + repo_id=llm_repo_id, + filename="*Q4_K_M.gguf", + n_ctx=4096, + n_gpu_layers=-1, + verbose=False, + chat_format="chatml-function-calling", + ) + messages: list[ChatCompletionRequestMessage] = [{"role": "user", "content": user_prompt}] + tools: list[ChatCompletionTool] = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather for a location.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "A city name."}}, + }, + }, + } + ] + response = llm.create_chat_completion( + messages=messages, tools=tools, tool_choice=tool_choice, stream=stream + ) + if stream: + response = cast(Iterator[CreateChatCompletionStreamResponse], response) + num_tool_calls = 0 + for chunk in response: + check_type(chunk, CreateChatCompletionStreamResponse) + tool_calls = chunk["choices"][0]["delta"].get("tool_calls") + if isinstance(tool_calls, list): + num_tool_calls = max(tool_call["index"] for tool_call in tool_calls) + 1 + assert num_tool_calls == (expected_tool_calls if tool_choice != "none" else 0) + else: + response = cast(CreateChatCompletionResponse, response) + check_type( + response, CreateChatCompletionResponse, forward_ref_policy=ForwardRefPolicy.IGNORE + ) + if expected_tool_calls == 0 or tool_choice == "none": + assert response["choices"][0]["message"].get("tool_calls") is None + else: + assert len(response["choices"][0]["message"]["tool_calls"]) == expected_tool_calls + assert all( + tool_call["function"]["name"] == tools[0]["function"]["name"] + for tool_call in response["choices"][0]["message"]["tool_calls"] + ) diff --git a/tests/test_llava.py b/tests/test_llava.py new file mode 
100644 index 00000000..2be60171 --- /dev/null +++ b/tests/test_llava.py @@ -0,0 +1,80 @@ +import multiprocessing +import ctypes + +from huggingface_hub import hf_hub_download + +import pytest + +import llama_cpp + +@pytest.fixture +def mmproj_model_path(): + repo_id = "second-state/Llava-v1.5-7B-GGUF" + filename = "llava-v1.5-7b-mmproj-model-f16.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + +@pytest.fixture +def llava_cpp_model_path(): + repo_id = "second-state/Llava-v1.5-7B-GGUF" + filename = "llava-v1.5-7b-Q8_0.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + +def test_real_llava(llava_cpp_model_path, mmproj_model_path): + print("initializing model") + model = llama_cpp.Llama( + llava_cpp_model_path, + n_ctx=2048, + n_batch=512, + n_threads=multiprocessing.cpu_count(), + n_threads_batch=multiprocessing.cpu_count(), + logits_all=False, + verbose=False, + ) + + # Initialize the LLaVA chat handler + from llama_cpp.llama_chat_format import Llava15ChatHandler + print("initializing chat handler") + chat_handler = Llava15ChatHandler(clip_model_path=mmproj_model_path, llama_model=model) + + # Create a chat message with the image + print("creating chat message") + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": "./tests/monalisa.jpg" + }, + { + "type": "text", + "text": "Do you know who drew this painting?" + } + ] + } + ] + + # Generate response + print("generating response") + response = chat_handler( + llama=model, + messages=messages, + max_tokens=200, + temperature=0.2, + top_p=0.95, + stream=False + ) + + print("response", response) + # Check that we got a response + assert response is not None + assert "choices" in response + assert len(response["choices"]) > 0 + assert "message" in response["choices"][0] + assert "content" in response["choices"][0]["message"] + + # The response should mention Leonardo da Vinci + content = response["choices"][0]["message"]["content"].lower() + assert "leonardo" in content and "vinci" in content # Artist name should be in response diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8846aace..f13fa9b2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8846aace4934ad29651ea61b8c7e3f6b0556e3d2 +Subproject commit f13fa9b2b523e22ba58fcf4c468f670d8c98d912
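As a usage note for the reworked chatml-function-calling handler exercised by the new test above: a consumer can re-assemble the streamed tool-call deltas roughly as follows. This is a hedged sketch, not part of the patch; the repo id, filename, and tool definition mirror the placeholders used in the test, and the delta layout (an index plus function name/argument fragments) is assumed to follow the OpenAI-style chunk format declared in llama_types.

from typing import Dict

from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",  # placeholder: same model the test downloads
    filename="*Q4_K_M.gguf",
    n_ctx=4096,
    verbose=False,
    chat_format="chatml-function-calling",
)
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather for a location.",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string", "description": "A city name."}},
            },
        },
    }
]
chunks = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What's the weather like in Paris today?"}],
    tools=tools,
    tool_choice="auto",
    stream=True,
)

# Re-assemble the streamed argument fragments for each tool-call index.
arguments_by_index: Dict[int, str] = {}
for chunk in chunks:
    delta = chunk["choices"][0]["delta"]
    for tool_call in delta.get("tool_calls") or []:
        i = tool_call["index"]
        arguments_by_index[i] = arguments_by_index.get(i, "") + tool_call["function"]["arguments"]

print(arguments_by_index)  # e.g. {0: '{"location": "Paris"}'} if a single call was produced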