32 commits
d30de24
feat: add streaming tool use
lsorber Dec 25, 2024
8ce5e4f
fix: remove strict=True to support Python 3.9
lsorber Dec 25, 2024
d7215f3
feat: improve tool use robustness
lsorber Jan 5, 2025
50accd4
test: skip if insufficient resources on macOS
lsorber Jan 12, 2025
9f8bd21
fix: apply missing _convert_text_completion_logprobs_to_chat
lsorber Mar 14, 2025
e41ae12
feat: Add Gemma3 chat handler (#1976)
kossum Mar 30, 2025
150a4a5
resolve the image embedding issue in gemma3
kossum Apr 2, 2025
60443dd
fix: added n_ctx check for prompt requirements when embedding images …
kossum Apr 3, 2025
5d52b03
fix: modify the gemma3 chat template to be compatible with openai api
kossum Apr 4, 2025
126a13d
fix: add compatibility with v0.3.9 for Gemma3ChatHandler
kossum Jun 4, 2025
f31ac2e
feat: abstract context creation and expose for recreation
okaris Jun 20, 2025
c3debdf
feat: add usage to streaming response
okaris Jun 23, 2025
3d776cd
switch to llama.cpp fork and llama : expose C API to get layer device…
okaris Jun 24, 2025
ceb2a7e
chore: empty commit to trigger rebuild downstream
okaris Jun 24, 2025
6d80d61
c definitions
okaris Jun 24, 2025
8d7001e
chore: bump empty commit
okaris Jun 24, 2025
3fc6b15
migrate llava to mtmd
okaris Jun 24, 2025
5d8583b
port kv_cache to new memory
okaris Jun 24, 2025
5dfb439
cleanup
okaris Jun 24, 2025
03ce53b
fixes
okaris Jun 24, 2025
ffff841
migrate clip to mtmd
okaris Jun 25, 2025
4cf4b15
migrate clip to mtmd
okaris Jun 25, 2025
22a16bd
add general purpose function calling handler
okaris Jun 26, 2025
b2ca084
add general purpose function calling handler
okaris Jun 26, 2025
3d7bc26
add general purpose function calling handler
okaris Jun 26, 2025
4f27cc3
fix recreate context
okaris Jul 4, 2025
8bbdc8b
bump llama.cpp
okaris Jul 4, 2025
36cb6a1
fix deprecated
okaris Jul 4, 2025
d930cfe
fixes
okaris Jul 4, 2025
f006860
fixes
okaris Jul 4, 2025
605998e
fixes
okaris Jul 4, 2025
3d30f0c
Update llama.py - Fix embedding generation error
MikeLP Jul 12, 2025
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = https://github.com/ggerganov/llama.cpp.git
+	url = http://github.com/inference-sh/llama.cpp
6 changes: 3 additions & 3 deletions CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21)
 project(llama_cpp)

 option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
-option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
+option(MTMD_BUILD "Build multimodal (mtmd) shared library and install alongside python package" ON)

 function(llama_cpp_python_install_target target)
     if(NOT TARGET ${target})
@@ -143,7 +143,7 @@ if (LLAMA_BUILD)
         )
     endif()

-    if (LLAVA_BUILD)
+    if (MTMD_BUILD)
        if (LLAMA_CUBLAS OR LLAMA_CUDA)
            add_compile_definitions(GGML_USE_CUBLAS)
            add_compile_definitions(GGML_USE_CUDA)
@@ -153,7 +153,7 @@ if (LLAMA_BUILD)
            add_compile_definitions(GGML_USE_METAL)
        endif()

-        # Building llava
+        # Building multimodal support using mtmd
        add_subdirectory(vendor/llama.cpp/tools/mtmd)

        if (WIN32)
2 changes: 1 addition & 1 deletion examples/notebooks/Batching.ipynb
@@ -230,7 +230,7 @@
 "outputs": [],
 "source": [
 "for i in range(n_parallel):\n",
-"    llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
+"    llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
 ]
 },
 {
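This one-line change tracks llama.cpp's rename of the KV-cache sequence-copy API from llama_kv_cache_seq_cp to llama_kv_self_seq_cp. A minimal sketch of the pattern the notebook relies on, with broadcast_prompt_cache as a hypothetical helper name and ctx/batch assumed to come from the usual low-level context and batch setup:

    import llama_cpp

    def broadcast_prompt_cache(ctx, batch, n_parallel: int) -> None:
        """Copy sequence 0's cached prompt to sequences 1..n_parallel-1.

        After the shared prompt has been decoded once into sequence 0,
        every parallel sequence can continue from the same prefix
        without re-evaluating the prompt.
        """
        for i in range(1, n_parallel):
            # Arguments: (ctx, src_seq, dst_seq, p0, p1) -- copies the
            # cached positions [p0, p1) from src_seq to dst_seq.
            llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)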
2 changes: 1 addition & 1 deletion llama_cpp/_ctypes_extensions.py
@@ -128,4 +128,4 @@ def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCD
     ...


-byref = _byref if TYPE_CHECKING else ctypes.byref
+byref = _byref if TYPE_CHECKING else ctypes.byref
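The old and new lines here are identical, so this hunk likely just normalizes the end-of-file newline. The assignment itself is the standard typing.TYPE_CHECKING shim: type checkers see a precisely typed stub, while the runtime binds the real ctypes.byref. A generic sketch of the same pattern, with illustrative names rather than the repo's actual CtypesCData/CtypesRef aliases:

    import ctypes
    from typing import TYPE_CHECKING, Any, Optional

    if TYPE_CHECKING:
        # Seen only by static type checkers, never executed at runtime:
        # a stub whose signature carries the precise types.
        def _byref(obj: Any, offset: Optional[int] = None) -> Any: ...

    # TYPE_CHECKING is False at runtime, so the real ctypes.byref is bound;
    # type checkers instead resolve byref to the typed _byref stub above.
    byref = _byref if TYPE_CHECKING else ctypes.byref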
17 changes: 15 additions & 2 deletions llama_cpp/_internals.py
@@ -2,6 +2,7 @@

 import os
 import ctypes
+from enum import Enum

 from typing import (
     Dict,
@@ -26,7 +27,13 @@


 # Python wrappers over llama.h structs
-
+class LlamaBackendDev(Enum):
+    # CPU device using system memory
+    CPU = 0
+    # GPU device using dedicated memory
+    GPU = 1
+    # accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+    ACCEL = 2

 class LlamaModel:
     """Intermediate Python wrapper for a llama.cpp llama_model.
@@ -95,7 +102,13 @@ def n_ctx_train(self) -> int:
         return llama_cpp.llama_model_n_ctx_train(self.model)

     def n_embd(self) -> int:
-        return llama_cpp.llama_model_n_embd(self.model)
+        return llama_cpp.llama_n_embd(self.model)
+
+    def n_layer(self) -> int:
+        return llama_cpp.llama_n_layer(self.model)
+
+    def dev_layer(self, il: int) -> LlamaBackendDev:
+        return LlamaBackendDev(llama_cpp.llama_model_dev_layer(self.model, il))

     def rope_freq_scale_train(self) -> float:
         return llama_cpp.llama_model_rope_freq_scale_train(self.model)
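Together with the LlamaBackendDev enum above, the new n_layer and dev_layer accessors surface the per-layer device placement that the forked llama.cpp exposes through llama_model_dev_layer (see the "expose C API to get layer device" commit). A hedged sketch of how a caller might use them to check how many layers were actually offloaded; summarize_layer_devices is a hypothetical helper and model is assumed to be an already-constructed LlamaModel:

    from collections import Counter

    def summarize_layer_devices(model) -> Counter:
        """Count how many of the model's layers sit on each device type."""
        counts: Counter = Counter()
        for il in range(model.n_layer()):
            # dev_layer returns a LlamaBackendDev member (CPU, GPU, or ACCEL)
            counts[model.dev_layer(il).name] += 1
        return counts

    # e.g. Counter({'GPU': 30, 'CPU': 2}) when most layers are offloaded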