From 91b5d1496b17045dd311a92ea9c6cc1a2d23ff47 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Thu, 15 May 2025 12:29:14 -0400
Subject: [PATCH 01/20] Updates Python and torch versions

Updates the allowed Python version to include 3.10 up to, but not including, 3.14.

Sets the minimum torch version to 2.5 in multiple dependency files.
---
 pyproject.toml                                          | 2 +-
 requirements/requirements-convert_hf_to_gguf.txt        | 2 +-
 requirements/requirements-convert_hf_to_gguf_update.txt | 2 +-
 tools/mtmd/requirements.txt                             | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3d71b055a8d..b751cbf285c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,7 @@ classifiers = [
 ]
 
 [tool.poetry.dependencies]
-python = ">=3.9"
+python = ">=3.10,<3.14"
 numpy = "^1.25.0"
 sentencepiece = ">=0.1.98,<=0.2.0"
 transformers = ">=4.35.2,<5.0.0"
diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt
index 8cb9c354f01..8dbf5f525bc 100644
--- a/requirements/requirements-convert_hf_to_gguf.txt
+++ b/requirements/requirements-convert_hf_to_gguf.txt
@@ -1,3 +1,3 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1
+torch>=2.5
diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt
index 8cb9c354f01..8dbf5f525bc 100644
--- a/requirements/requirements-convert_hf_to_gguf_update.txt
+++ b/requirements/requirements-convert_hf_to_gguf_update.txt
@@ -1,3 +1,3 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1
+torch>=2.5
diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt
index cbcbf26c9b4..34e6cbf1ca3 100644
--- a/tools/mtmd/requirements.txt
+++ b/tools/mtmd/requirements.txt
@@ -1,5 +1,5 @@
 -r ../../requirements/requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
-torch~=2.2.1
+torch>=2.5
 torchvision~=0.17.1

From c99863b427ee5530ee790b4cb06ec29d78afa636 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Thu, 15 May 2025 14:03:55 -0400
Subject: [PATCH 02/20] Updates torchvision dependency

Updates torchvision dependency to a more recent version.

This ensures compatibility with the current torch version
and leverages the latest features and bug fixes.
---
 tools/mtmd/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt
index 34e6cbf1ca3..513f9dda513 100644
--- a/tools/mtmd/requirements.txt
+++ b/tools/mtmd/requirements.txt
@@ -2,4 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch>=2.5
-torchvision~=0.17.1
+torchvision>=0.20.1

From ef77f28b6db1573c01958bbd99584b51492af5df Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Thu, 15 May 2025 15:49:51 -0400
Subject: [PATCH 03/20] Updates NumPy dependency to version 2.1 or higher

Increases the minimum required NumPy version to 2.1 across multiple project dependencies. This ensures compatibility with newer features and improvements in the NumPy library.

This update also includes a minor patch-level bump of the torch version.
---
 gguf-py/pyproject.toml                                  | 2 +-
 pyproject.toml                                          | 2 +-
 requirements/requirements-convert_hf_to_gguf.txt        | 2 +-
 requirements/requirements-convert_hf_to_gguf_update.txt | 2 +-
 requirements/requirements-convert_legacy_llama.txt      | 2 +-
 requirements/requirements-gguf_editor_gui.txt           | 2 +-
 requirements/requirements-tool_bench.txt                | 4 ++--
 tools/server/tests/requirements.txt                     | 4 ++--
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index bb9b86ace75..dcd6769ebae 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 
 [tool.poetry.dependencies]
 python = ">=3.8"
-numpy = ">=1.17"
+numpy = ">=2.1"
 tqdm = ">=4.27"
 pyyaml = ">=5.1"
 sentencepiece = ">=0.1.98,<=0.2.0"
diff --git a/pyproject.toml b/pyproject.toml
index b751cbf285c..9e37dcc7a9a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
 
 [tool.poetry.dependencies]
 python = ">=3.10,<3.14"
-numpy = "^1.25.0"
+numpy = "^2.1"
 sentencepiece = ">=0.1.98,<=0.2.0"
 transformers = ">=4.35.2,<5.0.0"
 protobuf = ">=4.21.0,<5.0.0"
diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt
index 8dbf5f525bc..d327a01fb15 100644
--- a/requirements/requirements-convert_hf_to_gguf.txt
+++ b/requirements/requirements-convert_hf_to_gguf.txt
@@ -1,3 +1,3 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch>=2.5
+torch>=2.5.1
diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt
index 8dbf5f525bc..d327a01fb15 100644
--- a/requirements/requirements-convert_hf_to_gguf_update.txt
+++ b/requirements/requirements-convert_hf_to_gguf_update.txt
@@ -1,3 +1,3 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch>=2.5
+torch>=2.5.1
diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt
index 859204b27eb..c025e101226 100644
--- a/requirements/requirements-convert_legacy_llama.txt
+++ b/requirements/requirements-convert_legacy_llama.txt
@@ -1,4 +1,4 @@
-numpy~=1.26.4
+numpy>=2.1
 sentencepiece~=0.2.0
 transformers>=4.45.1,<5.0.0
 gguf>=0.1.0
diff --git a/requirements/requirements-gguf_editor_gui.txt b/requirements/requirements-gguf_editor_gui.txt
index 920dc7cf90b..7e4724253f7 100644
--- a/requirements/requirements-gguf_editor_gui.txt
+++ b/requirements/requirements-gguf_editor_gui.txt
@@ -1,3 +1,3 @@
-numpy~=1.26.4
+numpy>=2.1
 PySide6~=6.9.0
 gguf>=0.16.0
diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt
index b94521fc7fa..2959802ed27 100644
--- a/requirements/requirements-tool_bench.txt
+++ b/requirements/requirements-tool_bench.txt
@@ -2,11 +2,11 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
 matplotlib~=3.10.0
-numpy~=1.26.4
+numpy>=2.1
 openai~=1.55.3
 pandas~=2.2.3
 prometheus-client~=0.20.0
-requests~=2.32.3
+requests>=2.28.1
 wget~=3.2
 typer~=0.15.1
 seaborn~=0.13.2
diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt
index 15d024914e8..4fc3c75d36a 100644
--- a/tools/server/tests/requirements.txt
+++ b/tools/server/tests/requirements.txt
@@ -1,8 +1,8 @@
 aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
-numpy~=1.26.4
+numpy>=2.1
 openai~=1.55.3
 prometheus-client~=0.20.0
-requests~=2.32.3
+requests>=2.28.1
 wget~=3.2

From c278affb18879adfb3c85af2e8c41cedaa48a710 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Fri, 16 May 2025 09:43:59 -0400
Subject: [PATCH 04/20] pyright-based changes for 
 tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py

Fix torch 2.5.1 / numpy 2.x compatibility in convert_image_encoder_to_gguf.py

- Updated Tensor-to-array conversions to use `np.asarray(..., dtype=...)` per NumPy 2.x migration rules (avoids copy error on float16).
- Used explicit typing and `cast(...)` to guide Pyright/Pylance under torch 2.5.1:
  - Annotated `model` as PreTrainedModel.
  - Re-cast `model.vision_model` to `CLIPVisionTransformer` to safely access `.encoder.layers`.
  - Replaced slice assignment with `__init__` to reset ModuleList contents.
- Verified compatibility by converting `openai/clip-vit-base-patch32` using `--clip-model-is-openclip`.
---
 .../convert_image_encoder_to_gguf.py          | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
index 2949faec421..c664c4e15a2 100644
--- a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
+++ b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -6,6 +6,10 @@
 import torch
 import numpy as np
 from gguf import *
+from typing import cast
+from torch.nn import ModuleList
+from transformers.models.clip.modeling_clip import CLIPVisionTransformer
+from transformers import PreTrainedModel
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
 
 TEXT = "clip.text"
@@ -162,13 +166,13 @@ def bytes_to_unicode():
     ftype = 0
 
 if args.clip_model_is_siglip:
-    model = SiglipVisionModel.from_pretrained(dir_model)
+    model: PreTrainedModel = SiglipVisionModel.from_pretrained(dir_model)
     processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
-    model = CLIPVisionModel.from_pretrained(dir_model)
+    model: PreTrainedModel = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
 else:
-    model = CLIPModel.from_pretrained(dir_model)
+    model: PreTrainedModel = CLIPModel.from_pretrained(dir_model)
     processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
@@ -350,9 +354,14 @@ def get_non_negative_vision_feature_layers(v_hparams):
     # By default, we drop the last layer for llava projector
     # models unless we have explicitly set vision feature layers
     if feature_layers is None:
-        model.vision_model.encoder.layers.pop(-1)
+        vision_model = cast(CLIPVisionTransformer, model.vision_model)
+        encoder_layers = vision_model.encoder.layers
+        encoder_layers.pop(-1)
     else:
-        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
+        vision_model = cast(CLIPVisionTransformer, model.vision_model)
+        encoder_layers = vision_model.encoder.layers
+        encoder_layers = cast(ModuleList, encoder_layers)
+        encoder_layers.__init__(encoder_layers[:max(feature_layers)])
 
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
@@ -375,7 +384,7 @@ def get_non_negative_vision_feature_layers(v_hparams):
         continue
 
     name = get_tensor_name(name)
-    data = data.squeeze().numpy()
+    data = np.ascontiguousarray(data.detach().cpu().squeeze().numpy())
 
     n_dims = len(data.shape)
 
@@ -383,16 +392,16 @@ def get_non_negative_vision_feature_layers(v_hparams):
     ftype_cur = 0
     if n_dims == 4:
         print(f"tensor {name} is always saved in f16")
-        data = data.astype(np.float16)
+        data = np.asarray(data, dtype=np.float16)
         ftype_cur = 1
     elif ftype == 1:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
-            data = data.astype(np.float16)
+            data = np.asarray(data, dtype=np.float16)
             ftype_cur = 1
         else:
             print("  Converting to float32")
-            data = data.astype(np.float32)
+            data = np.asarray(data, dtype=np.float32)
             ftype_cur = 0
     else:
         if data.dtype != np.float32:

From a6ab0977456eb034aee7f4d48130d2ce24de7784 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Fri, 16 May 2025 10:08:38 -0400
Subject: [PATCH 05/20] Fixes type annotation for numpy arrays

Corrects type annotations for numpy arrays to allow for a broader range of numpy dtypes and resolves type checking errors.

Removes NumPy DTypeLike type hint

Updates type hints for NumPy compatibility by removing DTypeLike.

Ensures alignment with NumPy's typing system, preventing potential
type-related issues.
---
 gguf-py/gguf/quants.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 3c8ba82e19d..a8ab51ac228 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -3,8 +3,6 @@
 from typing import Any, Callable, Sequence
 from math import log2, ceil
 
-from numpy.typing import DTypeLike
-
 from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
 from .lazy import LazyNumpyTensor
 
@@ -26,7 +24,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati
 
 
 # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
-def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
+def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: np.dtype[Any], oshape: tuple[int, ...]) -> np.ndarray:
     rows = arr.reshape((-1, arr.shape[-1]))
     osize = 1
     for dim in oshape:
@@ -80,7 +78,7 @@ class __Quant(ABC):
     block_size: int
     type_size: int
 
-    grid: np.ndarray[Any, np.dtype[np.float32]] | None = None
+    grid: np.ndarray | None = None # np.float32
     grid_shape: tuple[int, int] = (0, 0)
     grid_map: tuple[int | float, ...] = ()
     grid_hex: bytes | None = None

From eb7cef1f7e7d860f9fa5f28e40dc0ac065df946b Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Fri, 16 May 2025 10:13:01 -0400
Subject: [PATCH 06/20] Updates type hints for numpy compatibility

Updates type hints related to numpy to use `np.dtype[Any]` instead of `DTypeLike` for better compatibility and clarity. This resolves potential issues with type checking and ensures accurate type annotations for numpy-related operations within the `gguf` library.
---
 gguf-py/gguf/lazy.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py
index f9bcadae022..73f580bd479 100644
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -5,7 +5,6 @@
 from typing import Any, Callable
 
 import numpy as np
-from numpy.typing import DTypeLike
 
 
 logger = logging.getLogger(__name__)
@@ -107,7 +106,7 @@ def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
             return o
 
     @classmethod
-    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | np.dtype[Any] | tuple[np.dtype[Any], Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
         def wrapped_fn(*args, **kwargs):
             if kwargs is None:
                 kwargs = {}
@@ -204,7 +203,7 @@ class LazyNumpyTensor(LazyBase):
     shape: tuple[int, ...]  # Makes the type checker happy in quants.py
 
     @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
+    def meta_with_dtype_and_shape(cls, dtype: np.dtype[Any], shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,
         # but non-float types like np.int16 can't use that.
         # So zero it is.

From 150f157acf0fd4bd5379bdb3aae3dde1aa1397a5 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Fri, 16 May 2025 12:40:28 -0400
Subject: [PATCH 07/20] Better fix than earlier for quants.py

after researching the typing issues introduced in numpy 2.2
---
 gguf-py/gguf/quants.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index a8ab51ac228..28f6ce797a7 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -1,3 +1,5 @@
+# pyright: reportInvalidTypeForm=false
+
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Sequence
@@ -78,7 +80,7 @@ class __Quant(ABC):
     block_size: int
     type_size: int
 
-    grid: np.ndarray | None = None # np.float32
+    grid: np.ndarray[Any, np.dtype[np.float32]] | None = None
     grid_shape: tuple[int, int] = (0, 0)
     grid_map: tuple[int | float, ...] = ()
     grid_hex: bytes | None = None

From 822c83cc780821f9eea62aef2bf465ed88a3d121 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Fri, 16 May 2025 12:41:31 -0400
Subject: [PATCH 08/20] Several types in numpy 2.2+ are defined as typeAliases
 with no underlying class (such as np.float32)

Looking at https://github.com/microsoft/pyright/issues/9051, the Pyright maintainers declined to fix it themselves and suggested instead that the user must add a `# pyright: ignore` or `# type: ignore` directive to suppress this error.

NumPy is working to resolve them (https://github.com/numpy/numpy/issues/28076) and has already done so for np.float64 (which I can verify in our errors) -- see https://github.com/numpy/numpy/issues/27957 .
---
 gguf-py/gguf/gguf_reader.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index 5991cdb76be..c38b47e6308 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -1,3 +1,4 @@
+# pyright: reportInvalidTypeForm=false
 #
 # GGUF file reading/modification support. For API usage information,
 # please see the files scripts/ for some fairly simple examples.
@@ -15,6 +16,7 @@
 
 from .quants import quant_shape_to_byte_shape
 
+
 if __name__ == "__main__":
     from pathlib import Path
 
@@ -104,7 +106,7 @@ class ReaderTensor(NamedTuple):
     n_elements: int
     n_bytes: int
     data_offset: int
-    data: npt.NDArray[Any]
+    data: np.ndarray
     field: ReaderField
 
 
@@ -181,7 +183,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
         self.data_offset = offs
         self._build_tensors(offs, tensors_fields)
 
-    _DT = TypeVar('_DT', bound = npt.DTypeLike)
+    _DT = TypeVar('_DT', bound = np.dtype[Any])
 
     # Fetch a key/value metadata field by key.
     def get_field(self, key: str) -> Union[ReaderField, None]:
@@ -192,8 +194,8 @@ def get_tensor(self, idx: int) -> ReaderTensor:
         return self.tensors[idx]
 
     def _get(
-        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
-    ) -> npt.NDArray[Any]:
+        self, offset: int, dtype: np.dtype[Any], count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
+    ) -> np.ndarray:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
         end_offs = offset + itemsize * count
@@ -213,7 +215,7 @@ def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
 
     def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
         slen = self._get(offset, np.uint64)
-        return slen, self._get(offset + 8, np.uint8, slen[0])
+        return slen, self._get(offset + 8, np.uint8, slen[0].item())
 
     def _get_field_parts(
         self, orig_offs: int, raw_type: int,
@@ -230,7 +232,7 @@ def _get_field_parts(
         # Check if it's a simple scalar type.
         nptype = self.gguf_scalar_to_np.get(gtype)
         if nptype is not None:
-            val = self._get(offs, nptype)
+            val = self._get(offs, np.dtype(nptype))
             return int(val.nbytes), [val], [0], types
         # Handle arrays.
         if gtype == GGUFValueType.ARRAY:
@@ -242,7 +244,7 @@ def _get_field_parts(
             data_idxs: list[int] = []
             # FIXME: Handle multi-dimensional arrays properly instead of flattening
             for idx in range(alen[0]):
-                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0])
+                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0].item())
                 if idx == 0:
                     types += curr_types
                 idxs_offs = len(aparts)
@@ -265,7 +267,7 @@ def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs += int(n_dims.nbytes)
 
         # Get Tensor Dimension Array
-        dims = self._get(offs, np.uint64, n_dims[0])
+        dims = self._get(offs, np.uint64, n_dims[0].item())
         offs += int(dims.nbytes)
 
         # Get Tensor Encoding Scheme Type
@@ -292,7 +294,7 @@ def _build_fields(self, offs: int, count: int) -> int:
             offs += int(raw_kv_type.nbytes)
             parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type]
             idxs_offs = len(parts)
-            field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0])
+            field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0].item())
             parts += field_parts
             self._push_field(ReaderField(
                 orig_offs,
@@ -328,28 +330,28 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
-            item_type: npt.DTypeLike
+            item_type: np.dtype[Any]
             if ggml_type == GGMLQuantizationType.F16:
                 item_count = n_elems
-                item_type = np.float16
+                item_type = np.dtype(np.float16)
             elif ggml_type == GGMLQuantizationType.F32:
                 item_count = n_elems
-                item_type = np.float32
+                item_type = np.dtype(np.float32)
             elif ggml_type == GGMLQuantizationType.F64:
                 item_count = n_elems
-                item_type = np.float64
+                item_type = np.dtype(np.float64)
             elif ggml_type == GGMLQuantizationType.I8:
                 item_count = n_elems
-                item_type = np.int8
+                item_type = np.dtype(np.int8)
             elif ggml_type == GGMLQuantizationType.I16:
                 item_count = n_elems
-                item_type = np.int16
+                item_type = np.dtype(np.int16)
             elif ggml_type == GGMLQuantizationType.I32:
                 item_count = n_elems
-                item_type = np.int32
+                item_type = np.dtype(np.int32)
             elif ggml_type == GGMLQuantizationType.I64:
                 item_count = n_elems
-                item_type = np.int64
+                item_type = np.dtype(np.int64)
             else:
                 item_count = n_bytes
                 item_type = np.uint8

From 4cfda13d7d1cdd803edbd1815d3ee5f587818dfd Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 16:31:38 -0400
Subject: [PATCH 09/20] revert change from NDArray in this file

---
 gguf-py/gguf/gguf_reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index c38b47e6308..e2bccef7eee 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -106,7 +106,7 @@ class ReaderTensor(NamedTuple):
     n_elements: int
     n_bytes: int
     data_offset: int
-    data: np.ndarray
+    data: npt.NDArray[Any]
     field: ReaderField
 
 

From 2f3854cfc69a1b9af2109339c6838d6b2e5ee5dd Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 17:00:17 -0400
Subject: [PATCH 10/20] Updates pyright configuration

Updates the pyright configuration to use Python 3.10.

This ensures compatibility with newer language features
and libraries. It also excludes the 'tools/legacy' and 'tests'
directories from analysis.
---
 gguf-py/gguf/quants.py | 2 --
 pyrightconfig.json     | 8 ++++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 28f6ce797a7..26672909f97 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -1,5 +1,3 @@
-# pyright: reportInvalidTypeForm=false
-
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Sequence
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 5320fe5864a..07f267e6f1c 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -1,6 +1,6 @@
 {
   "extraPaths": ["gguf-py"],
-  "pythonVersion": "3.9",
+  "pythonVersion": "3.10",
   "pythonPlatform": "All",
   "reportUnusedImport": "warning",
   "reportDuplicateImport": "error",
@@ -11,7 +11,7 @@
     {
       // TODO: make this version override work correctly
       "root": "gguf-py",
-      "pythonVersion": "3.8",
+      "pythonVersion": "3.10",
     },
     {
       // uses match expressions in steps.py
@@ -19,4 +19,8 @@
       "pythonVersion": "3.10",
     },
   ],
+  "exclude": [
+    "tools/legacy",
+    "tests"
+  ]
  }

From eb6499cc7852c4c69e8628863d2b548d837a235f Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 17:05:26 -0400
Subject: [PATCH 11/20] Revert tools/mtmd/ to match master

---
 tools/mtmd/clip-impl.h                        |  74 +++++-
 tools/mtmd/clip.cpp                           | 246 +++++++++++++++---
 tools/mtmd/clip.h                             |   4 -
 .../convert_image_encoder_to_gguf.py          |  27 +-
 tools/mtmd/mtmd.cpp                           |  97 +++++--
 tools/mtmd/requirements.txt                   |   4 +-
 tools/mtmd/tests.sh                           |  22 +-
 7 files changed, 382 insertions(+), 92 deletions(-)

diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 23036ba72f1..7b7d2df3962 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -4,6 +4,7 @@
 
 #include <climits>
 #include <cstdarg>
+#include <cinttypes>
 #include <string>
 #include <map>
 #include <sstream>
@@ -44,7 +45,7 @@
 // tensor name constants
 //
 
-#define TN_POS_EMBD        "%s.position_embd.weight"
+#define TN_POS_EMBD        "v.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
 #define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
@@ -110,6 +111,7 @@ enum projector_type {
     PROJECTOR_TYPE_PIXTRAL,
     PROJECTOR_TYPE_QWEN25VL,
     PROJECTOR_TYPE_INTERNVL,
+    PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -125,6 +127,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
     { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
     { PROJECTOR_TYPE_INTERNVL,  "internvl"},
+    { PROJECTOR_TYPE_LLAMA4,    "llama4"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -240,6 +243,11 @@ struct clip_image_u8_batch {
 struct clip_image_f32_batch {
     std::vector<clip_image_f32_ptr> entries;
 
+    // for llava-uhd style models, we need to know the grid size
+    // note: entries.size() == grid_x * grid_y + 1 (one overview image)
+    int grid_x = 0;
+    int grid_y = 0;
+
     clip_image_f32_batch clone() const {
         clip_image_f32_batch new_batch;
         new_batch.entries.reserve(entries.size());
@@ -358,6 +366,70 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     }
 }
 
+//
+// debugging
+//
+
+static void print_tensor_shape(ggml_tensor * t) {
+    printf("%s.shape = [", t->name);
+    for (int i = 0; i < ggml_n_dims(t); ++i) {
+        printf("%" PRId64, t->ne[i]);
+        if (i < ggml_n_dims(t) - 1) {
+            printf(", ");
+        }
+    }
+    printf("]\n");
+}
+
+static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
+    ggml_type type = t->type;
+    int64_t * ne = t->ne;
+    size_t * nb = t->nb;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        printf("%s.data: [\n", t->name);
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) {
+                printf("     ..., \n");
+                i2 = ne[2] - n;
+            }
+            printf("     [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) {
+                    printf("      ..., \n");
+                    i1 = ne[1] - n;
+                }
+                printf("      [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) {
+                        printf("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+                    float v;
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(float *) &data[i];
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(int32_t *) &data[i];
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(int16_t *) &data[i];
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(int8_t *) &data[i];
+                    } else {
+                        GGML_ABORT("fatal error");
+                    }
+                    printf("%8.4f", v);
+                    if (i0 < ne[0] - 1) printf(", ");
+                }
+                printf("],\n");
+            }
+            printf("     ],\n");
+        }
+        printf("    ]\n");
+    }
+}
+
 //
 // API used internally with mtmd
 //
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 128a95cc11f..eba07f6c82e 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -359,9 +359,12 @@ struct clip_ctx {
     int max_nodes = 8192;
     ggml_backend_sched_ptr sched;
 
-    clip_image_size load_image_size;
+    // for debugging
+    bool debug_graph = false;
+    std::vector<ggml_tensor *> debug_print_tensors;
 
     clip_ctx(clip_context_params & ctx_params) {
+        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
         backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (!backend_cpu) {
             throw std::runtime_error("failed to initialize CPU backend");
@@ -440,7 +443,7 @@ struct clip_graph {
         };
         ctx0_ptr.reset(ggml_init(params));
         ctx0 = ctx0_ptr.get();
-        gf = ggml_new_graph(ctx0);
+        gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
     }
 
     ggml_cgraph * build_siglip() {
@@ -522,7 +525,7 @@ struct clip_graph {
         ggml_set_input(pos_w);
 
         auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta);
+            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
         };
 
         ggml_tensor * inp = build_inp();
@@ -936,6 +939,101 @@ struct clip_graph {
         return gf;
     }
 
+    ggml_cgraph * build_llama4() { // graph for PROJECTOR_TYPE_LLAMA4: unfold-conv patches, ViT with 2D RoPE, pixel shuffle, MLP adapter, projector
+        GGML_ASSERT(model.class_embedding != nullptr);
+        GGML_ASSERT(model.position_embeddings != nullptr);
+
+        const int n_pos = n_patches + 1; // +1 for [CLS]
+
+        // 2D input positions (I32 graph inputs with one entry per position incl. [CLS]; filled by name "pos_h"/"pos_w" at encode time)
+        ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(pos_h, "pos_h");
+        ggml_set_input(pos_h);
+
+        ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(pos_w, "pos_w");
+        ggml_set_input(pos_w);
+
+        ggml_tensor * inp = build_inp_raw();
+
+        // Llama4UnfoldConvolution
+        {
+            ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
+                                                    patch_size, patch_size, 3, n_embd);
+            inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); // NOTE(review): kernel is only passed to im2col; presumably only its shape matters - confirm
+            inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); // the projection uses the original (un-reshaped) weight
+            inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+            cb(inp, "patch_conv", -1);
+        }
+
+        // add CLS token (concatenated after the patches along dim 1, so it occupies the last position)
+        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+        // build ViT with 2D position embeddings
+        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+            // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
+            return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); // pos_w drives the first half, pos_h the second; interleave_freq is off
+        };
+        ggml_tensor * cur = build_vit(
+                                inp, n_pos,
+                                NORM_TYPE_NORMAL,
+                                hparams.ffn_op,
+                                model.position_embeddings,
+                                add_pos);
+
+        // remove CLS token: keep a view of the first n_patches rows (the CLS row was appended last)
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_patches,
+            ggml_row_size(cur->type, n_embd), 0);
+
+        // pixel shuffle
+        // based on Llama4VisionPixelShuffleMLP
+        // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
+        {
+            const int scale_factor = model.hparams.proj_scale_factor;
+            const int bsz = 1; // batch size, always 1 for now since we don't support batching
+            GGML_ASSERT(scale_factor > 0);
+            GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
+            cur = ggml_reshape_4d(ctx0, cur,
+                n_embd * scale_factor,
+                n_patches_x / scale_factor,
+                n_patches_y,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                n_patches_x / scale_factor,
+                n_patches_y / scale_factor,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            // flatten to 2D: (n_embd * scale_factor^2) values per row, n_patches / scale_factor^2 rows
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                n_patches / scale_factor / scale_factor);
+            cb(cur, "pixel_shuffle", -1);
+        }
+
+        // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
+        {
+            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+            cur = ggml_gelu(ctx0, cur);
+            cb(cur, "adapter_mlp", -1);
+        }
+
+        // Llama4MultiModalProjector
+        cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+        cb(cur, "projected", -1);
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // this graph is used by llava, granite and glm
     // due to having embedding_stack (used by granite), we cannot reuse build_vit
     ggml_cgraph * build_llava() {
@@ -1315,11 +1413,15 @@ struct clip_graph {
     // utility functions
     //
 
-    void cb(ggml_tensor * cur, const char * name, int il) const {
-        // TODO: implement this
-        GGML_UNUSED(cur);
-        GGML_UNUSED(name);
-        GGML_UNUSED(il);
+    void cb(ggml_tensor * cur0, const char * name, int il) const { // debug hook: does nothing unless ctx->debug_graph is set
+        if (ctx->debug_graph) {
+            ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); // copy cur0 into a separate tensor created by ggml_dup_tensor
+            std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; // "<name>_<il>", or just "<name>" when il < 0
+            ggml_set_name(cur, cur_name.c_str());
+            ggml_set_output(cur);
+            ggml_build_forward_expand(gf, cur); // add the copy to the graph being built
+            ctx->debug_print_tensors.push_back(cur); // recorded so clip_image_batch_encode can print it after compute
+        }
     }
 
     // build vision transformer (ViT) cgraph
@@ -1630,9 +1732,10 @@ struct clip_graph {
     static ggml_tensor * build_rope_2d(
         ggml_context * ctx0,
         ggml_tensor * cur,
-        ggml_tensor * pos_h,
-        ggml_tensor * pos_w,
-        const float freq_base
+        ggml_tensor * pos_a, // first half
+        ggml_tensor * pos_b, // second half
+        const float freq_base,
+        const bool interleave_freq
     ) {
         const int64_t n_dim  = cur->ne[0];
         const int64_t n_head = cur->ne[1];
@@ -1646,7 +1749,9 @@ struct clip_graph {
         //  ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
         // then for the second half, we use freq_scale to shift the inv_freq
         //  ^ why? replace (2i) with (2i+1) in the above equation
-        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
+        const float freq_scale_odd = interleave_freq
+                                    ? std::pow(freq_base, (float)-2/n_dim)
+                                    : 1.0;
 
         // first half
         ggml_tensor * first;
@@ -1659,7 +1764,7 @@ struct clip_graph {
             first = ggml_rope_ext(
                 ctx0,
                 first,
-                pos_h,      // positions
+                pos_a,      // positions
                 nullptr,    // freq factors
                 n_dim/2,    // n_dims
                 0, 0, freq_base,
@@ -1679,7 +1784,7 @@ struct clip_graph {
             second = ggml_rope_ext(
                 ctx0,
                 second,
-                pos_w,      // positions
+                pos_b,      // positions
                 nullptr,    // freq factors
                 n_dim/2,    // n_dims
                 0, 0, freq_base,
@@ -1723,6 +1828,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 res = graph.build_internvl();
             } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                res = graph.build_llama4();
+            } break;
         default:
             {
                 res = graph.build_llava();
@@ -1926,6 +2035,21 @@ struct clip_model_loader {
                         hparams.warmup_image_size = hparams.patch_size * 8;
                         get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
                     } break;
+                case PROJECTOR_TYPE_LLAMA4:
+                    {
+                        hparams.rope_theta = 10000.0f;
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
+
+                        // borrowed from llava-1.6
+                        const int isize = hparams.image_size;
+                        hparams.image_grid_pinpoints = {
+                            isize,   isize*2, // 336, 672
+                            isize*2, isize,   // 672, 336
+                            isize*2, isize*2, // 672, 672
+                            isize*3, isize,   // 1008, 336
+                            isize,   isize*3, // 336, 1008
+                        };
+                    } break;
                 default:
                     break;
             }
@@ -1946,6 +2070,10 @@ struct clip_model_loader {
             LOG_INF("%s: ffn_op:             %s\n", __func__, log_ffn_op.c_str());
             LOG_INF("%s: model size:         %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
             LOG_INF("%s: metadata size:      %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
+
+            if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) {
+                LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+            }
         }
     }
 
@@ -2001,7 +2129,7 @@ struct clip_model_loader {
         vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD,   false);
         vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
 
-        vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
+        vision_model.position_embeddings = get_tensor(TN_POS_EMBD, false);
 
         // layers
         vision_model.layers.resize(hparams.n_layer);
@@ -2182,6 +2310,12 @@ struct clip_model_loader {
                     vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
                     vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
                 } break;
+            case PROJECTOR_TYPE_LLAMA4:
+                {
+                    vision_model.mm_model_proj    = get_tensor(TN_MM_PROJECTOR);
+                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
+                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -2328,14 +2462,6 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
     return ctx_clip;
 }
 
-void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
-    ctx_clip->load_image_size = *load_image_size; // copy
-}
-
-struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return &ctx_clip->load_image_size;
-}
-
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
@@ -2849,7 +2975,7 @@ struct llava_uhd {
 
     // used by llava 1.6 with custom list of pinpoints
     static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
-        std::vector<clip_image_size> possible_resolutions;
+        std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
         for (size_t i = 0; i < pinpoints.size(); i += 2) {
             possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
         }
@@ -2916,12 +3042,6 @@ struct llava_uhd {
     }
 };
 
-// TODO @ngxson : decprecate the load_image_size singleton pattern
-int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
-    const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
-    return inst.grid_size.width;
-}
-
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -2943,9 +3063,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
             res_imgs->entries.push_back(std::move(res));
         }
+
+        res_imgs->grid_x = inst.grid_size.width;
+        res_imgs->grid_y = inst.grid_size.height;
         return true;
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
         clip_image_u8 resized;
         auto patch_size = params.patch_size * 2;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -2971,8 +3094,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
         clip_image_u8 resized_image;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
         image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
@@ -2980,6 +3103,22 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
+        GGML_ASSERT(!params.image_grid_pinpoints.empty());
+        auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+        std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
+            res_imgs->entries.push_back(std::move(res));
+        }
+
+        res_imgs->grid_x = inst.grid_size.width;
+        res_imgs->grid_y = inst.grid_size.height;
+        return true;
+
     }
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@@ -3098,6 +3237,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    int scale_factor = ctx->vision_model.hparams.proj_scale_factor;
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP
             || ctx->proj_type == PROJECTOR_TYPE_LDPV2
@@ -3136,6 +3276,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
         int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
         n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
+        n_patches /= (scale_factor * scale_factor);
     }
 
     return n_patches;
@@ -3247,6 +3389,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
+    ctx->debug_print_tensors.clear();
     ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
     ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3261,8 +3404,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size    = hparams.patch_size;
     const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
-    const int pos_w = ctx->load_image_size.width  / patch_size;
-    const int pos_h = ctx->load_image_size.height / patch_size;
+    const int pos_w = image_size_width  / patch_size;
+    const int pos_h = image_size_height / patch_size;
 
     const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
 
@@ -3528,6 +3671,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
+                // last pos is always kept 0, it's for CLS
+                // dimension H
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i / n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i % n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -3548,6 +3708,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
@@ -3596,6 +3768,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->vision_model.projection->ne[1];
         case PROJECTOR_TYPE_INTERNVL:
             return ctx->vision_model.mm_3_w->ne[1];
+        case PROJECTOR_TYPE_LLAMA4:
+            return ctx->vision_model.mm_model_proj->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 2d70eec9473..e7a1c0782dd 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -47,10 +47,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
 // this should be equal to the embedding dimension of the text model
 int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
-int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
-void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
-struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
-
 struct clip_image_size      * clip_image_size_init(void);
 struct clip_image_u8        * clip_image_u8_init (void);
 struct clip_image_f32       * clip_image_f32_init(void);
diff --git a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
index c664c4e15a2..2949faec421 100644
--- a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
+++ b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -6,10 +6,6 @@
 import torch
 import numpy as np
 from gguf import *
-from typing import cast
-from torch.nn import ModuleList
-from transformers.models.clip.modeling_clip import CLIPVisionTransformer
-from transformers import PreTrainedModel
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
 
 TEXT = "clip.text"
@@ -166,13 +162,13 @@ def bytes_to_unicode():
     ftype = 0
 
 if args.clip_model_is_siglip:
-    model: PreTrainedModel = SiglipVisionModel.from_pretrained(dir_model)
+    model = SiglipVisionModel.from_pretrained(dir_model)
     processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
-    model: PreTrainedModel = CLIPVisionModel.from_pretrained(dir_model)
+    model = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
 else:
-    model: PreTrainedModel = CLIPModel.from_pretrained(dir_model)
+    model = CLIPModel.from_pretrained(dir_model)
     processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
@@ -354,14 +350,9 @@ def get_non_negative_vision_feature_layers(v_hparams):
     # By default, we drop the last layer for llava projector
     # models unless we have explicitly set vision feature layers
     if feature_layers is None:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers.pop(-1)
+        model.vision_model.encoder.layers.pop(-1)
     else:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers = cast(ModuleList, encoder_layers)
-        encoder_layers.__init__(encoder_layers[:max(feature_layers)])
+        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
 
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
@@ -384,7 +375,7 @@ def get_non_negative_vision_feature_layers(v_hparams):
         continue
 
     name = get_tensor_name(name)
-    data = np.ascontiguousarray(data.detach().cpu().squeeze().numpy())
+    data = data.squeeze().numpy()
 
     n_dims = len(data.shape)
 
@@ -392,16 +383,16 @@ def get_non_negative_vision_feature_layers(v_hparams):
     ftype_cur = 0
     if n_dims == 4:
         print(f"tensor {name} is always saved in f16")
-        data = np.asarray(data, dtype=np.float16)
+        data = data.astype(np.float16)
         ftype_cur = 1
     elif ftype == 1:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
-            data = np.asarray(data, dtype=np.float16)
+            data = data.astype(np.float16)
             ftype_cur = 1
         else:
             print("  Converting to float32")
-            data = np.asarray(data, dtype=np.float32)
+            data = data.astype(np.float32)
             ftype_cur = 0
     else:
         if data.dtype != np.float32:
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 2a852d9c19b..1234dbb4687 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -42,6 +42,7 @@ enum mtmd_slice_tmpl {
     MTMD_SLICE_TMPL_NONE,
     MTMD_SLICE_TMPL_MINICPMV_2_5,
     MTMD_SLICE_TMPL_MINICPMV_2_6,
+    MTMD_SLICE_TMPL_LLAMA4,
     // TODO @ngxson : add support for idefics (SmolVLM)
 };
 
@@ -64,15 +65,19 @@ struct mtmd_context {
     int n_threads;
     std::string image_marker;
 
-    // for minicpmv, we need special tokens in-between slices
+    // for llava-uhd style models, we need special tokens in-between slices
+    // minicpmv calls them "slices", llama 4 calls them "tiles"
     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
     llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
     llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
-    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
+    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
     llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+    bool        tok_row_end_trail = false;
+    bool        ov_img_first      = false;
 
     bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
 
@@ -96,6 +101,7 @@ struct mtmd_context {
 
         use_mrope = clip_is_qwen2vl(ctx_clip);
 
+        projector_type proj = clip_get_projector_type(ctx_clip);
         int minicpmv_version = clip_is_minicpmv(ctx_clip);
         if (minicpmv_version == 2) {
             // minicpmv 2.5 format:
@@ -108,6 +114,8 @@ struct mtmd_context {
             tok_sli_img_start = tok_ov_img_start;
             tok_sli_img_end   = tok_ov_img_end;
             tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
 
         } else if (minicpmv_version == 3 || minicpmv_version == 4) {
             // minicpmv 2.6 format:
@@ -118,9 +126,25 @@ struct mtmd_context {
             tok_sli_img_start = lookup_token("<slice>");
             tok_sli_img_end   = lookup_token("</slice>");
             tok_row_end       = lookup_token("\n");
+            tok_row_end_trail = false; // no trailing end-of-row token
+            ov_img_first      = true;
 
         } else if (minicpmv_version != 0) {
             GGML_ASSERT(false && "unsupported minicpmv version");
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // llama 4 format:
+            // <|image_start|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
+            // <|image|> (overview)           <-- overview image is last
+            // <|image_end|>
+            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
+            tok_ov_img_start  = lookup_token("<|image|>");
+            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
+            tok_row_end       = lookup_token("<|tile_y_separator|>");
+            tok_row_end_trail = true; // add trailing end-of-row token
+            ov_img_first      = false; // overview image is last
         }
     }
 
@@ -243,16 +267,18 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
         marker_modified = ctx->image_marker + "[IMG_END]";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
-    }
 
-    else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
         marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
 
-    }
+    } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
+        // (more details in mtmd_context constructor)
+        marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
 
-    else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
+    } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
         // <img> ... (image embeddings) ... </img>
         marker_modified = "<img>" + ctx->image_marker + "</img>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
@@ -328,7 +354,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             img_u8->ny = bitmaps[i_img]->ny;
             img_u8->buf.resize(bitmaps[i_img]->data.size());
             std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
-            clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
 
             // preprocess image
             clip_image_f32_batch batch_f32;
@@ -338,28 +363,40 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 2;
             }
 
-            if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
+            // handle llava-uhd style preprocessing
+            if (
+                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+            ) {
                 // split batch into chunks of single images
                 auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
                 GGML_ASSERT(chunks.size() > 0);
 
-                // add overview image
-                add_text_chunk({ctx->tok_ov_img_start});
-                output->entries.emplace_back(std::move(chunks.front()));
+                auto ov_chunk = std::move(chunks.front());
                 chunks.erase(chunks.begin());
-                add_text_chunk({ctx->tok_ov_img_end});
 
-                // add slices
+                // add overview image (first)
+                if (ctx->ov_img_first) {
+                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_start});
+                    }
+                    output->entries.emplace_back(std::move(ov_chunk));
+                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_end});
+                    }
+                }
+
+                // add slices (or tiles)
                 if (!chunks.empty()) {
-                    clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
-                    int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
-                    int n_row = (int)chunks.size() / n_col;
-                    GGML_ASSERT(n_row * n_col == (int)chunks.size());
+                    const int n_col = batch_f32.grid_x; // NOTE(review): batch_f32 was passed through std::move() to split_batch_to_chunk() above; this assumes grid_x/grid_y are left intact - confirm
+                    const int n_row = batch_f32.grid_y;
                     if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
                         add_text_chunk({ctx->tok_slices_start});
                     }
                     for (int y = 0; y < n_row; y++) {
                         for (int x = 0; x < n_col; x++) {
+                            const bool is_last_in_row = (x == n_col - 1);
                             if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
                                 add_text_chunk({ctx->tok_sli_img_start});
                             }
@@ -367,8 +404,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                             if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
                                 add_text_chunk({ctx->tok_sli_img_end});
                             }
+                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_mid});
+                            }
                         }
-                        if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
+                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
                             add_text_chunk({ctx->tok_row_end});
                         }
                     }
@@ -377,6 +417,17 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                     }
                 }
 
+                // add overview image (last)
+                if (!ctx->ov_img_first) {
+                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_start});
+                    }
+                    output->entries.emplace_back(std::move(ov_chunk));
+                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_ov_img_end});
+                    }
+                }
+
             } else {
                 size_t n_tokens = 0;
                 for (const auto & entry : batch_f32.entries) {
@@ -427,14 +478,6 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;
 
-    // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
-    {
-        clip_image_size slice_size{
-            image_tokens->batch_f32.entries[0]->nx,
-            image_tokens->batch_f32.entries[0]->ny};
-        clip_add_load_image_size(ctx->ctx_clip, &slice_size);
-    }
-
     if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt
index 513f9dda513..cbcbf26c9b4 100644
--- a/tools/mtmd/requirements.txt
+++ b/tools/mtmd/requirements.txt
@@ -1,5 +1,5 @@
 -r ../../requirements/requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
-torch>=2.5
-torchvision>=0.20.1
+torch~=2.2.1
+torchvision~=0.17.1
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index 05ac7a04d8f..15a37b0d22b 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -21,6 +21,13 @@ if [ "${1:-}" = "big" ]; then
     echo "Include BIG models..."
 fi
 
+RUN_HUGE_TESTS=false
+if [ "${1:-}" = "huge" ]; then
+    RUN_HUGE_TESTS=true
+    RUN_BIG_TESTS=true
+    echo "Include BIG models..."
+fi
+
 ###############
 
 arr_bin=()
@@ -42,7 +49,7 @@ add_test "llama-mtmd-cli"  "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test "llama-mtmd-cli"  "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/Llava-v1.5-7B-GGUF:Q2_K"            "vicuna"
-add_test "llama-mtmd-cli"  "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"           "vicuna"
+add_test "llama-mtmd-cli"  "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M"         "vicuna"
 add_test "llama-mtmd-cli"  "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test "llama-mtmd-cli"  "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -60,10 +67,17 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
     add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli"  "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli"  "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
     # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M" # too big
+fi
+
+# to test the huge models, run: ./tests.sh huge
+# this will run both the big and huge models
+# huge models are > 32B parameters
+if [ "$RUN_HUGE_TESTS" = true ]; then
+    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli" "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
 fi
 
 # these models always give the wrong answer, not sure why

From ba079d3dbaa8a3b2c026192773c7a14a8905d2e3 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 17:08:07 -0400
Subject: [PATCH 12/20] Revert "Revert tools/mtmd/ to match master"

This reverts commit eb6499cc7852c4c69e8628863d2b548d837a235f.
---
 tools/mtmd/clip-impl.h                        |  74 +-----
 tools/mtmd/clip.cpp                           | 246 +++---------------
 tools/mtmd/clip.h                             |   4 +
 .../convert_image_encoder_to_gguf.py          |  27 +-
 tools/mtmd/mtmd.cpp                           |  97 ++-----
 tools/mtmd/requirements.txt                   |   4 +-
 tools/mtmd/tests.sh                           |  22 +-
 7 files changed, 92 insertions(+), 382 deletions(-)

diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 7b7d2df3962..23036ba72f1 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -4,7 +4,6 @@
 
 #include <climits>
 #include <cstdarg>
-#include <cinttypes>
 #include <string>
 #include <map>
 #include <sstream>
@@ -45,7 +44,7 @@
 // tensor name constants
 //
 
-#define TN_POS_EMBD        "v.position_embd.weight"
+#define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
 #define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
@@ -111,7 +110,6 @@ enum projector_type {
     PROJECTOR_TYPE_PIXTRAL,
     PROJECTOR_TYPE_QWEN25VL,
     PROJECTOR_TYPE_INTERNVL,
-    PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -127,7 +125,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_IDEFICS3,  "idefics3"},
     { PROJECTOR_TYPE_PIXTRAL,   "pixtral"},
     { PROJECTOR_TYPE_INTERNVL,  "internvl"},
-    { PROJECTOR_TYPE_LLAMA4,    "llama4"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -243,11 +240,6 @@ struct clip_image_u8_batch {
 struct clip_image_f32_batch {
     std::vector<clip_image_f32_ptr> entries;
 
-    // for llava-uhd style models, we need to know the grid size
-    // note: entries.size() == grid_x * grid_y + 1 (one overview image)
-    int grid_x = 0;
-    int grid_y = 0;
-
     clip_image_f32_batch clone() const {
         clip_image_f32_batch new_batch;
         new_batch.entries.reserve(entries.size());
@@ -366,70 +358,6 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     }
 }
 
-//
-// debugging
-//
-
-static void print_tensor_shape(ggml_tensor * t) {
-    printf("%s.shape = [", t->name);
-    for (int i = 0; i < ggml_n_dims(t); ++i) {
-        printf("%" PRId64, t->ne[i]);
-        if (i < ggml_n_dims(t) - 1) {
-            printf(", ");
-        }
-    }
-    printf("]\n");
-}
-
-static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
-    ggml_type type = t->type;
-    int64_t * ne = t->ne;
-    size_t * nb = t->nb;
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        printf("%s.data: [\n", t->name);
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            if (i2 == n && ne[2] > 2*n) {
-                printf("     ..., \n");
-                i2 = ne[2] - n;
-            }
-            printf("     [\n");
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                if (i1 == n && ne[1] > 2*n) {
-                    printf("      ..., \n");
-                    i1 = ne[1] - n;
-                }
-                printf("      [");
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
-                        i0 = ne[0] - n;
-                    }
-                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
-                    float v;
-                    if (type == GGML_TYPE_F16) {
-                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
-                    } else if (type == GGML_TYPE_F32) {
-                        v = *(float *) &data[i];
-                    } else if (type == GGML_TYPE_I32) {
-                        v = (float) *(int32_t *) &data[i];
-                    } else if (type == GGML_TYPE_I16) {
-                        v = (float) *(int16_t *) &data[i];
-                    } else if (type == GGML_TYPE_I8) {
-                        v = (float) *(int8_t *) &data[i];
-                    } else {
-                        GGML_ABORT("fatal error");
-                    }
-                    printf("%8.4f", v);
-                    if (i0 < ne[0] - 1) printf(", ");
-                }
-                printf("],\n");
-            }
-            printf("     ],\n");
-        }
-        printf("    ]\n");
-    }
-}
-
 //
 // API used internally with mtmd
 //
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index eba07f6c82e..128a95cc11f 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -359,12 +359,9 @@ struct clip_ctx {
     int max_nodes = 8192;
     ggml_backend_sched_ptr sched;
 
-    // for debugging
-    bool debug_graph = false;
-    std::vector<ggml_tensor *> debug_print_tensors;
+    clip_image_size load_image_size;
 
     clip_ctx(clip_context_params & ctx_params) {
-        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
         backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (!backend_cpu) {
             throw std::runtime_error("failed to initialize CPU backend");
@@ -443,7 +440,7 @@ struct clip_graph {
         };
         ctx0_ptr.reset(ggml_init(params));
         ctx0 = ctx0_ptr.get();
-        gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
+        gf = ggml_new_graph(ctx0);
     }
 
     ggml_cgraph * build_siglip() {
@@ -525,7 +522,7 @@ struct clip_graph {
         ggml_set_input(pos_w);
 
         auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
+            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta);
         };
 
         ggml_tensor * inp = build_inp();
@@ -939,101 +936,6 @@ struct clip_graph {
         return gf;
     }
 
-    ggml_cgraph * build_llama4() {
-        GGML_ASSERT(model.class_embedding != nullptr);
-        GGML_ASSERT(model.position_embeddings != nullptr);
-
-        const int n_pos = n_patches + 1; // +1 for [CLS]
-
-        // 2D input positions
-        ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-        ggml_set_name(pos_h, "pos_h");
-        ggml_set_input(pos_h);
-
-        ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
-        ggml_set_name(pos_w, "pos_w");
-        ggml_set_input(pos_w);
-
-        ggml_tensor * inp = build_inp_raw();
-
-        // Llama4UnfoldConvolution
-        {
-            ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
-                                                    patch_size, patch_size, 3, n_embd);
-            inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
-            inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
-            inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
-            cb(inp, "patch_conv", -1);
-        }
-
-        // add CLS token
-        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
-
-        // build ViT with 2D position embeddings
-        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-            // first half is X axis and second half is Y axis
-            // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
-            // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
-            return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
-        };
-        ggml_tensor * cur = build_vit(
-                                inp, n_pos,
-                                NORM_TYPE_NORMAL,
-                                hparams.ffn_op,
-                                model.position_embeddings,
-                                add_pos);
-
-        // remove CLS token
-        cur = ggml_view_2d(ctx0, cur,
-            n_embd, n_patches,
-            ggml_row_size(cur->type, n_embd), 0);
-
-        // pixel shuffle
-        // based on Llama4VisionPixelShuffleMLP
-        // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
-        {
-            const int scale_factor = model.hparams.proj_scale_factor;
-            const int bsz = 1; // batch size, always 1 for now since we don't support batching
-            GGML_ASSERT(scale_factor > 0);
-            GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
-            cur = ggml_reshape_4d(ctx0, cur,
-                n_embd * scale_factor,
-                n_patches_x / scale_factor,
-                n_patches_y,
-                bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
-                n_embd * scale_factor * scale_factor,
-                n_patches_x / scale_factor,
-                n_patches_y / scale_factor,
-                bsz);
-            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
-            // flatten to 2D
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
-                n_embd * scale_factor * scale_factor,
-                n_patches / scale_factor / scale_factor);
-            cb(cur, "pixel_shuffle", -1);
-        }
-
-        // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
-        {
-            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
-            cur = ggml_gelu(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
-            cur = ggml_gelu(ctx0, cur);
-            cb(cur, "adapter_mlp", -1);
-        }
-
-        // Llama4MultiModalProjector
-        cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
-        cb(cur, "projected", -1);
-
-        // build the graph
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
     // this graph is used by llava, granite and glm
     // due to having embedding_stack (used by granite), we cannot reuse build_vit
     ggml_cgraph * build_llava() {
@@ -1413,15 +1315,11 @@ struct clip_graph {
     // utility functions
     //
 
-    void cb(ggml_tensor * cur0, const char * name, int il) const {
-        if (ctx->debug_graph) {
-            ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
-            std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
-            ggml_set_name(cur, cur_name.c_str());
-            ggml_set_output(cur);
-            ggml_build_forward_expand(gf, cur);
-            ctx->debug_print_tensors.push_back(cur);
-        }
+    void cb(ggml_tensor * cur, const char * name, int il) const {
+        // TODO: implement this
+        GGML_UNUSED(cur);
+        GGML_UNUSED(name);
+        GGML_UNUSED(il);
     }
 
     // build vision transformer (ViT) cgraph
@@ -1732,10 +1630,9 @@ struct clip_graph {
     static ggml_tensor * build_rope_2d(
         ggml_context * ctx0,
         ggml_tensor * cur,
-        ggml_tensor * pos_a, // first half
-        ggml_tensor * pos_b, // second half
-        const float freq_base,
-        const bool interleave_freq
+        ggml_tensor * pos_h,
+        ggml_tensor * pos_w,
+        const float freq_base
     ) {
         const int64_t n_dim  = cur->ne[0];
         const int64_t n_head = cur->ne[1];
@@ -1749,9 +1646,7 @@ struct clip_graph {
         //  ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
         // then for the second half, we use freq_scale to shift the inv_freq
         //  ^ why? replace (2i) with (2i+1) in the above equation
-        const float freq_scale_odd = interleave_freq
-                                    ? std::pow(freq_base, (float)-2/n_dim)
-                                    : 1.0;
+        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
 
         // first half
         ggml_tensor * first;
@@ -1764,7 +1659,7 @@ struct clip_graph {
             first = ggml_rope_ext(
                 ctx0,
                 first,
-                pos_a,      // positions
+                pos_h,      // positions
                 nullptr,    // freq factors
                 n_dim/2,    // n_dims
                 0, 0, freq_base,
@@ -1784,7 +1679,7 @@ struct clip_graph {
             second = ggml_rope_ext(
                 ctx0,
                 second,
-                pos_b,      // positions
+                pos_w,      // positions
                 nullptr,    // freq factors
                 n_dim/2,    // n_dims
                 0, 0, freq_base,
@@ -1828,10 +1723,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 res = graph.build_internvl();
             } break;
-        case PROJECTOR_TYPE_LLAMA4:
-            {
-                res = graph.build_llama4();
-            } break;
         default:
             {
                 res = graph.build_llava();
@@ -2035,21 +1926,6 @@ struct clip_model_loader {
                         hparams.warmup_image_size = hparams.patch_size * 8;
                         get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
                     } break;
-                case PROJECTOR_TYPE_LLAMA4:
-                    {
-                        hparams.rope_theta = 10000.0f;
-                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
-
-                        // borrowed from llava-1.6
-                        const int isize = hparams.image_size;
-                        hparams.image_grid_pinpoints = {
-                            isize,   isize*2, // 336, 672
-                            isize*2, isize,   // 672, 336
-                            isize*2, isize*2, // 672, 672
-                            isize*3, isize,   // 1008, 336
-                            isize,   isize*3, // 336, 1008
-                        };
-                    } break;
                 default:
                     break;
             }
@@ -2070,10 +1946,6 @@ struct clip_model_loader {
             LOG_INF("%s: ffn_op:             %s\n", __func__, log_ffn_op.c_str());
             LOG_INF("%s: model size:         %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
             LOG_INF("%s: metadata size:      %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
-
-            if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) {
-                LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
-            }
         }
     }
 
@@ -2129,7 +2001,7 @@ struct clip_model_loader {
         vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD,   false);
         vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
 
-        vision_model.position_embeddings = get_tensor(TN_POS_EMBD, false);
+        vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
 
         // layers
         vision_model.layers.resize(hparams.n_layer);
@@ -2310,12 +2182,6 @@ struct clip_model_loader {
                     vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
                     vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
                 } break;
-            case PROJECTOR_TYPE_LLAMA4:
-                {
-                    vision_model.mm_model_proj    = get_tensor(TN_MM_PROJECTOR);
-                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
-                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
-                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -2462,6 +2328,14 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
     return ctx_clip;
 }
 
+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
+    ctx_clip->load_image_size = *load_image_size; // copy
+}
+
+struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
+    return &ctx_clip->load_image_size;
+}
+
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
@@ -2975,7 +2849,7 @@ struct llava_uhd {
 
     // used by llava 1.6 with custom list of pinpoints
     static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
-        std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
+        std::vector<clip_image_size> possible_resolutions;
         for (size_t i = 0; i < pinpoints.size(); i += 2) {
             possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
         }
@@ -3042,6 +2916,12 @@ struct llava_uhd {
     }
 };
 
+// TODO @ngxson : decprecate the load_image_size singleton pattern
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
+    const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
+    return inst.grid_size.width;
+}
+
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -3063,12 +2943,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
             res_imgs->entries.push_back(std::move(res));
         }
-
-        res_imgs->grid_x = inst.grid_size.width;
-        res_imgs->grid_y = inst.grid_size.height;
         return true;
-
-    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
         clip_image_u8 resized;
         auto patch_size = params.patch_size * 2;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -3094,8 +2971,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
-
-    } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
         clip_image_u8 resized_image;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
         image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
@@ -3103,22 +2980,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
-
-    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
-        GGML_ASSERT(!params.image_grid_pinpoints.empty());
-        auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
-        std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
-
-        for (size_t i = 0; i < imgs.size(); ++i) {
-            clip_image_f32_ptr res(clip_image_f32_init());
-            normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
-            res_imgs->entries.push_back(std::move(res));
-        }
-
-        res_imgs->grid_x = inst.grid_size.width;
-        res_imgs->grid_y = inst.grid_size.height;
-        return true;
-
     }
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@@ -3237,7 +3098,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-    int scale_factor = ctx->vision_model.hparams.proj_scale_factor;
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP
             || ctx->proj_type == PROJECTOR_TYPE_LDPV2
@@ -3276,8 +3136,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
         int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
         n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
-    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
-        n_patches /= (scale_factor * scale_factor);
     }
 
     return n_patches;
@@ -3389,7 +3247,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
-    ctx->debug_print_tensors.clear();
     ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
     ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3404,8 +3261,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size    = hparams.patch_size;
     const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
-    const int pos_w = image_size_width  / patch_size;
-    const int pos_h = image_size_height / patch_size;
+    const int pos_w = ctx->load_image_size.width  / patch_size;
+    const int pos_h = ctx->load_image_size.height / patch_size;
 
     const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
 
@@ -3671,23 +3528,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             {
                 // do nothing
             } break;
-        case PROJECTOR_TYPE_LLAMA4:
-            {
-                // set the 2D positions
-                int n_patches_per_col = image_size_width / patch_size;
-                std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
-                // last pos is always kept 0, it's for CLS
-                // dimension H
-                for (int i = 0; i < num_patches; i++) {
-                    pos_data[i] = (i / n_patches_per_col) + 1;
-                }
-                set_input_i32("pos_h", pos_data);
-                // dimension W
-                for (int i = 0; i < num_patches; i++) {
-                    pos_data[i] = (i % n_patches_per_col) + 1;
-                }
-                set_input_i32("pos_w", pos_data);
-            } break;
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -3708,18 +3548,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
-    // print debug nodes
-    if (ctx->debug_graph) {
-        LOG_INF("\n\n---\n\n");
-        LOG_INF("\n\nDebug graph:\n\n");
-        for (ggml_tensor * t : ctx->debug_print_tensors) {
-            std::vector<uint8_t> data(ggml_nbytes(t));
-            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
-            print_tensor_shape(t);
-            print_tensor_data(t, data.data(), 3);
-        }
-    }
-
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
@@ -3768,8 +3596,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->vision_model.projection->ne[1];
         case PROJECTOR_TYPE_INTERNVL:
             return ctx->vision_model.mm_3_w->ne[1];
-        case PROJECTOR_TYPE_LLAMA4:
-            return ctx->vision_model.mm_model_proj->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index e7a1c0782dd..2d70eec9473 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -47,6 +47,10 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
 // this should be equal to the embedding dimension of the text model
 int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
+
 struct clip_image_size      * clip_image_size_init(void);
 struct clip_image_u8        * clip_image_u8_init (void);
 struct clip_image_f32       * clip_image_f32_init(void);
diff --git a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
index 2949faec421..c664c4e15a2 100644
--- a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
+++ b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -6,6 +6,10 @@
 import torch
 import numpy as np
 from gguf import *
+from typing import cast
+from torch.nn import ModuleList
+from transformers.models.clip.modeling_clip import CLIPVisionTransformer
+from transformers import PreTrainedModel
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
 
 TEXT = "clip.text"
@@ -162,13 +166,13 @@ def bytes_to_unicode():
     ftype = 0
 
 if args.clip_model_is_siglip:
-    model = SiglipVisionModel.from_pretrained(dir_model)
+    model: PreTrainedModel = SiglipVisionModel.from_pretrained(dir_model)
     processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
-    model = CLIPVisionModel.from_pretrained(dir_model)
+    model: PreTrainedModel = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
 else:
-    model = CLIPModel.from_pretrained(dir_model)
+    model: PreTrainedModel = CLIPModel.from_pretrained(dir_model)
     processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
@@ -350,9 +354,14 @@ def get_non_negative_vision_feature_layers(v_hparams):
     # By default, we drop the last layer for llava projector
     # models unless we have explicitly set vision feature layers
     if feature_layers is None:
-        model.vision_model.encoder.layers.pop(-1)
+        vision_model = cast(CLIPVisionTransformer, model.vision_model)
+        encoder_layers = vision_model.encoder.layers
+        encoder_layers.pop(-1)
     else:
-        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
+        vision_model = cast(CLIPVisionTransformer, model.vision_model)
+        encoder_layers = vision_model.encoder.layers
+        encoder_layers = cast(ModuleList, encoder_layers)
+        encoder_layers.__init__(encoder_layers[:max(feature_layers)])
 
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
@@ -375,7 +384,7 @@ def get_non_negative_vision_feature_layers(v_hparams):
         continue
 
     name = get_tensor_name(name)
-    data = data.squeeze().numpy()
+    data = np.ascontiguousarray(data.detach().cpu().squeeze().numpy())
 
     n_dims = len(data.shape)
 
@@ -383,16 +392,16 @@ def get_non_negative_vision_feature_layers(v_hparams):
     ftype_cur = 0
     if n_dims == 4:
         print(f"tensor {name} is always saved in f16")
-        data = data.astype(np.float16)
+        data = np.asarray(data, dtype=np.float16)
         ftype_cur = 1
     elif ftype == 1:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
-            data = data.astype(np.float16)
+            data = np.asarray(data, dtype=np.float16)
             ftype_cur = 1
         else:
             print("  Converting to float32")
-            data = data.astype(np.float32)
+            data = np.asarray(data, dtype=np.float32)
             ftype_cur = 0
     else:
         if data.dtype != np.float32:
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 1234dbb4687..2a852d9c19b 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -42,7 +42,6 @@ enum mtmd_slice_tmpl {
     MTMD_SLICE_TMPL_NONE,
     MTMD_SLICE_TMPL_MINICPMV_2_5,
     MTMD_SLICE_TMPL_MINICPMV_2_6,
-    MTMD_SLICE_TMPL_LLAMA4,
     // TODO @ngxson : add support for idefics (SmolVLM)
 };
 
@@ -65,19 +64,15 @@ struct mtmd_context {
     int n_threads;
     std::string image_marker;
 
-    // for llava-uhd style models, we need special tokens in-between slices
-    // minicpmv calls them "slices", llama 4 calls them "tiles"
+    // for minicpmv, we need special tokens in-between slices
     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
     llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
     llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
-    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
-    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
     llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
-    bool        tok_row_end_trail = false;
-    bool        ov_img_first      = false;
 
     bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
 
@@ -101,7 +96,6 @@ struct mtmd_context {
 
         use_mrope = clip_is_qwen2vl(ctx_clip);
 
-        projector_type proj = clip_get_projector_type(ctx_clip);
         int minicpmv_version = clip_is_minicpmv(ctx_clip);
         if (minicpmv_version == 2) {
             // minicpmv 2.5 format:
@@ -114,8 +108,6 @@ struct mtmd_context {
             tok_sli_img_start = tok_ov_img_start;
             tok_sli_img_end   = tok_ov_img_end;
             tok_row_end       = lookup_token("\n");
-            tok_row_end_trail = false; // no trailing end-of-row token
-            ov_img_first      = true;
 
         } else if (minicpmv_version == 3 || minicpmv_version == 4) {
             // minicpmv 2.6 format:
@@ -126,25 +118,9 @@ struct mtmd_context {
             tok_sli_img_start = lookup_token("<slice>");
             tok_sli_img_end   = lookup_token("</slice>");
             tok_row_end       = lookup_token("\n");
-            tok_row_end_trail = false; // no trailing end-of-row token
-            ov_img_first      = true;
 
         } else if (minicpmv_version != 0) {
             GGML_ASSERT(false && "unsupported minicpmv version");
-        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
-            // llama 4 format:
-            // <|image_start|>
-            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
-            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
-            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
-            // <|image|> (overview)           <-- overview image is last
-            // <|image_end|>
-            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
-            tok_ov_img_start  = lookup_token("<|image|>");
-            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
-            tok_row_end       = lookup_token("<|tile_y_separator|>");
-            tok_row_end_trail = true; // add trailing end-of-row token
-            ov_img_first      = false; // overview image is last
         }
     }
 
@@ -267,18 +243,16 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
         marker_modified = ctx->image_marker + "[IMG_END]";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+    }
 
-    } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
         marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
 
-    } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
-        // (more details in mtmd_context constructor)
-        marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+    }
 
-    } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
+    else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
         // <img> ... (image embeddings) ... </img>
         marker_modified = "<img>" + ctx->image_marker + "</img>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
@@ -354,6 +328,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             img_u8->ny = bitmaps[i_img]->ny;
             img_u8->buf.resize(bitmaps[i_img]->data.size());
             std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
+            clip_image_size img_u8_size{img_u8->nx, img_u8->ny};
 
             // preprocess image
             clip_image_f32_batch batch_f32;
@@ -363,40 +338,28 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 2;
             }
 
-            // handle llava-uhd style preprocessing
-            if (
-                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
-            ) {
+            if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
                 // split batch into chunks of single images
                 auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
                 GGML_ASSERT(chunks.size() > 0);
 
-                auto ov_chunk = std::move(chunks.front());
+                // add overview image
+                add_text_chunk({ctx->tok_ov_img_start});
+                output->entries.emplace_back(std::move(chunks.front()));
                 chunks.erase(chunks.begin());
+                add_text_chunk({ctx->tok_ov_img_end});
 
-                // add overview image (first)
-                if (ctx->ov_img_first) {
-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_start});
-                    }
-                    output->entries.emplace_back(std::move(ov_chunk));
-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_end});
-                    }
-                }
-
-                // add slices (or tiles)
+                // add slices
                 if (!chunks.empty()) {
-                    const int n_col = batch_f32.grid_x;
-                    const int n_row = batch_f32.grid_y;
+                    clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
+                    int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
+                    int n_row = (int)chunks.size() / n_col;
+                    GGML_ASSERT(n_row * n_col == (int)chunks.size());
                     if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
                         add_text_chunk({ctx->tok_slices_start});
                     }
                     for (int y = 0; y < n_row; y++) {
                         for (int x = 0; x < n_col; x++) {
-                            const bool is_last_in_row = (x == n_col - 1);
                             if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
                                 add_text_chunk({ctx->tok_sli_img_start});
                             }
@@ -404,11 +367,8 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                             if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
                                 add_text_chunk({ctx->tok_sli_img_end});
                             }
-                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_img_mid});
-                            }
                         }
-                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
+                        if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
                             add_text_chunk({ctx->tok_row_end});
                         }
                     }
@@ -417,17 +377,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                     }
                 }
 
-                // add overview image (last)
-                if (!ctx->ov_img_first) {
-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_start});
-                    }
-                    output->entries.emplace_back(std::move(ov_chunk));
-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_end});
-                    }
-                }
-
             } else {
                 size_t n_tokens = 0;
                 for (const auto & entry : batch_f32.entries) {
@@ -478,6 +427,14 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;
 
+    // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
+    {
+        clip_image_size slice_size{
+            image_tokens->batch_f32.entries[0]->nx,
+            image_tokens->batch_f32.entries[0]->ny};
+        clip_add_load_image_size(ctx->ctx_clip, &slice_size);
+    }
+
     if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt
index cbcbf26c9b4..513f9dda513 100644
--- a/tools/mtmd/requirements.txt
+++ b/tools/mtmd/requirements.txt
@@ -1,5 +1,5 @@
 -r ../../requirements/requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
-torch~=2.2.1
-torchvision~=0.17.1
+torch>=2.5
+torchvision>=0.20.1
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index 15a37b0d22b..05ac7a04d8f 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -21,13 +21,6 @@ if [ "${1:-}" = "big" ]; then
     echo "Include BIG models..."
 fi
 
-RUN_HUGE_TESTS=false
-if [ "${1:-}" = "huge" ]; then
-    RUN_HUGE_TESTS=true
-    RUN_BIG_TESTS=true
-    echo "Include BIG models..."
-fi
-
 ###############
 
 arr_bin=()
@@ -49,7 +42,7 @@ add_test "llama-mtmd-cli"  "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test "llama-mtmd-cli"  "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/Llava-v1.5-7B-GGUF:Q2_K"            "vicuna"
-add_test "llama-mtmd-cli"  "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M"         "vicuna"
+add_test "llama-mtmd-cli"  "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"           "vicuna"
 add_test "llama-mtmd-cli"  "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli"  "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K"  # model from openbmb is corrupted
 add_test "llama-mtmd-cli"  "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -67,17 +60,10 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
     add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli"  "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
+    add_test "llama-mtmd-cli"  "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
     # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-fi
-
-# to test the huge models, run: ./tests.sh huge
-# this will run both the big and huge models
-# huge models are > 32B parameters
-if [ "$RUN_HUGE_TESTS" = true ]; then
-    add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
-    add_test "llama-mtmd-cli" "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
+    # add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M" # too big
 fi
 
 # these models always give the wrong answer, not sure why

From 143473053313817177b914e5fc09e9febbb5c24b Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 17:31:24 -0400
Subject: [PATCH 13/20] reverting change to legacy-models script

---
 .../convert_image_encoder_to_gguf.py          | 27 +++++++------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
index c664c4e15a2..2949faec421 100644
--- a/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
+++ b/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -6,10 +6,6 @@
 import torch
 import numpy as np
 from gguf import *
-from typing import cast
-from torch.nn import ModuleList
-from transformers.models.clip.modeling_clip import CLIPVisionTransformer
-from transformers import PreTrainedModel
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
 
 TEXT = "clip.text"
@@ -166,13 +162,13 @@ def bytes_to_unicode():
     ftype = 0
 
 if args.clip_model_is_siglip:
-    model: PreTrainedModel = SiglipVisionModel.from_pretrained(dir_model)
+    model = SiglipVisionModel.from_pretrained(dir_model)
     processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
-    model: PreTrainedModel = CLIPVisionModel.from_pretrained(dir_model)
+    model = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
 else:
-    model: PreTrainedModel = CLIPModel.from_pretrained(dir_model)
+    model = CLIPModel.from_pretrained(dir_model)
     processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
@@ -354,14 +350,9 @@ def get_non_negative_vision_feature_layers(v_hparams):
     # By default, we drop the last layer for llava projector
     # models unless we have explicitly set vision feature layers
     if feature_layers is None:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers.pop(-1)
+        model.vision_model.encoder.layers.pop(-1)
     else:
-        vision_model = cast(CLIPVisionTransformer, model.vision_model)
-        encoder_layers = vision_model.encoder.layers
-        encoder_layers = cast(ModuleList, encoder_layers)
-        encoder_layers.__init__(encoder_layers[:max(feature_layers)])
+        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
 
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
@@ -384,7 +375,7 @@ def get_non_negative_vision_feature_layers(v_hparams):
         continue
 
     name = get_tensor_name(name)
-    data = np.ascontiguousarray(data.detach().cpu().squeeze().numpy())
+    data = data.squeeze().numpy()
 
     n_dims = len(data.shape)
 
@@ -392,16 +383,16 @@ def get_non_negative_vision_feature_layers(v_hparams):
     ftype_cur = 0
     if n_dims == 4:
         print(f"tensor {name} is always saved in f16")
-        data = np.asarray(data, dtype=np.float16)
+        data = data.astype(np.float16)
         ftype_cur = 1
     elif ftype == 1:
         if name[-7:] == ".weight" and n_dims == 2:
             print("  Converting to float16")
-            data = np.asarray(data, dtype=np.float16)
+            data = data.astype(np.float16)
             ftype_cur = 1
         else:
             print("  Converting to float32")
-            data = np.asarray(data, dtype=np.float32)
+            data = data.astype(np.float32)
             ftype_cur = 0
     else:
         if data.dtype != np.float32:

From afd66e9c2a96c50247c57b4d554d0fec1bdc90ef Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 17:40:38 -0400
Subject: [PATCH 14/20] Excludes legacy models from type checking

Updates the pyright configuration to exclude the legacy models directory from type checking.
---
 pyrightconfig.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyrightconfig.json b/pyrightconfig.json
index 07f267e6f1c..8c89e6ff774 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -20,7 +20,7 @@
     },
   ],
   "exclude": [
-    "tools/legacy",
+    "tools/mtmd/legacy-models",
     "tests"
   ]
  }

From 6ee825381b5cdecedb42d4ca5b43d1cf4d09f2c1 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 17:44:53 -0400
Subject: [PATCH 15/20] Configures pyright to ignore invalid type forms

Temporarily disables `reportInvalidTypeForm` in pyright configuration due to ongoing transition to the new type system in numpy 2.2.x.
---
 gguf-py/gguf/gguf_reader.py | 2 --
 pyrightconfig.json          | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index e2bccef7eee..81fa0f1b05d 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -1,5 +1,3 @@
-# pyright: reportInvalidTypeForm=false
-#
 # GGUF file reading/modification support. For API usage information,
 # please see the files scripts/ for some fairly simple examples.
 #
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 8c89e6ff774..d6409972d83 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -2,6 +2,7 @@
   "extraPaths": ["gguf-py"],
   "pythonVersion": "3.10",
   "pythonPlatform": "All",
+  "reportInvalidTypeForm": false, // TODO: remove once numpy 2.2.x resolves the transition to their new type system
   "reportUnusedImport": "warning",
   "reportDuplicateImport": "error",
   "reportDeprecated": "warning",

From d49b004f8fa17e34c2df38cddec1f61a0460caeb Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 18:07:47 -0400
Subject: [PATCH 16/20] Uses `np.dtype` for NumPy type hints in GGUF reader

Replaces bare NumPy scalar types (e.g., `np.uint32`) passed as dtype
arguments with explicit `np.dtype(np.uint32)` instances in the GGUF
reader. This aligns with changes in NumPy 2.0+ and ensures forward
compatibility.

Also applies this change to the quantize/dequantize functions in the
quants module.
---
 gguf-py/gguf/gguf_reader.py | 26 +++++++++++++-------------
 gguf-py/gguf/quants.py      |  8 ++++----
 pyrightconfig.json          |  2 +-
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index 81fa0f1b05d..d71b110a8c8 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -134,12 +134,12 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
         offs = 0
 
         # Check for GGUF magic
-        if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
+        if self._get(offs, np.dtype(np.uint32), override_order = '<')[0] != GGUF_MAGIC:
             raise ValueError('GGUF magic invalid')
         offs += 4
 
         # Check GGUF version
-        temp_version = self._get(offs, np.uint32)
+        temp_version = self._get(offs, np.dtype(np.uint32))
         if temp_version[0] & 65535 == 0:
             # If we get 0 here that means it's (probably) a GGUF file created for
             # the opposite byte order of the machine this script is running on.
@@ -162,7 +162,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
         offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
 
         # Check tensor count and kv count
-        temp_counts = self._get(offs, np.uint64, 2)
+        temp_counts = self._get(offs, np.dtype(np.uint64), 2)
         offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
         offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
         tensor_count, kv_count = temp_counts
@@ -212,8 +212,8 @@ def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)
 
     def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
-        slen = self._get(offset, np.uint64)
-        return slen, self._get(offset + 8, np.uint8, slen[0].item())
+        slen = self._get(offset, np.dtype(np.uint64))
+        return slen, self._get(offset + 8, np.dtype(np.uint8), slen[0].item())
 
     def _get_field_parts(
         self, orig_offs: int, raw_type: int,
@@ -234,9 +234,9 @@ def _get_field_parts(
             return int(val.nbytes), [val], [0], types
         # Handle arrays.
         if gtype == GGUFValueType.ARRAY:
-            raw_itype = self._get(offs, np.uint32)
+            raw_itype = self._get(offs, np.dtype(np.uint32))
             offs += int(raw_itype.nbytes)
-            alen = self._get(offs, np.uint64)
+            alen = self._get(offs, np.dtype(np.uint64))
             offs += int(alen.nbytes)
             aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
             data_idxs: list[int] = []
@@ -261,19 +261,19 @@ def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs += int(name_len.nbytes + name_data.nbytes)
 
         # Get Tensor Dimensions Count
-        n_dims = self._get(offs, np.uint32)
+        n_dims = self._get(offs, np.dtype(np.uint32))
         offs += int(n_dims.nbytes)
 
         # Get Tensor Dimension Array
-        dims = self._get(offs, np.uint64, n_dims[0].item())
+        dims = self._get(offs, np.dtype(np.uint64), n_dims[0].item())
         offs += int(dims.nbytes)
 
         # Get Tensor Encoding Scheme Type
-        raw_dtype = self._get(offs, np.uint32)
+        raw_dtype = self._get(offs, np.dtype(np.uint32))
         offs += int(raw_dtype.nbytes)
 
         # Get Tensor Offset
-        offset_tensor = self._get(offs, np.uint64)
+        offset_tensor = self._get(offs, np.dtype(np.uint64))
         offs += int(offset_tensor.nbytes)
 
         return ReaderField(
@@ -288,7 +288,7 @@ def _build_fields(self, offs: int, count: int) -> int:
             orig_offs = offs
             kv_klen, kv_kdata = self._get_str(offs)
             offs += int(kv_klen.nbytes + kv_kdata.nbytes)
-            raw_kv_type = self._get(offs, np.uint32)
+            raw_kv_type = self._get(offs, np.dtype(np.uint32))
             offs += int(raw_kv_type.nbytes)
             parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type]
             idxs_offs = len(parts)
@@ -352,7 +352,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
                 item_type = np.dtype(np.int64)
             else:
                 item_count = n_bytes
-                item_type = np.uint8
+                item_type = np.dtype(np.uint8)
                 np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
                 name = tensor_name,
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 26672909f97..735758577ee 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -91,11 +91,11 @@ def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
         cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
         cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
             cls.__quantize_array,
-            meta_noop=(np.uint8, cls.__shape_to_bytes)
+            meta_noop=(np.dtype(np.uint8), cls.__shape_to_bytes)
         )
         cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
             cls.__dequantize_array,
-            meta_noop=(np.float32, cls.__shape_from_bytes)
+            meta_noop=(np.dtype(np.float32), cls.__shape_from_bytes)
         )
         assert qtype not in _type_traits
         _type_traits[qtype] = cls
@@ -163,12 +163,12 @@ def __shape_from_bytes(cls, shape: Sequence[int]):
 
     @classmethod
     def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
-        return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))
+        return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.dtype(np.uint8), oshape=cls.__shape_to_bytes(array.shape))
 
     @classmethod
     def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
         cls.init_grid()
-        return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
+        return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.dtype(np.float32), oshape=cls.__shape_from_bytes(array.shape))
 
     @classmethod
     def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
diff --git a/pyrightconfig.json b/pyrightconfig.json
index d6409972d83..f87acf35bdc 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -2,7 +2,7 @@
   "extraPaths": ["gguf-py"],
   "pythonVersion": "3.10",
   "pythonPlatform": "All",
-  "reportInvalidTypeForm": false, // TODO: remove once numpy 2.2.x resolves the transition to their new type system
+  "reportInvalidTypeForm": false,
   "reportUnusedImport": "warning",
   "reportDuplicateImport": "error",
   "reportDeprecated": "warning",

From a5c74dd2d7284a2c9eae0b6ccf92aadd5bad2b50 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 18:29:48 -0400
Subject: [PATCH 17/20] reverting removal of DTypeLike changes

DTypeLike was removed because pyright was choking on it against numpy 2.2.1. Now numpy at 2.2.6 seems to have the changes needed for pyright to not have issues with it. Additionally, DTypeLike is accurate in a way that np.dtype[Any] was not.
---
 gguf-py/gguf/gguf_reader.py | 6 +++---
 gguf-py/gguf/lazy.py        | 5 +++--
 gguf-py/gguf/quants.py      | 4 +++-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index d71b110a8c8..e35553d2ad9 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -181,7 +181,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
         self.data_offset = offs
         self._build_tensors(offs, tensors_fields)
 
-    _DT = TypeVar('_DT', bound = np.dtype[Any])
+    _DT = TypeVar('_DT', bound = npt.DTypeLike)
 
     # Fetch a key/value metadata field by key.
     def get_field(self, key: str) -> Union[ReaderField, None]:
@@ -192,7 +192,7 @@ def get_tensor(self, idx: int) -> ReaderTensor:
         return self.tensors[idx]
 
     def _get(
-        self, offset: int, dtype: np.dtype[Any], count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
+        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
     ) -> np.ndarray:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
@@ -328,7 +328,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
-            item_type: np.dtype[Any]
+            item_type: npt.DTypeLike
             if ggml_type == GGMLQuantizationType.F16:
                 item_count = n_elems
                 item_type = np.dtype(np.float16)
diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py
index 73f580bd479..f9bcadae022 100644
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -5,6 +5,7 @@
 from typing import Any, Callable
 
 import numpy as np
+from numpy.typing import DTypeLike
 
 
 logger = logging.getLogger(__name__)
@@ -106,7 +107,7 @@ def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
             return o
 
     @classmethod
-    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | np.dtype[Any] | tuple[np.dtype[Any], Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
         def wrapped_fn(*args, **kwargs):
             if kwargs is None:
                 kwargs = {}
@@ -203,7 +204,7 @@ class LazyNumpyTensor(LazyBase):
     shape: tuple[int, ...]  # Makes the type checker happy in quants.py
 
     @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: np.dtype[Any], shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,
         # but non-float types like np.int16 can't use that.
         # So zero it is.
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 735758577ee..9a12eea9dda 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -3,6 +3,8 @@
 from typing import Any, Callable, Sequence
 from math import log2, ceil
 
+from numpy.typing import DTypeLike
+
 from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
 from .lazy import LazyNumpyTensor
 
@@ -24,7 +26,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati
 
 
 # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
-def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: np.dtype[Any], oshape: tuple[int, ...]) -> np.ndarray:
+def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
     rows = arr.reshape((-1, arr.shape[-1]))
     osize = 1
     for dim in oshape:

From 340075537d4a7b67f93e71f02d1edb3f262d5302 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Mon, 19 May 2025 19:04:07 -0400
Subject: [PATCH 18/20] Fixes byteorder check and removes unrequested exclusion
 of tests

Updates the byteorder check in the conversion script to be more robust.

Removes the "tests" directory from the pyright exclude list.
---
 examples/convert_legacy_llama.py | 2 +-
 pyrightconfig.json               | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py
index c4ec5c524e9..710cca96be1 100755
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@@ -1308,7 +1308,7 @@ def do_dump_model(model_plus: ModelPlus) -> None:
 
 def main(args_in: list[str] | None = None) -> None:
     output_choices = ["f32", "f16"]
-    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
+    if np.uint32(1) == np.uint32(1).view(np.dtype(np.uint32).newbyteorder("<")):
         # We currently only support Q8_0 output on little endian systems.
         output_choices.append("q8_0")
     parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
diff --git a/pyrightconfig.json b/pyrightconfig.json
index f87acf35bdc..4fc9822c011 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -22,6 +22,5 @@
   ],
   "exclude": [
     "tools/mtmd/legacy-models",
-    "tests"
   ]
  }

From 4802d01df197786e8fa8d57950ba5cee41a9e195 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Tue, 20 May 2025 10:54:02 -0400
Subject: [PATCH 19/20] reverting unnecessary changes in gguf-py/gguf as of
 numpy 2.2.6

---
 gguf-py/gguf/gguf_reader.py | 50 ++++++++++++++++++-------------------
 gguf-py/gguf/quants.py      |  8 +++---
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index e35553d2ad9..5991cdb76be 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -1,3 +1,4 @@
+#
 # GGUF file reading/modification support. For API usage information,
 # please see the files scripts/ for some fairly simple examples.
 #
@@ -14,7 +15,6 @@
 
 from .quants import quant_shape_to_byte_shape
 
-
 if __name__ == "__main__":
     from pathlib import Path
 
@@ -134,12 +134,12 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
         offs = 0
 
         # Check for GGUF magic
-        if self._get(offs, np.dtype(np.uint32), override_order = '<')[0] != GGUF_MAGIC:
+        if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
             raise ValueError('GGUF magic invalid')
         offs += 4
 
         # Check GGUF version
-        temp_version = self._get(offs, np.dtype(np.uint32))
+        temp_version = self._get(offs, np.uint32)
         if temp_version[0] & 65535 == 0:
             # If we get 0 here that means it's (probably) a GGUF file created for
             # the opposite byte order of the machine this script is running on.
@@ -162,7 +162,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
         offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
 
         # Check tensor count and kv count
-        temp_counts = self._get(offs, np.dtype(np.uint64), 2)
+        temp_counts = self._get(offs, np.uint64, 2)
         offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
         offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
         tensor_count, kv_count = temp_counts
@@ -193,7 +193,7 @@ def get_tensor(self, idx: int) -> ReaderTensor:
 
     def _get(
         self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
-    ) -> np.ndarray:
+    ) -> npt.NDArray[Any]:
         count = int(count)
         itemsize = int(np.empty([], dtype = dtype).itemsize)
         end_offs = offset + itemsize * count
@@ -212,8 +212,8 @@ def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)
 
     def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
-        slen = self._get(offset, np.dtype(np.uint64))
-        return slen, self._get(offset + 8, np.dtype(np.uint8), slen[0].item())
+        slen = self._get(offset, np.uint64)
+        return slen, self._get(offset + 8, np.uint8, slen[0])
 
     def _get_field_parts(
         self, orig_offs: int, raw_type: int,
@@ -230,19 +230,19 @@ def _get_field_parts(
         # Check if it's a simple scalar type.
         nptype = self.gguf_scalar_to_np.get(gtype)
         if nptype is not None:
-            val = self._get(offs, np.dtype(nptype))
+            val = self._get(offs, nptype)
             return int(val.nbytes), [val], [0], types
         # Handle arrays.
         if gtype == GGUFValueType.ARRAY:
-            raw_itype = self._get(offs, np.dtype(np.uint32))
+            raw_itype = self._get(offs, np.uint32)
             offs += int(raw_itype.nbytes)
-            alen = self._get(offs, np.dtype(np.uint64))
+            alen = self._get(offs, np.uint64)
             offs += int(alen.nbytes)
             aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
             data_idxs: list[int] = []
             # FIXME: Handle multi-dimensional arrays properly instead of flattening
             for idx in range(alen[0]):
-                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0].item())
+                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0])
                 if idx == 0:
                     types += curr_types
                 idxs_offs = len(aparts)
@@ -261,19 +261,19 @@ def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs += int(name_len.nbytes + name_data.nbytes)
 
         # Get Tensor Dimensions Count
-        n_dims = self._get(offs, np.dtype(np.uint32))
+        n_dims = self._get(offs, np.uint32)
         offs += int(n_dims.nbytes)
 
         # Get Tensor Dimension Array
-        dims = self._get(offs, np.dtype(np.uint64), n_dims[0].item())
+        dims = self._get(offs, np.uint64, n_dims[0])
         offs += int(dims.nbytes)
 
         # Get Tensor Encoding Scheme Type
-        raw_dtype = self._get(offs, np.dtype(np.uint32))
+        raw_dtype = self._get(offs, np.uint32)
         offs += int(raw_dtype.nbytes)
 
         # Get Tensor Offset
-        offset_tensor = self._get(offs, np.dtype(np.uint64))
+        offset_tensor = self._get(offs, np.uint64)
         offs += int(offset_tensor.nbytes)
 
         return ReaderField(
@@ -288,11 +288,11 @@ def _build_fields(self, offs: int, count: int) -> int:
             orig_offs = offs
             kv_klen, kv_kdata = self._get_str(offs)
             offs += int(kv_klen.nbytes + kv_kdata.nbytes)
-            raw_kv_type = self._get(offs, np.dtype(np.uint32))
+            raw_kv_type = self._get(offs, np.uint32)
             offs += int(raw_kv_type.nbytes)
             parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type]
             idxs_offs = len(parts)
-            field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0].item())
+            field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0])
             parts += field_parts
             self._push_field(ReaderField(
                 orig_offs,
@@ -331,28 +331,28 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
             item_type: npt.DTypeLike
             if ggml_type == GGMLQuantizationType.F16:
                 item_count = n_elems
-                item_type = np.dtype(np.float16)
+                item_type = np.float16
             elif ggml_type == GGMLQuantizationType.F32:
                 item_count = n_elems
-                item_type = np.dtype(np.float32)
+                item_type = np.float32
             elif ggml_type == GGMLQuantizationType.F64:
                 item_count = n_elems
-                item_type = np.dtype(np.float64)
+                item_type = np.float64
             elif ggml_type == GGMLQuantizationType.I8:
                 item_count = n_elems
-                item_type = np.dtype(np.int8)
+                item_type = np.int8
             elif ggml_type == GGMLQuantizationType.I16:
                 item_count = n_elems
-                item_type = np.dtype(np.int16)
+                item_type = np.int16
             elif ggml_type == GGMLQuantizationType.I32:
                 item_count = n_elems
-                item_type = np.dtype(np.int32)
+                item_type = np.int32
             elif ggml_type == GGMLQuantizationType.I64:
                 item_count = n_elems
-                item_type = np.dtype(np.int64)
+                item_type = np.int64
             else:
                 item_count = n_bytes
-                item_type = np.dtype(np.uint8)
+                item_type = np.uint8
                 np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
                 name = tensor_name,
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 9a12eea9dda..3c8ba82e19d 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -93,11 +93,11 @@ def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
         cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
         cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
             cls.__quantize_array,
-            meta_noop=(np.dtype(np.uint8), cls.__shape_to_bytes)
+            meta_noop=(np.uint8, cls.__shape_to_bytes)
         )
         cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
             cls.__dequantize_array,
-            meta_noop=(np.dtype(np.float32), cls.__shape_from_bytes)
+            meta_noop=(np.float32, cls.__shape_from_bytes)
         )
         assert qtype not in _type_traits
         _type_traits[qtype] = cls
@@ -165,12 +165,12 @@ def __shape_from_bytes(cls, shape: Sequence[int]):
 
     @classmethod
     def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
-        return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.dtype(np.uint8), oshape=cls.__shape_to_bytes(array.shape))
+        return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))
 
     @classmethod
     def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
         cls.init_grid()
-        return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.dtype(np.float32), oshape=cls.__shape_from_bytes(array.shape))
+        return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
 
     @classmethod
     def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:

From 0570f177c2c873051574b8402a201365bea19ac9 Mon Sep 17 00:00:00 2001
From: robbiemu <roberto.tomas.cuentas@gmail.com>
Date: Tue, 20 May 2025 11:02:44 -0400
Subject: [PATCH 20/20] pyrightconfig.json

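Removes the `"reportInvalidTypeForm": false` override from
pyrightconfig.json and fixes the indentation of the closing brace.
I thought I had already done this!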
---
 pyrightconfig.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyrightconfig.json b/pyrightconfig.json
index 4fc9822c011..2f91e31ec2e 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -2,7 +2,6 @@
   "extraPaths": ["gguf-py"],
   "pythonVersion": "3.10",
   "pythonPlatform": "All",
-  "reportInvalidTypeForm": false,
   "reportUnusedImport": "warning",
   "reportDuplicateImport": "error",
   "reportDeprecated": "warning",
@@ -23,4 +22,4 @@
   "exclude": [
     "tools/mtmd/legacy-models",
   ]
- }
+}