
Commit 3d4afbe

Merge pull request #101 from BradHutchings/work-in-progress
Work in progress
2 parents fb2faaa + e44f8a0 commit 3d4afbe

36 files changed: +768, -172 lines

.devops/musa.Dockerfile

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc4.2.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
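
As a quick check of the bumped defaults, the image can be built straight from this Dockerfile; a minimal sketch (the `llama-musa` tag is an arbitrary example name):

```bash
# Build against the new default MUSA rc4.2.0 base containers.
docker build -f .devops/musa.Dockerfile -t llama-musa .
```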

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -515,7 +515,7 @@ jobs:

   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64

     steps:
       - name: Clone

ci/README.md

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 ```

 Inside the container, execute the following commands:
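
Inside that container, the CI entry point is then invoked as usual; a sketch assuming the standard `ci/run.sh` flow described in `ci/README.md` (the `GG_BUILD_MUSA=1` flag is an assumption inferred from the MUSA-specific container):

```bash
# Run the CI suite against the mounted cache/results volumes (illustrative).
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```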

completion-ui/admin/index.html

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 // License Inquiries: [email protected].
 -->
 <head>
-<--
+<!--
 To do: Script file that can determine actual server, redirect to another port. -Brad 2025-07-24
 -->
 <meta http-equiv="refresh" content="0; url=https://mmojo.local:8443/">

completion-ui/completion/help.html

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
 <div id="content">

 <div id="title-bar" class="row">
-  <div id="mmojo">Mmojo Completion - Help</div>
+  <div id="mmojo-completion">Mmojo Completion - Help</div>
 </div>

 <div id="work-area-text" class="row">

completion-ui/completion/index.html

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@
 <div id="mmojo-completion" onclick="ClickMmojoCompletion();">Mmojo Completion</div>
 <img id="settings-icon" src="images/settings-64.png" onclick="ToggleSettings();" title="Toggle Settings" />
 <img id="chat-icon" src="images/chat-64.png" onclick="Chat();" title="Chat" />
-<img id="print-icon" src="images/print-64.png" onclick="Print();"/>
+<img id="print-icon" src="images/print-64.png" onclick="Print();" title="Print"/>
 <img id="bookmark-icon" src="images/bookmark-64.png" onclick="EditBookmark();" title="Bookmark Maker"/>
 <img id="full-screen-icon" src="images/fullscreen-64.png" onclick="ToggleFullScreen();" title="Toggle Full Screen"/>
 <img id="help-icon" src="images/help-64.png" onclick="Help();" title="Help"/>

docs/build-s390x.md

Lines changed: 38 additions & 8 deletions
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
 cmake --build build --config Release -j $(nproc)
 ```

-- By default, NNPA is enabled when available. To disable it (not recommended):
+- By default, NNPA is disabled by default. To enable it:

 ```bash
 cmake -S . -B build \
     -DCMAKE_BUILD_TYPE=Release \
     -DGGML_BLAS=ON \
     -DGGML_BLAS_VENDOR=OpenBLAS \
-    -DGGML_NNPA=OFF
+    -DGGML_NNPA=ON

 cmake --build build --config Release -j $(nproc)
 ```
@@ -84,16 +84,24 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

 ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)

-You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
+You can find popular models pre-converted and verified at [s390x Verified Models](https://huggingface.co/collections/taronaeo/s390x-verified-models-672765393af438d0ccb72a08) or [s390x Runnable Models](https://huggingface.co/collections/taronaeo/s390x-runnable-models-686e951824198df12416017e).

-These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
+These models have already been converted from `safetensors` to `GGUF` Big-Endian and their respective tokenizers verified to run correctly on IBM z15 and later system.

 2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**

 ![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)

 The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.

+Ensure that you have installed the required packages in advance
+
+```bash
+pip3 install -r requirements.txt
+```
+
+Convert the `safetensors` model to `GGUF`
+
 ```bash
 python3 convert_hf_to_gguf.py \
     --outfile model-name-be.f16.gguf \
@@ -116,7 +124,7 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

 ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)

-The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
+The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B GGUF](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.

 ```bash
 python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
@@ -141,15 +149,15 @@ Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by

 ### 2. NNPA Vector Intrinsics Acceleration

-Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
+Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.

 ### 3. zDNN Accelerator

-_Only available in IBM z16 or later system. No direction at the moment._
+_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._

 ### 4. Spyre Accelerator

-_No direction at the moment._
+_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._

 ## Performance Tuning

@@ -189,6 +197,26 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

 Answer: Please ensure that your GCC compiler is of minimum GCC 15.1.0 version, and have `binutils` updated to the latest version. If this does not fix the problem, kindly open an issue.

+4. Failing to install the `sentencepiece` package using GCC 15+
+
+Answer: The `sentencepiece` team are aware of this as seen in [this issue](https://github.com/google/sentencepiece/issues/1108).
+
+As a temporary workaround, please run the installation command with the following environment variables.
+
+```bash
+export CXXFLAGS="-include cstdint"
+```
+
+For example,
+
+```bash
+CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
+```
+
+5. `-DGGML_NNPA=ON` generates gibberish output
+
+Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
+
 ## Getting Help on IBM Z & LinuxONE

 1. **Bugs, Feature Requests**
@@ -244,3 +272,5 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 - ✅ - acceleration available
 - 🚫 - acceleration unavailable, will still run using scalar implementation
 - ❓ - acceleration unknown, please contribute if you can test it yourself
+
+Last Updated by **Aaron Teo ([email protected])** on July 25, 2025.
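
Tying the NNPA changes in this file together: if a `-DGGML_NNPA=ON` build produces gibberish as described in FAQ 5, a minimal fallback configure, using only flags shown in the hunks above, would be:

```bash
# Rebuild with NNPA disabled (the new default), see issue #14877.
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS \
    -DGGML_NNPA=OFF
cmake --build build --config Release -j $(nproc)
```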

docs/development/HOWTO-add-model.md

Lines changed: 15 additions & 6 deletions
@@ -23,11 +23,19 @@ The convert script reads the model configuration, tokenizer, tensor names+data a

 The required steps to implement for an HF model are:

-1. Define the model `Model.register` annotation in a new `Model` subclass, example:
+1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass, example:

 ```python
-@Model.register("MyModelForCausalLM")
-class MyModel(Model):
+@ModelBase.register("MyModelForCausalLM")
+class MyModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.MYMODEL
+```
+
+or
+
+```python
+@ModelBase.register("MyModelForConditionalGeneration")
+class MyModel(MmprojModel):
     model_arch = gguf.MODEL_ARCH.MYMODEL
 ```
@@ -75,9 +83,10 @@ block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
 `transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.

 Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
-- `Model#set_gguf_parameters`
-- `Model#set_vocab`
-- `Model#write_tensors`
+- `TextModel#set_gguf_parameters`
+- `MmprojModel#set_gguf_parameters`
+- `ModelBase#set_vocab`
+- `ModelBase#modify_tensors`

 NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights.
docs/docker.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
110110

111111
The defaults are:
112112

113-
- `MUSA_VERSION` set to `rc4.0.1`
113+
- `MUSA_VERSION` set to `rc4.2.0`
114114

115115
The resulting images, are essentially the same as the non-MUSA images:
116116

ggml/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa" ON)
+option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
+option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)