Merged

33 commits
8263e97
sync : ggml
ggerganov May 27, 2025
525d51b
Vulkan: Add f32 accumulator support to quantized mul mat to fix GLM4 …
0cc4m May 19, 2025
380aae7
sycl : Overcoming workaround for mmap() allocation on Windows (llama/…
s-Nick May 20, 2025
2192394
metal : fix typo in FA kernel comments (llama/13651)
ggerganov May 20, 2025
1e7f745
sycl: disable reorder for sycl mulmat (llama/13536)
sgeor255 May 20, 2025
5e90842
CUDA: skip fully masked-out KV in FA vec kernel (llama/13584)
JohannesGaessler May 20, 2025
e5f0301
vulkan: fix warnings (llama/13626)
netrunnereve May 20, 2025
55050e4
musa: Upgrade MUSA SDK version to rc4.0.1 and use mudnn::Unary::IDENT…
yeahdongcn May 21, 2025
39a2783
ggml : add ggml_gelu_erf() (llama/13667)
ngxson May 21, 2025
547c3cd
opencl: fix couple crashes (llama/12795)
May 21, 2025
0079d89
opencl: Add support for multiple devices (llama/12622)
May 21, 2025
09499e6
SYCL: Avoid using with SYCL-Graph for unsupported nodes (llama/13587)
May 22, 2025
28c7ab8
sycl : Remove waits from function calls (llama/13702)
s-Nick May 22, 2025
c10deb2
use LOG_WARN to replace `std::cerr` (llama/13657)
foldl May 23, 2025
8e2a08b
vulkan: Disable coopmat/coopmat2/bfloat extensions if glslc doesn't s…
jeffbolznv May 23, 2025
1fecf05
vulkan: support CPY from any type to itself (llama/13695)
jeffbolznv May 23, 2025
39dc9dd
ggml : fix the order of ggml_unary_op (llama/13718)
ngxson May 23, 2025
85c583d
CANN: Support MUL_MAT_ID for q8_0 and q4_0 (llama/13705)
noemotiovon May 23, 2025
d3b5380
CUDA: fix race condition in FA vector kernels (llama/13742)
JohannesGaessler May 24, 2025
093dfaa
ggml : add ggml_gelu_erf() CUDA kernel (llama/13719)
ngxson May 24, 2025
3df6086
ggml-cpu : set openmp wait time if not set (llama/13758)
slaren May 24, 2025
4d4a5d7
SYCL: revert "sycl: simplify bin_bcast_kernel (ggml/13383)" (llama/13…
qnixsynapse May 25, 2025
e2ba135
CANN: Add the basic supports of Flash Attention kernel (llama/13627)
shibizhao May 26, 2025
6370037
vulkan: mark IM2COL as supporting non-contig (llama/13783)
jeffbolznv May 26, 2025
02ed80e
sycl: Add more debug prints (llama/13640)
Rbiessy May 26, 2025
a26a34b
SYCL: Add non contiguous support in RMS_NORM and NORM kernels (llama/…
qnixsynapse May 26, 2025
45f2e0f
cuda : avoid cuGetErrorString (llama/13791)
ggerganov May 26, 2025
5eabeb7
ggml : allow CUDA graphs when using pipeline parallelism (llama/13814)
slaren May 27, 2025
4e61025
ggml-cpu: x86 feature detection is specific to x86 (llama/13811)
ckastner May 27, 2025
65206c2
ggml : riscv: add xtheadvector support (llama/13720)
xctan May 27, 2025
4575811
sync : ggml
ggerganov May 27, 2025
e2ac490
talk-llama : sync llama.cpp
ggerganov May 27, 2025
255eac6
sync : fix builds - musa, ruby
ggerganov May 27, 2025
6 changes: 3 additions & 3 deletions .devops/main-musa.Dockerfile
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
 # Target the MUSA runtime image
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 WORKDIR /app
2 changes: 1 addition & 1 deletion README.md
@@ -386,7 +386,7 @@ Run the inference examples as usual, for example:
 ## Moore Threads GPU support
 
 With Moore Threads cards the processing of the models is done efficiently on the GPU via muBLAS and custom MUSA kernels.
-First, make sure you have installed `MUSA SDK rc3.1.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=rc3.1.1
+First, make sure you have installed `MUSA SDK rc4.0.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=rc4.0.1
 
 Now build `whisper.cpp` with MUSA support:
 
1 change: 1 addition & 0 deletions bindings/ruby/ext/options.rb
@@ -160,6 +160,7 @@ def configure
bool "GGML_VULKAN_SHADER_DEBUG_INFO"
pending "GGML_VULKAN_VALIDATE"
bool "GGML_VXE"
bool "GGML_XTHEADVECTOR"
filepath "GIT_EXE"
filepath "MATH_LIBRARY"
filepath "METALKIT_FRAMEWORK"
4 changes: 3 additions & 1 deletion examples/talk-llama/llama-batch.cpp
@@ -1,5 +1,6 @@
#include "llama-batch.h"

#include <cassert>
#include <cstring>
#include <algorithm>

@@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
     batch = in_batch;
     GGML_ASSERT(batch.n_tokens > 0);
     if (!batch.pos) {
+        assert(p0 >= 0);
         pos.resize(batch.n_tokens);
         for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] = i + p0;
+            pos[i] = p0 + i;
         }
         batch.pos = pos.data();
     }
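The llama-batch.cpp hunk above changes how default token positions are derived when the caller passes no explicit pos array: token i now gets position p0 + i, and a negative p0 is rejected up front by the new assert. A minimal standalone C++ sketch of that behavior, using simplified types rather than the actual llama_batch_allocr struct:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-in for the fixed llama_batch_allocr logic: when the
// caller provides no explicit positions, token i gets position p0 + i,
// and a negative base position p0 is treated as a caller bug.
std::vector<int32_t> default_positions(int32_t n_tokens, int32_t p0) {
    assert(p0 >= 0); // the new assert from the hunk above
    std::vector<int32_t> pos(n_tokens);
    for (int32_t i = 0; i < n_tokens; i++) {
        pos[i] = p0 + i;
    }
    return pos;
}

int main() {
    // A 4-token batch appended at position 10 yields positions 10 11 12 13.
    for (int32_t p : default_positions(4, 10)) {
        std::printf("%d ", p);
    }
    std::printf("\n");
    return 0;
}
```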