README.md: 3 additions & 4 deletions
@@ -17,14 +17,13 @@ LLM inference in C/C++

 ## Hot topics

-- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
-- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
+- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
+- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
+- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
 - Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
-- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
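The new top entry links to the WebUI guide. As a rough sketch of trying it, the WebUI is served by `llama-server` itself, so starting the server and opening its address in a browser is enough; the model path and port below are placeholders, and flags may differ between releases:

```bash
# Start the OpenAI-compatible server with a local GGUF model (placeholder path and port).
llama-server -m ./models/model.gguf --host 127.0.0.1 --port 8080

# Then open the built-in WebUI in a browser:
#   http://127.0.0.1:8080
```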
docs/docker.md: 3 additions & 3 deletions
@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there the following images, similar to the above:
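As a rough illustration of how the `light` and `server` images listed above are typically invoked (the model path is a placeholder, and the canonical commands live further down in docs/docker.md):

```bash
# light: run the CLI against a model mounted from the host (placeholder path).
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light \
  -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512

# server: expose the HTTP API (and WebUI) on port 8080.
docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server \
  -m /models/7B/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8080
```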