@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -3249,6 +3269,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3438,34 +3465,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
         { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
         [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
-        { "--diffusion-visual" },
-        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-                      params.diffusion.visual_mode ? "true" : "false"),
-        [](common_params & params) { params.diffusion.visual_mode = true; }
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
 
     return ctx_arg;
 }
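
Not part of the diff above: a minimal standalone sketch of how the three regex patterns registered by --cpu-moe behave when tested against MoE tensor names. The sample tensor names and the use of std::regex_search here are illustrative assumptions, not the actual override-matching code in llama.cpp.

// cpu_moe_patterns_demo.cpp -- standalone illustration, not llama.cpp source.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    // Same pattern strings as the --cpu-moe handler; "\\." in the C++ literal
    // is an escaped '.' in the regex, and '$' anchors the match at the end.
    const std::vector<std::regex> cpu_moe_patterns = {
        std::regex("\\.ffn_up_exps\\.weight$"),
        std::regex("\\.ffn_down_exps\\.weight$"),
        std::regex("\\.ffn_gate_exps\\.weight$"),
    };

    // Example tensor names (hypothetical, chosen for illustration only).
    const std::vector<std::string> tensor_names = {
        "blk.0.ffn_up_exps.weight",
        "blk.0.ffn_gate_exps.weight",
        "blk.0.attn_q.weight",
    };

    for (const auto & name : tensor_names) {
        bool on_cpu = false;
        for (const auto & re : cpu_moe_patterns) {
            if (std::regex_search(name, re)) { on_cpu = true; break; }
        }
        std::cout << name << " -> " << (on_cpu ? "CPU buffer type" : "default placement") << "\n";
    }
    return 0;
}

Compiled as-is, the sketch reports that the two expert FFN tensors match while the attention tensor does not, which is the intended effect of keeping only MoE expert weights on the CPU buffer type.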