From d4b1a7f9c509152a0685c4675c35d31c95822eae Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 3 Jun 2025 08:55:02 +0300
Subject: [PATCH 1/2] Adding the XTC sampler

---
 common/common.cpp      | 14 ++++++++++++++
 common/sampling.cpp    | 13 +++++++++++--
 common/sampling.h      |  3 +++
 include/llama.h        |  8 ++++++++
 src/llama-sampling.cpp | 34 ++++++++++++++++++++++++++++++++++
 src/llama-sampling.h   |  1 +
 src/llama.cpp          |  5 +++++
 7 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 2df8d4d4c..cefbf63f6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -649,6 +649,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         sparams.mirostat_tau = std::stof(argv[i]);
         return true;
     }
+    if (arg == "--xtc-probability") {
+        CHECK_ARG
+        sparams.xtc_probability = std::stof(argv[i]);
+        return true;
+    }
+    if (arg == "--xtc-threshold") {
+        CHECK_ARG
+        sparams.xtc_threshold = std::stof(argv[i]);
+        return true;
+    }
     if (arg == "--cfg-negative-prompt") {
         CHECK_ARG
         sparams.cfg_negative_prompt = argv[i];
@@ -1635,6 +1645,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                 "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
     options.push_back({ "*", "       --mirostat-lr N",    "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
     options.push_back({ "*", "       --mirostat-ent N",   "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
+    options.push_back({ "*", "       --xtc-probability p", "xtc probability (default: %.1f, 0.0 = disabled)", (double)sparams.xtc_probability });
+    options.push_back({ "*", "       --xtc-threshold t",  "xtc threshold (default: %.1f, > 0.5 = disabled)", (double)sparams.xtc_threshold });
     options.push_back({ "*", "-l TOKEN_ID(+/-)BIAS",      "modifies the likelihood of token appearing in the completion,\n"
                              "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
                              "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
@@ -3396,6 +3408,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+    fprintf(stream, "xtc_threshold: %f # default: 1.0\n", sparams.xtc_threshold);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
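[Note, not part of the patch: the two new flags map one-to-one onto the xtc_probability and xtc_threshold fields that later hunks add to llama_sampling_params. A minimal sketch of setting them programmatically; the values are illustrative (0.5 and 0.1 are the defaults suggested in the original XTC proposal):]

    // Sketch only, assuming the llama_sampling_params struct from
    // common/sampling.h as extended by this patch.
    #include "sampling.h"

    llama_sampling_params make_xtc_params() {
        llama_sampling_params sparams;
        sparams.xtc_probability = 0.5f; // run XTC on roughly half of the sampling steps
        sparams.xtc_threshold   = 0.1f; // tokens with p >= 0.1 count as "top choices"
        return sparams;
    }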
"true" : "false"); fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); diff --git a/common/sampling.cpp b/common/sampling.cpp index 079e40516..84691d93b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -121,10 +121,12 @@ std::string llama_sampling_print(const llama_sampling_params & params) { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" - "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f\n" + "\txtc_probability = %.3f, xtc_threshold = %.3f", params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, - params.mirostat, params.mirostat_eta, params.mirostat_tau); + params.mirostat, params.mirostat_eta, params.mirostat_tau, + params.xtc_probability, params.xtc_threshold); return std::string(result); } @@ -153,6 +155,7 @@ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { case llama_sampler_type::TOP_P: return "top_p"; case llama_sampler_type::MIN_P: return "min_p"; case llama_sampler_type::TEMPERATURE: return "temperature"; + case llama_sampler_type::XTC : return "xtc"; default : return ""; } } @@ -164,6 +167,7 @@ std::vector llama_sampling_types_from_names(const std::vecto {"typical_p", llama_sampler_type::TYPICAL_P}, {"min_p", llama_sampler_type::MIN_P}, {"tfs_z", llama_sampler_type::TFS_Z}, + {"xtc", llama_sampler_type::XTC}, {"temperature", llama_sampler_type::TEMPERATURE} }; @@ -178,6 +182,7 @@ std::vector llama_sampling_types_from_names(const std::vecto {"min-p", llama_sampler_type::MIN_P}, {"tfs-z", llama_sampler_type::TFS_Z}, {"tfs", llama_sampler_type::TFS_Z}, + {"xtc", llama_sampler_type::XTC}, {"temp", llama_sampler_type::TEMPERATURE} }; @@ -212,6 +217,7 @@ std::vector llama_sampling_types_from_chars(const std::strin {'y', llama_sampler_type::TYPICAL_P}, {'m', llama_sampler_type::MIN_P}, {'f', llama_sampler_type::TFS_Z}, + {'x', llama_sampler_type::XTC}, {'t', llama_sampler_type::TEMPERATURE} }; @@ -240,6 +246,8 @@ static void sampler_queue( const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; + const float xtc_probability = params.xtc_probability; + const float xtc_threshold = params.xtc_threshold; const std::vector & samplers_sequence = params.samplers_sequence; for (auto sampler_type : samplers_sequence) { @@ -249,6 +257,7 @@ static void sampler_queue( case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::XTC : llama_sample_xtc (ctx_main, &cur_p, xtc_probability, xtc_threshold, min_keep); break; case llama_sampler_type::TEMPERATURE: if (dynatemp_range > 0) { float dynatemp_min = std::max(0.0f, temp - dynatemp_range); diff --git a/common/sampling.h b/common/sampling.h index eeaa53b8b..163cdfca2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -15,6 +15,7 @@ enum class llama_sampler_type : char { TOP_P = 'p', MIN_P = 'm', TFS_Z = 'f', + XTC = 'x', 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 8910f6d65..06f44b02d 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -434,6 +434,40 @@ void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array
     }
 }

+void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep) {
+    if (probability < 0 || threshold > 0.5f || candidates->size < 2) {
+        return;
+    }
+    GGML_ASSERT(smpl);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (probability < 1) {
+        std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
+        float chance = distribution(smpl->rng);
+        if (chance > probability) return;
+    }
+
+    llama_sample_softmax_impl(nullptr, candidates);
+
+    auto cur_size = candidates->size;
+
+    int pos_last = 0;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        if (candidates->data[i].p >= threshold) {
+            pos_last = i;
+        } else break;
+    }
+
+    if (candidates->size - pos_last >= min_keep && pos_last > 0) {
+        candidates->data += pos_last;
+        candidates->size -= pos_last;
+    }
+
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    smpl->n_sample++;
+
+}
+
 void llama_sample_repetition_penalties_impl(
         struct llama_sampling * smpl,
        llama_token_data_array * candidates,
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
index f7f8e3ef7..c2a9e45f4 100644
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@@ -32,6 +32,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_
 void llama_sample_typical_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
 void llama_sample_entropy_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
 void llama_sample_temp_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
+void llama_sample_xtc_impl      (struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep);

 void llama_sample_repetition_penalties_impl(
         struct llama_sampling * smpl,
diff --git a/src/llama.cpp b/src/llama.cpp
index 18c7cd0f5..90e342e13 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -23265,6 +23265,11 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
     llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
 }

+void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates_p,
+        float probability, float threshold, size_t min_keep) {
+    llama_sample_xtc_impl(ctx ? &ctx->sampling : nullptr, candidates_p, probability, threshold, min_keep);
+}
+
 void llama_sample_repetition_penalties(
     struct llama_context * ctx,
    llama_token_data_array * candidates,
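[Note before the second patch, not part of either commit: the loop in llama_sample_xtc_impl finds pos_last, the index of the least probable token whose p is still >= threshold, then advances the candidates pointer past everything before it. The least probable of the "top choices" survives and all more probable tokens are removed, which is the defining XTC behavior. A self-contained illustration of the truncation rule, with softmax, the RNG gate and min_keep omitted:]

    #include <cstdio>
    #include <vector>

    int main() {
        // Probabilities already sorted in descending order, as after softmax.
        std::vector<float> p = {0.50f, 0.30f, 0.15f, 0.05f};
        const float threshold = 0.2f;

        size_t pos_last = 0; // index of the last token with p >= threshold
        for (size_t i = 0; i < p.size(); ++i) {
            if (p[i] >= threshold) pos_last = i;
            else break;
        }

        if (pos_last > 0) {
            // Drop the more probable "top choices"; keep the last token above
            // the threshold and everything below it.
            p.erase(p.begin(), p.begin() + pos_last);
        }
        for (float x : p) printf("%.2f ", x); // prints: 0.30 0.15 0.05
        return 0;
    }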
From 626f49ab842d406b2603ec8364139afcebed4ed6 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 3 Jun 2025 09:16:53 +0300
Subject: [PATCH 2/2] Check if MMVQ is supported before using it.

---
 ggml/src/ggml-cuda.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 7f4b01d43..c8b06aa8a 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2269,6 +2269,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
         ggml_is_quantized(src0->type) &&
+        ggml_cuda_mmvq_type_supported(src0->type) &&
         ggml_backend_buffer_is_cuda(src0->buffer) &&
         ggml_backend_buffer_is_cuda(src1->buffer) &&
         ggml_backend_buffer_is_cuda(dst->buffer) &&
@@ -2442,8 +2443,8 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     const ggml_tensor * ids = dst->src[3];
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
-        ggml_is_quantized(src0_1->type) &&
-        ggml_is_quantized(src0_2->type) &&
+        ggml_is_quantized(src0_1->type) && ggml_cuda_mmvq_type_supported(src0_1->type) &&
+        ggml_is_quantized(src0_2->type) && ggml_cuda_mmvq_type_supported(src0_2->type) &&
         ggml_backend_buffer_is_cuda(src0_1->buffer) &&
         ggml_backend_buffer_is_cuda(src0_2->buffer) &&
         ggml_backend_buffer_is_cuda(src1->buffer) &&
@@ -2502,6 +2503,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     CUDA_CHECK(cudaGetLastError());
     if (next && next->op == GGML_OP_MUL_MAT_ID &&
         ggml_is_quantized(next->src[0]->type) &&
+        ggml_cuda_mmvq_type_supported(next->src[0]->type) &&
         ggml_backend_buffer_is_cuda(next->src[0]->buffer) &&
         !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) &&
         ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id &&