From d4b1a7f9c509152a0685c4675c35d31c95822eae Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 3 Jun 2025 08:55:02 +0300
Subject: [PATCH 1/2] Adding the XTC sampler

---
 common/common.cpp      | 14 ++++++++++++++
 common/sampling.cpp    | 13 +++++++++++--
 common/sampling.h      |  3 +++
 include/llama.h        |  8 ++++++++
 src/llama-sampling.cpp | 34 ++++++++++++++++++++++++++++++++++
 src/llama-sampling.h   |  1 +
 src/llama.cpp          |  5 +++++
 7 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 2df8d4d4c..cefbf63f6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -649,6 +649,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         sparams.mirostat_tau = std::stof(argv[i]);
         return true;
     }
+    if (arg == "--xtc-probability") {
+        CHECK_ARG
+        sparams.xtc_probability = std::stof(argv[i]);
+        return true;
+    }
+    if (arg == "--xtc-threshold") {
+        CHECK_ARG
+        sparams.xtc_threshold = std::stof(argv[i]);
+        return true;
+    }
     if (arg == "--cfg-negative-prompt") {
         CHECK_ARG
         sparams.cfg_negative_prompt = argv[i];
@@ -1635,6 +1645,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                 "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
     options.push_back({ "*", "       --mirostat-lr N",    "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
     options.push_back({ "*", "       --mirostat-ent N",   "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
+    options.push_back({ "*", "       --xtc-probability p", "xtc probability (default: %.1f, 0.0 = disabled)", (double)sparams.xtc_probability });
+    options.push_back({ "*", "       --xtc-threshold t",  "xtc threshold (default: %.1f, > 0.5 = disabled)", (double)sparams.xtc_threshold });
     options.push_back({ "*", "-l TOKEN_ID(+/-)BIAS",      "modifies the likelihood of token appearing in the completion,\n"
                              "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
                              "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
@@ -3396,6 +3408,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+    fprintf(stream, "xtc_threshold: %f # default: 1.0\n", sparams.xtc_threshold);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
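[Note, not part of the patch: the two new flags map one-to-one onto the xtc_probability and xtc_threshold fields that later hunks add to llama_sampling_params. A minimal sketch of setting them programmatically; the values are illustrative (0.5 and 0.1 are the defaults suggested in the original XTC proposal):]

    // Sketch only, assuming the llama_sampling_params struct from
    // common/sampling.h as extended by this patch.
    #include "sampling.h"

    llama_sampling_params make_xtc_params() {
        llama_sampling_params sparams;
        sparams.xtc_probability = 0.5f; // run XTC on roughly half of the sampling steps
        sparams.xtc_threshold   = 0.1f; // tokens with p >= 0.1 count as "top choices"
        return sparams;
    }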
"true" : "false"); fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); diff --git a/common/sampling.cpp b/common/sampling.cpp index 079e40516..84691d93b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -121,10 +121,12 @@ std::string llama_sampling_print(const llama_sampling_params & params) { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" - "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f\n" + "\txtc_probability = %.3f, xtc_threshold = %.3f", params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, - params.mirostat, params.mirostat_eta, params.mirostat_tau); + params.mirostat, params.mirostat_eta, params.mirostat_tau, + params.xtc_probability, params.xtc_threshold); return std::string(result); } @@ -153,6 +155,7 @@ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { case llama_sampler_type::TOP_P: return "top_p"; case llama_sampler_type::MIN_P: return "min_p"; case llama_sampler_type::TEMPERATURE: return "temperature"; + case llama_sampler_type::XTC : return "xtc"; default : return ""; } } @@ -164,6 +167,7 @@ std::vector llama_sampling_types_from_names(const std::vecto {"typical_p", llama_sampler_type::TYPICAL_P}, {"min_p", llama_sampler_type::MIN_P}, {"tfs_z", llama_sampler_type::TFS_Z}, + {"xtc", llama_sampler_type::XTC}, {"temperature", llama_sampler_type::TEMPERATURE} }; @@ -178,6 +182,7 @@ std::vector llama_sampling_types_from_names(const std::vecto {"min-p", llama_sampler_type::MIN_P}, {"tfs-z", llama_sampler_type::TFS_Z}, {"tfs", llama_sampler_type::TFS_Z}, + {"xtc", llama_sampler_type::XTC}, {"temp", llama_sampler_type::TEMPERATURE} }; @@ -212,6 +217,7 @@ std::vector llama_sampling_types_from_chars(const std::strin {'y', llama_sampler_type::TYPICAL_P}, {'m', llama_sampler_type::MIN_P}, {'f', llama_sampler_type::TFS_Z}, + {'x', llama_sampler_type::XTC}, {'t', llama_sampler_type::TEMPERATURE} }; @@ -240,6 +246,8 @@ static void sampler_queue( const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; + const float xtc_probability = params.xtc_probability; + const float xtc_threshold = params.xtc_threshold; const std::vector & samplers_sequence = params.samplers_sequence; for (auto sampler_type : samplers_sequence) { @@ -249,6 +257,7 @@ static void sampler_queue( case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::XTC : llama_sample_xtc (ctx_main, &cur_p, xtc_probability, xtc_threshold, min_keep); break; case llama_sampler_type::TEMPERATURE: if (dynatemp_range > 0) { float dynatemp_min = std::max(0.0f, temp - dynatemp_range); diff --git a/common/sampling.h b/common/sampling.h index eeaa53b8b..163cdfca2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -15,6 +15,7 @@ enum class llama_sampler_type : char { TOP_P = 'p', MIN_P = 'm', TFS_Z = 'f', + XTC = 'x', 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 8910f6d65..06f44b02d 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -434,6 +434,40 @@ void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array
     }
 }

+void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep) {
+    if (probability < 0 || threshold > 0.5f || candidates->size < 2) {
+        return;
+    }
+    GGML_ASSERT(smpl);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (probability < 1) {
+        std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
+        float chance = distribution(smpl->rng);
+        if (chance > probability) return;
+    }
+
+    llama_sample_softmax_impl(nullptr, candidates);
+
+    auto cur_size = candidates->size;
+
+    int pos_last = 0;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        if (candidates->data[i].p >= threshold) {
+            pos_last = i;
+        } else break;
+    }
+
+    if (candidates->size - pos_last >= min_keep && pos_last > 0) {
+        candidates->data += pos_last;
+        candidates->size -= pos_last;
+    }
+
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    smpl->n_sample++;
+
+}
+
 void llama_sample_repetition_penalties_impl(
         struct llama_sampling * smpl,
        llama_token_data_array * candidates,
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
index f7f8e3ef7..c2a9e45f4 100644
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@@ -32,6 +32,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_
 void llama_sample_typical_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
 void llama_sample_entropy_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
 void llama_sample_temp_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
+void llama_sample_xtc_impl      (struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep);

 void llama_sample_repetition_penalties_impl(
         struct llama_sampling * smpl,
diff --git a/src/llama.cpp b/src/llama.cpp
index 18c7cd0f5..90e342e13 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -23265,6 +23265,11 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
     llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
 }

+void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates_p,
+        float probability, float threshold, size_t min_keep) {
+    llama_sample_xtc_impl(ctx ? &ctx->sampling : nullptr, candidates_p, probability, threshold, min_keep);
+}
+
 void llama_sample_repetition_penalties(
     struct llama_context * ctx,
    llama_token_data_array * candidates,
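[Note before the second patch, not part of either commit: the loop in llama_sample_xtc_impl finds pos_last, the index of the least probable token whose p is still >= threshold, then advances the candidates pointer past everything before it. The least probable of the "top choices" survives and all more probable tokens are removed, which is the defining XTC behavior. A self-contained illustration of the truncation rule, with softmax, the RNG gate and min_keep omitted:]

    #include <cstdio>
    #include <vector>

    int main() {
        // Probabilities already sorted in descending order, as after softmax.
        std::vector<float> p = {0.50f, 0.30f, 0.15f, 0.05f};
        const float threshold = 0.2f;

        size_t pos_last = 0; // index of the last token with p >= threshold
        for (size_t i = 0; i < p.size(); ++i) {
            if (p[i] >= threshold) pos_last = i;
            else break;
        }

        if (pos_last > 0) {
            // Drop the more probable "top choices"; keep the last token above
            // the threshold and everything below it.
            p.erase(p.begin(), p.begin() + pos_last);
        }
        for (float x : p) printf("%.2f ", x); // prints: 0.30 0.15 0.05
        return 0;
    }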
From 626f49ab842d406b2603ec8364139afcebed4ed6 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 3 Jun 2025 09:16:53 +0300
Subject: [PATCH 2/2] Check if MMVQ is supported before using it.

---
 ggml/src/ggml-cuda.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 7f4b01d43..c8b06aa8a 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2269,6 +2269,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
         ggml_is_quantized(src0->type) &&
+        ggml_cuda_mmvq_type_supported(src0->type) &&
         ggml_backend_buffer_is_cuda(src0->buffer) &&
         ggml_backend_buffer_is_cuda(src1->buffer) &&
         ggml_backend_buffer_is_cuda(dst->buffer) &&
@@ -2442,8 +2443,8 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     const ggml_tensor * ids = dst->src[3];
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
-        ggml_is_quantized(src0_1->type) &&
-        ggml_is_quantized(src0_2->type) &&
+        ggml_is_quantized(src0_1->type) && ggml_cuda_mmvq_type_supported(src0_1->type) &&
+        ggml_is_quantized(src0_2->type) && ggml_cuda_mmvq_type_supported(src0_2->type) &&
         ggml_backend_buffer_is_cuda(src0_1->buffer) &&
         ggml_backend_buffer_is_cuda(src0_2->buffer) &&
         ggml_backend_buffer_is_cuda(src1->buffer) &&
@@ -2502,6 +2503,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     CUDA_CHECK(cudaGetLastError());
     if (next && next->op == GGML_OP_MUL_MAT_ID &&
         ggml_is_quantized(next->src[0]->type) &&
+        ggml_cuda_mmvq_type_supported(next->src[0]->type) &&
         ggml_backend_buffer_is_cuda(next->src[0]->buffer) &&
         !ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) &&
         ((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id &&