14 changes: 14 additions & 0 deletions common/common.cpp
@@ -649,6 +649,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
sparams.mirostat_tau = std::stof(argv[i]);
return true;
}
if (arg == "--xtc-probability") {
CHECK_ARG
sparams.xtc_probability = std::stof(argv[i]);
return true;
}
if (arg == "--xtc-threshold") {
CHECK_ARG
sparams.xtc_threshold = std::stof(argv[i]);
return true;
}
if (arg == "--cfg-negative-prompt") {
CHECK_ARG
sparams.cfg_negative_prompt = argv[i];
@@ -1635,6 +1645,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
options.push_back({ "*", " --xtc-probability N", "XTC probability (default: %.1f, 0.0 = disabled)", (double)sparams.xtc_probability });
options.push_back({ "*", " --xtc-threshold N", "XTC threshold (default: %.1f, > 0.5 = disabled)", (double)sparams.xtc_threshold });
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
@@ -3396,6 +3408,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
fprintf(stream, "xtc_probability: %f # default: 0.0 (disabled)\n", sparams.xtc_probability);
fprintf(stream, "xtc_threshold: %f # default: 1.0 (disabled)\n", sparams.xtc_threshold);
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
13 changes: 11 additions & 2 deletions common/sampling.cpp
@@ -121,10 +121,12 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f\n"
"\txtc_probability = %.3f, xtc_threshold = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
params.mirostat, params.mirostat_eta, params.mirostat_tau,
params.xtc_probability, params.xtc_threshold);

return std::string(result);
}
@@ -153,6 +155,7 @@ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMPERATURE: return "temperature";
case llama_sampler_type::XTC : return "xtc";
default : return "";
}
}
@@ -164,6 +167,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
{"typical_p", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"xtc", llama_sampler_type::XTC},
{"temperature", llama_sampler_type::TEMPERATURE}
};

@@ -178,6 +182,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vecto
{"min-p", llama_sampler_type::MIN_P},
{"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", llama_sampler_type::TFS_Z},
{"xtc", llama_sampler_type::XTC},
{"temp", llama_sampler_type::TEMPERATURE}
};

@@ -212,6 +217,7 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'x', llama_sampler_type::XTC},
{'t', llama_sampler_type::TEMPERATURE}
};

@@ -240,6 +246,8 @@ static void sampler_queue(
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const float xtc_probability = params.xtc_probability;
const float xtc_threshold = params.xtc_threshold;
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;

for (auto sampler_type : samplers_sequence) {
@@ -249,6 +257,7 @@
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case llama_sampler_type::XTC : llama_sample_xtc (ctx_main, &cur_p, xtc_probability, xtc_threshold, min_keep); break;
case llama_sampler_type::TEMPERATURE:
if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
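With the mappings above, 'x' selects XTC in a sampler-sequence string and "xtc" selects it by name. A minimal sketch of parsing a sequence that includes it (the sequence string itself is illustrative, not a recommended order):

    // top-k, tail-free, typical, top-p, min-p, XTC, temperature
    std::vector<llama_sampler_type> seq = llama_sampling_types_from_chars("kfypmxt");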
3 changes: 3 additions & 0 deletions common/sampling.h
@@ -15,6 +15,7 @@ enum class llama_sampler_type : char {
TOP_P = 'p',
MIN_P = 'm',
TFS_Z = 'f',
XTC = 'x',
TYPICAL_P = 'y',
TEMPERATURE = 't'
};
@@ -39,6 +40,8 @@ typedef struct llama_sampling_params {
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
float xtc_probability = 0.0f; // chance of applying XTC on each sampling call (0.0 = disabled)
float xtc_threshold = 1.0f; // XTC threshold (disabled if > 0.5)
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context

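As declared above, XTC ships disabled: xtc_probability defaults to 0.0 and xtc_threshold to 1.0, which is above the 0.5 cutoff. A hedged sketch of turning it on programmatically, mirroring what the new --xtc-probability / --xtc-threshold flags set; 0.5/0.1 are the starting values suggested in the text-generation-webui PR, not defaults of this patch:

    llama_sampling_params sparams;
    sparams.xtc_probability = 0.5f; // apply XTC on roughly half of the sampling calls
    sparams.xtc_threshold   = 0.1f; // tokens with p >= 0.1 become candidates for removal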
6 changes: 4 additions & 2 deletions ggml/src/ggml-cuda.cu
@@ -2269,6 +2269,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
ggml_is_quantized(src0->type) &&
ggml_cuda_mmvq_type_supported(src0->type) &&
ggml_backend_buffer_is_cuda(src0->buffer) &&
ggml_backend_buffer_is_cuda(src1->buffer) &&
ggml_backend_buffer_is_cuda(dst->buffer) &&
@@ -2442,8 +2443,8 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
const ggml_tensor * ids = dst->src[3];

if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
ggml_is_quantized(src0_1->type) &&
ggml_is_quantized(src0_2->type) &&
ggml_is_quantized(src0_1->type) && ggml_cuda_mmvq_type_supported(src0_1->type) &&
ggml_is_quantized(src0_2->type) && ggml_cuda_mmvq_type_supported(src0_2->type) &&
ggml_backend_buffer_is_cuda(src0_1->buffer) &&
ggml_backend_buffer_is_cuda(src0_2->buffer) &&
ggml_backend_buffer_is_cuda(src1->buffer) &&
@@ -2502,6 +2503,7 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
CUDA_CHECK(cudaGetLastError());

if (next && next->op == GGML_OP_MUL_MAT_ID && ggml_is_quantized(next->src[0]->type) &&
ggml_cuda_mmvq_type_supported(next->src[0]->type) &&
ggml_backend_buffer_is_cuda(next->src[0]->buffer) &&
!ggml_backend_buffer_is_cuda_split(next->src[0]->buffer) &&
((ggml_backend_cuda_buffer_context *)next->src[0]->buffer->context)->device == device_id &&
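The ggml-cuda.cu hunks are independent of the sampler work: every quantized mat-vec fast path now also checks ggml_cuda_mmvq_type_supported (defined elsewhere in the CUDA backend), presumably so quantization types without an MMVQ kernel fall back to the general mul-mat path.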
8 changes: 8 additions & 0 deletions include/llama.h
@@ -1208,6 +1208,14 @@ extern "C" {
llama_token_data_array * candidates,
float temp);

/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
/// @param probability Chance of the sampler being applied on a given call (0.0 = disabled).
/// @param threshold Tokens with probability >= threshold are candidates for removal; values > 0.5 disable the sampler.
/// @param min_keep The removal step is skipped if it would leave fewer than this many candidates.
LLAMA_API void llama_sample_xtc(
struct llama_context * ctx,
llama_token_data_array * candidates_p,
float probability,
float threshold,
size_t min_keep);

/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
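A usage sketch of the new entry point, following the candidates-array pattern used elsewhere in the API; llama_get_logits, llama_n_vocab, and llama_sample_token are existing functions, and the 0.5/0.1 arguments are illustrative:

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const float * logits = llama_get_logits(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back(llama_token_data{ id, logits[id], 0.0f });
    }
    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

    // drop all but the least probable token with p >= 0.1, on roughly half of the calls
    llama_sample_xtc(ctx, &cur_p, 0.5f, 0.1f, 1);
    llama_token token = llama_sample_token(ctx, &cur_p);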
34 changes: 34 additions & 0 deletions src/llama-sampling.cpp
@@ -434,6 +434,40 @@ void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array
}
}

void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep) {
    if (probability < 0 || threshold > 0.5f || candidates->size < 2) {
        return;
    }
    GGML_ASSERT(smpl);
    const int64_t t_start_sample_us = ggml_time_us();

    // apply XTC only on a random fraction of the sampling calls
    if (probability < 1) {
        std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
        float chance = distribution(smpl->rng);
        if (chance > probability) return;
    }

    // sorts the candidates by probability in descending order
    llama_sample_softmax_impl(nullptr, candidates);

    // index of the last (least probable) candidate with p >= threshold
    size_t pos_last = 0;

    for (size_t i = 0; i < candidates->size; ++i) {
        if (candidates->data[i].p >= threshold) {
            pos_last = i;
        } else {
            break;
        }
    }

    // remove every candidate above the threshold except the last one,
    // provided enough candidates remain afterwards
    if (candidates->size - pos_last >= min_keep && pos_last > 0) {
        candidates->data += pos_last;
        candidates->size -= pos_last;
    }

    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
    smpl->n_sample++;
}

void llama_sample_repetition_penalties_impl(
struct llama_sampling * smpl,
llama_token_data_array * candidates,
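To make the cut concrete, here is a stand-alone sketch of just the filtering step (the random gate and min_keep check are omitted), assuming the probabilities are already sorted in descending order, as llama_sample_softmax_impl guarantees:

    #include <cstdio>
    #include <vector>

    int main() {
        const float threshold = 0.1f;
        std::vector<float> probs = { 0.50f, 0.30f, 0.10f, 0.06f, 0.04f };

        // index of the last (least probable) entry at or above the threshold
        size_t pos_last = 0;
        for (size_t i = 0; i < probs.size(); ++i) {
            if (probs[i] >= threshold) {
                pos_last = i;
            } else {
                break;
            }
        }

        // remove every entry above the threshold except the last one
        if (pos_last > 0) {
            probs.erase(probs.begin(), probs.begin() + pos_last);
        }

        for (float p : probs) {
            printf("%.2f ", p); // prints: 0.10 0.06 0.04
        }
        return 0;
    }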
1 change: 1 addition & 0 deletions src/llama-sampling.h
@@ -32,6 +32,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_
void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
void llama_sample_xtc_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float probability, float threshold, size_t min_keep);

void llama_sample_repetition_penalties_impl(
struct llama_sampling * smpl,
5 changes: 5 additions & 0 deletions src/llama.cpp
@@ -23265,6 +23265,11 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
}

void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates_p,
float probability, float threshold, size_t min_keep) {
llama_sample_xtc_impl(ctx ? &ctx->sampling : nullptr, candidates_p, probability, threshold, min_keep);
}

void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,