Merged
36 commits
b0dd14a
vulkan: Add VK_NV_cooperative_matrix2 support for mul_mat and flash a…
jeffbolznv Dec 5, 2024
d72153e
metal : Extend how Llama.cpp locates metal resources (llama/10676)
ormandi Dec 7, 2024
5e59b3f
Vulkan: VK_KHR_cooperative_matrix support to speed up prompt processi…
0cc4m Dec 7, 2024
f4a6693
ggml : refactor online repacking (llama/10446)
Djip007 Dec 7, 2024
349430c
ggml : disable iq4_nl interleave size 8 (llama/10709)
ggerganov Dec 7, 2024
42a5245
vulkan: compile a test shader in cmake to check for coopmat2 support …
jeffbolznv Dec 8, 2024
0e4eb51
Vulkan: fix NaN in tanh.comp with AMD proprietary driver on Windows (…
stduhpf Dec 8, 2024
37df308
vulkan: fix compile warnings (llama/10731)
jeffbolznv Dec 9, 2024
ce2b75d
CUDA: fix shared memory access condition for mmv (llama/10740)
JohannesGaessler Dec 9, 2024
8541e2c
common : remove old types
ggerganov Dec 10, 2024
ffda777
ggml : add check for grad_accs (ggml/1046)
danbev Dec 13, 2024
a619ddf
ggml : remove return from ggml_gallocr_allocate_node (ggml/1048)
danbev Dec 14, 2024
d241e1b
vulkan: disable spirv-opt for coopmat shaders (llama/10763)
jeffbolznv Dec 10, 2024
5fd6abb
CUDA: rename macros to avoid conflicts with WinAPI (llama/10736)
aendk Dec 10, 2024
e670255
vulkan: dynamic subgroup size for the remaining k quants (llama/10745)
netrunnereve Dec 10, 2024
8368c1d
vulkan: request round-to-even for fp16 in im2col/rope_head (llama/10767)
jeffbolznv Dec 10, 2024
cfeb7f7
ggml: load all backends from a user-provided search path (llama/10699)
giladgd Dec 11, 2024
049e991
Vulkan: Add VK_EXT_subgroup_size_control support to ensure full subgr…
0cc4m Dec 12, 2024
949dd3d
Vulkan: Use improved q4_k and q5_k dequant code in dequant shaders (l…
0cc4m Dec 12, 2024
b79bf1c
remove CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS (llama/10797)
slaren Dec 12, 2024
88b83a3
CUDA: faster non-contiguous concat (llama/10760)
A3shTnT Dec 12, 2024
3ead635
ggml : Fix compilation issues on ARM platform when building without f…
kkontny Dec 13, 2024
d1abb1e
SYCL: Reduce most of the compiler warnings (llama/10748)
qnixsynapse Dec 13, 2024
dcef9a4
vulkan: small mul_mat_vec optimizations (llama/10665)
netrunnereve Dec 13, 2024
874fcb6
Fix crash caused by ggml_backend_load_all when launching on Android A…
sienaiwun Dec 13, 2024
5538f04
Introducing experimental OpenCL backend with support for Qualcomm Adr…
lhez Dec 13, 2024
6a3b05a
llama : add Qwen2VL support + multimodal RoPE (llama/10361)
HimariO Dec 14, 2024
a26ce8b
rwkv6: add wkv6 support for Vulkan backend (llama/10829)
zhiyuan1i Dec 16, 2024
e7f9dbb
vulkan: bugfixes for small subgroup size systems + llvmpipe test (lla…
netrunnereve Dec 17, 2024
6b07615
ggml : update ggml_backend_cpu_device_supports_op (llama/10867)
ggerganov Dec 17, 2024
59494c0
sync : ggml
ggerganov Dec 17, 2024
2add8c0
talk-llama : sync llama.cpp
ggerganov Dec 17, 2024
f18f7c6
sync : ggml
ggerganov Dec 18, 2024
69fe4b7
files : remove old sources
ggerganov Dec 18, 2024
f311d82
android : try to fix build
ggerganov Dec 18, 2024
b145e9b
ruby : sync ggml (#2643)
KitaitiMakoto Dec 18, 2024
5 changes: 3 additions & 2 deletions bindings/ruby/ext/extconf.rb
@@ -162,7 +162,6 @@

$OBJ_GGML <<
'ggml/src/ggml.o' <<
'ggml/src/ggml-aarch64.o' <<
'ggml/src/ggml-alloc.o' <<
'ggml/src/ggml-backend.o' <<
'ggml/src/ggml-backend-reg.o' <<
@@ -172,7 +171,9 @@
'ggml/src/ggml-cpu/ggml-cpu.o' <<
'ggml/src/ggml-cpu/ggml-cpu-cpp.o' <<
'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
'ggml/src/ggml-cpu/ggml-cpu-quants.o'
'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
'ggml/src/ggml-cpu/ggml-cpu-traits.o'

$OBJ_WHISPER <<
'src/whisper.o'
7 changes: 0 additions & 7 deletions examples/common-ggml.cpp
@@ -72,9 +72,6 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M:
case GGML_FTYPE_MOSTLY_BF16:
case GGML_FTYPE_MOSTLY_Q4_0_4_4:
case GGML_FTYPE_MOSTLY_Q4_0_4_8:
case GGML_FTYPE_MOSTLY_Q4_0_8_8:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@@ -212,12 +209,8 @@ bool ggml_common_quantize_0(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_IQ4_NL_4_4:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
125 changes: 35 additions & 90 deletions examples/talk-llama/llama-sampling.cpp
@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
// penalties

struct llama_sampler_penalties {
const int32_t n_vocab;
const llama_token special_eos_id;
const llama_token linefeed_id;

const int32_t penalty_last_n;
const float penalty_repeat;
const float penalty_freq;
const float penalty_present;

const bool penalize_nl;
const bool ignore_eos;

ring_buffer<llama_token> prev;

// a frequency map to count token occurrences
std::unordered_map<llama_token, int> token_count;
};

static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
return;
}

ctx->prev.push_back(token);
}

static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
ctx->token_count[token]++;

if (ctx->ignore_eos) {
assert(ctx->special_eos_id >= 0);
// if the ring buffer is full, remove the oldest token
if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
const auto old = ctx->prev.front();

// optimistically check if the candidates are not yet sorted/shuffled/truncated
if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
cur_p->data[ctx->special_eos_id].logit = -INFINITY;
} else {
// else, search for the special EOS token
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].id == ctx->special_eos_id) {
cur_p->data[i].logit = -INFINITY;
break;
}
}
ctx->token_count[old]--;
if (ctx->token_count[old] == 0) {
ctx->token_count.erase(old);
}
}

if ((ctx->penalty_last_n == 0) ||
(ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
return;
}

bool nl_found = false;
size_t nl_idx = 0;
float nl_logit = -INFINITY;
if (!ctx->penalize_nl) {
assert(ctx->linefeed_id >= 0);
ctx->prev.push_back(token);

// optimistically check if the candidates are not yet sorted/shuffled/truncated
if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
nl_found = true;
nl_idx = ctx->linefeed_id;
nl_logit = cur_p->data[ctx->linefeed_id].logit;
} else {
// else, search for the linefeed token
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].id == ctx->linefeed_id) {
nl_found = true;
nl_idx = i;
nl_logit = cur_p->data[i].logit;
break;
}
}
}
#if 0
// sanity check
std::unordered_map<llama_token, int> tmp;
for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
tmp[ctx->prev.rat(i)]++;
}

// Create a frequency map to count occurrences of each token in last_tokens
// TODO: optimize this by maintaining the token count in the sampler context
using llama_token_cnt = std::unordered_map<llama_token, int>;
llama_token_cnt token_count;
assert(ctx->token_count == tmp);
#endif
}

static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_penalties *) smpl->ctx;

for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
token_count[ctx->prev.rat(i)]++;
if ((ctx->penalty_last_n == 0) ||
(ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
return;
}

// Apply frequency and presence penalties to the cur_p
for (size_t i = 0; i < cur_p->size; ++i) {
const auto token_iter = token_count.find(cur_p->data[i].id);
if (token_iter == token_count.end()) {
const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
if (token_iter == ctx->token_count.end()) {
continue;
}

const int count = token_iter->second;

assert(count > 0 && count <= ctx->penalty_last_n);

// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (cur_p->data[i].logit <= 0) {
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
}

cur_p->sorted = false;

if (!ctx->penalize_nl && nl_found) {
// restore the logit of the newline token if it was penalized
cur_p->data[nl_idx].logit = nl_logit;
}
}

static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
ctx->prev.clear();
ctx->token_count.clear();
}

static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
auto * result = llama_sampler_init_penalties(
ctx->n_vocab,
ctx->special_eos_id,
ctx->linefeed_id,
ctx->penalty_last_n,
ctx->penalty_repeat,
ctx->penalty_freq,
ctx->penalty_present,
ctx->penalize_nl,
ctx->ignore_eos);
ctx->penalty_present);

// copy the state
{
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
};

struct llama_sampler * llama_sampler_init_penalties(
int32_t n_vocab,
llama_token special_eos_id,
llama_token linefeed_id,
int32_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present,
bool penalize_nl,
bool ignore_eos) {
if (linefeed_id == LLAMA_TOKEN_NULL) {
penalize_nl = true;
}

if (special_eos_id == LLAMA_TOKEN_NULL) {
ignore_eos = false;
}

float penalty_present) {
penalty_last_n = std::max(penalty_last_n, 0);

return new llama_sampler {
/* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties {
/* .n_vocab = */ n_vocab,
/* .special_eos_id = */ special_eos_id,
/* .linefeed_id = */ linefeed_id,
/* .penalty_last_n = */ penalty_last_n,
/* .penalty_repeat = */ penalty_repeat,
/* .penalty_freq = */ penalty_freq,
/* .penalty_present = */ penalty_present,
/* .penalize_nl = */ penalize_nl,
/* .ignore_eos = */ ignore_eos,
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
/* .token_count = */ {},
},
};
}
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
if (word.find(str) != std::string::npos) {
token_sequences.emplace(token_id, std::vector<llama_token>());
} else {
size_t word_len = word.size(), str_len = str.size();
size_t word_len = word.size();
size_t str_len = str.size();
size_t pos = -1;
while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
bool match = true;
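The refactor above removes the vocab-specific fields (n_vocab, special_eos_id, linefeed_id, penalize_nl, ignore_eos) from llama_sampler_penalties and keeps the token frequency map up to date incrementally in the accept step, instead of rebuilding it from the ring buffer on every apply. Below is a minimal, self-contained sketch of that bookkeeping; it assumes std::deque in place of llama.cpp's ring_buffer and plain (token, logit) pairs in place of llama_token_data_array, so the names are illustrative rather than library API.

#include <deque>
#include <unordered_map>
#include <utility>
#include <vector>

struct penalties_state {
    int   penalty_last_n;
    float penalty_repeat;
    float penalty_freq;
    float penalty_present;

    std::deque<int>              prev;        // last accepted tokens (stand-in for ring_buffer)
    std::unordered_map<int, int> token_count; // incrementally maintained frequency map
};

// accept: evict the oldest token once the window is full, then record the new one
// in both the window and the frequency map
static void penalties_accept(penalties_state & st, int token) {
    if (st.penalty_last_n <= 0) {
        return;
    }
    if ((int) st.prev.size() >= st.penalty_last_n) {
        const int old = st.prev.front();
        st.prev.pop_front();
        if (--st.token_count[old] == 0) {
            st.token_count.erase(old);
        }
    }
    st.prev.push_back(token);
    st.token_count[token]++;
}

// apply: penalize only candidates that appear in the frequency map
static void penalties_apply(const penalties_state & st, std::vector<std::pair<int, float>> & cur) {
    if (st.penalty_last_n == 0 ||
        (st.penalty_repeat == 1.0f && st.penalty_freq == 0.0f && st.penalty_present == 0.0f)) {
        return;
    }
    for (auto & cand : cur) {
        const auto it = st.token_count.find(cand.first);
        if (it == st.token_count.end()) {
            continue;
        }
        const int count = it->second;
        // multiply (not divide) non-positive logits so penalized tokens never become more likely
        cand.second = cand.second <= 0.0f ? cand.second * st.penalty_repeat
                                          : cand.second / st.penalty_repeat;
        cand.second -= float(count) * st.penalty_freq + float(count > 0) * st.penalty_present;
    }
}

With this change, callers construct the sampler through the slimmed-down llama_sampler_init_penalties(penalty_last_n, penalty_repeat, penalty_freq, penalty_present), as shown at the end of the diff above.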
3 changes: 2 additions & 1 deletion examples/talk-llama/llama-vocab.cpp
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
case LLAMA_VOCAB_PRE_TYPE_EXAONE:
case LLAMA_VOCAB_PRE_TYPE_MINERVA:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
std::vector<std::string> words(1, "");

for (const uint32_t cpt : cpts_nfd) {
const auto flags = unicode_cpt_flags(cpt);
const auto flags = unicode_cpt_flags_from_cpt(cpt);

if (flags.is_whitespace) {
if (words.back().size()) { // finish previous word if any