Commit 63485bd

Merge branch 'master' into qkv

2 parents: 36874c0 + c8c07d6

39 files changed: +9203 / -4878 lines

CMakeLists.txt

Lines changed: 14 additions & 0 deletions

```diff
@@ -441,6 +441,7 @@ endif()
 if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 
+
     if (BUILD_SHARED_LIBS)
         set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
     endif()
@@ -485,6 +486,19 @@ if (LLAMA_CCACHE)
     endif ()
 endif()
 
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+    set(GGML_LLAMAFILE_DEFAULT ON)
+endif()
+
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_GRAPHS)
+    set(GGML_CUDA_GRAPHS_DEFAULT ON)
+endif()
+
 # this version of Apple ld64 is buggy
 execute_process(
     COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
```
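These guards only adjust defaults: passing a value explicitly at configure time (for example `cmake -B build -DGGML_LLAMAFILE=OFF`) defines the variable, so the corresponding `NOT DEFINED` block is skipped and the user's choice wins.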

common/arg.cpp

Lines changed: 3 additions & 3 deletions

```diff
@@ -1098,7 +1098,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
-        {"--attention"}, "{causal,non,causal}",
+        {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
@@ -1696,7 +1696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
         {"--embd-normalize"}, "N",
-        string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+        string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
@@ -1710,7 +1710,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--embd-separator"}, "STRING",
-        "separator of embendings (default \\n) for example \"<#sep#>\"",
+        "separator of embeddings (default \\n) for example \"<#sep#>\"",
         [](common_params & params, const std::string & value) {
             params.embd_sep = value;
         }
```

common/common.cpp

Lines changed: 4 additions & 4 deletions

```diff
@@ -957,7 +957,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == -1) {
                 decoder_start_token_id = bos;
@@ -966,7 +966,7 @@ struct common_init_result common_init_from_params(common_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
@@ -1040,7 +1040,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
         return GGML_TYPE_Q6_0;
     }
 
-    throw std::runtime_error("Invalid cache type: " + s);
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
@@ -1052,7 +1052,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_ubatch          = params.n_ubatch;
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
-                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+                                params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
```
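The recurring change across this commit is the slimmed-down `llama_batch_get_one()`: the trailing start-position and sequence-id arguments are gone, and the context now handles that bookkeeping itself. A minimal sketch of the updated call pattern, assuming an already-initialized `llama_context`; the `eval_prompt` helper name is illustrative, not part of the API:

```cpp
#include "llama.h"

#include <cstdio>
#include <vector>

// Evaluate a tokenized prompt with the two-argument llama_batch_get_one().
// Before this commit the call carried two extra arguments for the start
// position and sequence id, e.g. llama_batch_get_one(data, n, 0, 0).
static bool eval_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}
```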

common/common.h

Lines changed: 2 additions & 2 deletions

```diff
@@ -270,9 +270,9 @@ struct common_params {
 
     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of embendings
+    std::string embd_sep = "\n"; // separator of embeddings
     bool reranking = false; // enable reranking support on server
 
     // server params
```

common/sampling.cpp

Lines changed: 37 additions & 51 deletions

```diff
@@ -171,60 +171,46 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 params.penalize_nl,
                 params.ignore_eos));
 
-    if (params.temp > 0.0f) {
-        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+    if (params.mirostat == 0) {
+        for (const auto & cnstr : params.samplers) {
+            switch (cnstr) {
+                case COMMON_SAMPLER_TYPE_TOP_K:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
+                    break;
+                case COMMON_SAMPLER_TYPE_TOP_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_MIN_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_XTC:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    break;
+                case COMMON_SAMPLER_TYPE_TFS_Z:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
+                    break;
+                case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    break;
+                case COMMON_SAMPLER_TYPE_INFILL:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                    break;
+                default:
+                    GGML_ASSERT(false && "unknown sampler type");
             }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-        } else if (params.mirostat == 1) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-        } else if (params.mirostat == 2) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-        } else {
-            GGML_ASSERT(false && "unknown mirostat version");
         }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else if (params.mirostat == 1) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+    } else if (params.mirostat == 2) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
     } else {
-        if (params.n_probs > 0) {
-            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
-            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
-            //
-            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
-            // it is much faster, since we avoid sorting all tokens and should give a good approximation
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+        GGML_ASSERT(false && "unknown mirostat version");
     }
 
     return result;
```
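The net effect: the outer `params.temp > 0.0f` split is removed, the explicit softmax stage before `dist` is dropped (the `dist` sampler normalizes the candidates internally), and the separate greedy fallback goes away; the chain now always terminates in a single `dist` sampler that draws the token. A minimal sketch of the resulting chain shape for the `mirostat == 0` path; the top-k and temperature values are illustrative only:

```cpp
#include "llama.h"

// Sketch: the simplified sampler chain after this change. No softmax or
// greedy stage is appended; dist normalizes and samples in one step.
static llama_sampler * make_chain(uint32_t seed) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));   // illustrative value
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));  // temp <= 0 selects greedily
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));  // terminal: draws the token
    return chain;
}
```

Since the temperature sampler already degenerates to greedy selection when `temp <= 0`, a dedicated greedy branch is presumably redundant, which is what lets the whole `else` arm disappear.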

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -2864,6 +2864,9 @@ def set_vocab(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.chat_template = "rwkv-world"
+        # hack: Add '\n\n' as the EOT token to make it chat normally
+        special_vocab._set_special_token("eot", 261)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
```

convert_lora_to_gguf.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -348,6 +348,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 if ".base_layer.weight" in name:
                     continue
                 logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
+                if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
+                    logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
+                    logger.error("Hint: if you are using TRL, make sure not to call setup_chat_format()")
                 sys.exit(1)
 
             if base_name in tensor_map:
```

examples/batched-bench/batched-bench.cpp

Lines changed: 0 additions & 1 deletion

```diff
@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
-            0, 0, 0, // unused
         };
 
         const int ret = llama_decode(ctx, batch_view);
```
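With the three placeholder fields gone, the aggregate initializer again matches `llama_batch` member-for-member. A sketch of the full view construction as it stands after this change; the leading fields (`n_tokens` through `pos`) are taken from the struct definition in `llama.h` and are an assumption here, since only the tail appears in the hunk above:

```cpp
// Sketch: a llama_batch view over tokens [i, i + n_tokens) of an existing
// batch. batch, i and n_tokens are assumed to come from the surrounding
// loop in batched-bench.cpp.
llama_batch batch_view = {
    n_tokens,
    batch.token    + i,
    nullptr,            // embd: unused when token ids are provided
    batch.pos      + i,
    batch.n_seq_id + i,
    batch.seq_id   + i,
    batch.logits   + i,
};
```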

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
     }
```

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
```
