
Commit 76cf024

Merge branch 'master' into cisc/multiple-classifier-outputs
2 parents: 38ece05 + 7675c55


51 files changed: +4593, -3631 lines

README.md

Lines changed: 1 addition & 0 deletions
@@ -130,6 +130,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Bindings</summary>
 
+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)

common/arg.cpp

Lines changed: 2 additions & 2 deletions
@@ -1348,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;

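For reference, the accepted range of the reworked `--prio` flag follows directly from the `ggml_sched_priority` enum extended in ggml/include/ggml.h further down. The snippet below is a minimal standalone sketch, not the llama.cpp sources (`parse_prio` is a made-up helper name), showing how the integer CLI value maps onto the enum:

#include <stdexcept>

// With GGML_SCHED_PRIO_LOW = -1, the remaining enumerators continue from 0,
// so the valid range for --prio is exactly [-1, 3].
enum ggml_sched_priority {
    GGML_SCHED_PRIO_LOW = -1,
    GGML_SCHED_PRIO_NORMAL,   // 0
    GGML_SCHED_PRIO_MEDIUM,   // 1
    GGML_SCHED_PRIO_HIGH,     // 2
    GGML_SCHED_PRIO_REALTIME, // 3
};

// Mirrors the bounds check in the diff above: reject anything outside [-1, 3],
// otherwise cast the integer straight to the enum.
static enum ggml_sched_priority parse_prio(int prio) {
    if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
        throw std::invalid_argument("invalid value");
    }
    return (enum ggml_sched_priority) prio;
}

Passing `--prio -1` on the command line then selects the new low-priority mapping added in common/common.cpp below.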
common/chat-parser.cpp

Lines changed: 4 additions & 3 deletions
@@ -154,9 +154,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
         if (!rest.empty()) {
             handle_reasoning(rest, /* closed */ !is_partial());
         }
-        if (!syntax_.thinking_forced_open) {
-            throw common_chat_msg_partial_exception(end_think);
-        }
+        // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+        // if (!syntax_.thinking_forced_open) {
+        //     throw common_chat_msg_partial_exception(end_think);
+        // }
         return true;
     }
 }

common/common.cpp

Lines changed: 2 additions & 0 deletions
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
         case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     int p = 0;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:    p =   5; break;
         case GGML_SCHED_PRIO_NORMAL: p =   0; break;
         case GGML_SCHED_PRIO_MEDIUM: p =  -5; break;
         case GGML_SCHED_PRIO_HIGH:   p = -10; break;

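The non-Windows branch above maps each priority to a nice value, where lower values mean higher scheduling priority, so the new GGML_SCHED_PRIO_LOW becomes a positive nice value. Below is a self-contained sketch of that mapping, assuming a plain setpriority() call (the exact call llama.cpp uses may differ) and reusing the nice values from the diff:

#include <sys/resource.h>
#include <cstdio>

enum ggml_sched_priority {
    GGML_SCHED_PRIO_LOW = -1,
    GGML_SCHED_PRIO_NORMAL,
    GGML_SCHED_PRIO_MEDIUM,
    GGML_SCHED_PRIO_HIGH,
    GGML_SCHED_PRIO_REALTIME,
};

// Map the scheduling priority to a nice value and apply it to the calling process.
static bool set_process_priority_posix(enum ggml_sched_priority prio) {
    int p = 0;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:    p =   5; break; // new: below-normal priority
        case GGML_SCHED_PRIO_NORMAL: p =   0; break;
        case GGML_SCHED_PRIO_MEDIUM: p =  -5; break;
        case GGML_SCHED_PRIO_HIGH:   p = -10; break;
        default:                              break; // realtime handling omitted in this sketch
    }
    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
        fprintf(stderr, "failed to set process priority %d\n", (int) prio);
        return false;
    }
    return true;
}

int main() {
    // Lowering priority (a positive nice value) does not require elevated privileges.
    return set_process_priority_posix(GGML_SCHED_PRIO_LOW) ? 0 : 1;
}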
convert_hf_to_gguf.py

Lines changed: 27 additions & 28 deletions
@@ -3814,7 +3814,7 @@ def _xlmroberta_set_vocab(self) -> None:
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3827,7 +3827,7 @@ def _xlmroberta_set_vocab(self) -> None:
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))
 
-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3857,33 +3857,26 @@ def _xlmroberta_set_vocab(self) -> None:
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
 
-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
                 piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-
-            if vocab_size > len(tokens):
-                pad_count = vocab_size - len(tokens)
-                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-                for i in range(1, pad_count + 1):
-                    tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.UNUSED)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
 
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
@@ -3896,6 +3889,12 @@ def _xlmroberta_set_vocab(self) -> None:
                 SentencePieceTokenTypes.UNKNOWN,
             ] + toktypes[3:-1]
 
+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = b'<mask>'
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)

docs/build.md

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ cmake --build build --config Release
   cmake --preset x64-windows-llvm-release
   cmake --build build-x64-windows-llvm-release
   ```
+- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
 
 ## BLAS Build
 

examples/parallel/parallel.cpp

Lines changed: 13 additions & 6 deletions
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.n_predict = 128;
-    params.n_junk = 0;
+    params.n_junk = 1;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     const bool is_sp_shared = params.is_pp_shared;
 
     // extra text to insert in each client's prompt in order to make it larger
-    const int32_t n_junk = params.n_junk;
+    const int32_t n_junk = std::max(1, params.n_junk);
 
     // init llama.cpp
     llama_backend_init();
@@ -362,15 +362,17 @@ int main(int argc, char ** argv) {
         // process in chunks of params.n_batch
         int32_t n_batch = params.n_batch;
 
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        int32_t i_next = 0;
+
+        for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
             // experiment: process in powers of 2
             //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
             //    n_batch /= 2;
             //    i -= n_batch;
             //    continue;
             //}
 
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
             llama_batch batch_view = {
                 n_tokens,
@@ -390,19 +392,24 @@ int main(int argc, char ** argv) {
                     return 1;
                 }
 
-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
                 n_cache_miss += 1;
 
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
-                i -= n_batch;
 
                 continue;
             }
 
             LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
 
+            // move the head of the batch forward with the number of tokens we just processed
+            i_next = i + n_tokens;
+
+            // on successful decode, restore the original batch size
+            n_batch = params.n_batch;
+
             for (auto & client : clients) {
                 if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                     continue;

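The reworked chunking loop above is easier to follow in isolation: on a failed decode the chunk size is halved and the same offset is retried; on success the cursor advances by the number of tokens actually processed and the chunk size is restored. Below is a minimal standalone sketch of that control flow, with the llama.cpp API replaced by a stand-in try_decode() and made-up token counts:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for llama_decode(): pretend there is only room for 96 tokens at a time.
static bool try_decode(int32_t n_tokens) {
    return n_tokens <= 96;
}

int main() {
    const int32_t n_tokens_total  = 1000; // made-up total batch size
    const int32_t n_batch_default = 256;  // analogous to params.n_batch

    int32_t n_batch = n_batch_default;
    int32_t i_next  = 0;

    for (int32_t i = 0; i < n_tokens_total; i = i_next) {
        const int32_t n_tokens = std::min(n_batch, n_tokens_total - i);

        if (!try_decode(n_tokens)) {
            // retry the same offset with half the batch size
            n_batch /= 2;
            printf("failed at %d, retrying with n_batch = %d\n", (int) i, (int) n_batch);
            continue;
        }

        // move the head of the batch forward by the tokens just processed,
        // then restore the original batch size
        i_next  = i + n_tokens;
        n_batch = n_batch_default;

        printf("decoded %d tokens at offset %d\n", (int) n_tokens, (int) i);
    }
    return 0;
}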
examples/passkey/passkey.cpp

Lines changed: 2 additions & 7 deletions
@@ -133,9 +133,8 @@ int main(int argc, char ** argv) {
             const int ib = i/n_batch - 1;
             const int bd = n_batch_grp*(n_grp - 1);
 
-            llama_kv_self_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
-            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_self_update (ctx);
+            llama_kv_self_seq_add(ctx, 0, n_past - n_batch,         n_past,         ib*bd);
+            llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
 
             n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
         }
@@ -169,8 +168,6 @@ int main(int argc, char ** argv) {
 
         llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
         llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-        //llama_kv_self_defrag (ctx);
-        llama_kv_self_update (ctx);
 
         n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
 
@@ -200,8 +197,6 @@ int main(int argc, char ** argv) {
 
         llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
         llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-        //llama_kv_self_defrag (ctx);
-        llama_kv_self_update (ctx);
 
         n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
     }

ggml/include/ggml.h

Lines changed: 1 addition & 3 deletions
@@ -2095,9 +2095,6 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
     GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
 
-    GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
-
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
@@ -2181,6 +2178,7 @@ extern "C" {
 
     // scheduling priorities
     enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
         GGML_SCHED_PRIO_NORMAL,
         GGML_SCHED_PRIO_MEDIUM,
         GGML_SCHED_PRIO_HIGH,

ggml/src/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -196,6 +196,7 @@ add_library(ggml-base
             ../include/ggml-opt.h
             ../include/gguf.h
             ggml.c
+            ggml.cpp
             ggml-alloc.c
             ggml-backend.cpp
             ggml-opt.cpp
@@ -226,6 +227,7 @@ function(ggml_add_backend_library backend)
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
         add_dependencies(ggml ${backend})
+        install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
