
Commit e8dc308

Merge branch 'concedo_experimental' into croco_nex
2 parents: d9f2285 + bc30ebd, commit e8dc308

32 files changed: +3234, -2475 lines

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -656,6 +656,7 @@ add_library(ggml
             ggml/src/ggml-backend.cpp
             ggml/src/ggml-backend-impl.h
             ggml/include/ggml-backend.h
+            ggml/include/ggml-cpp.h
             ggml/src/ggml-quants.c
             ggml/src/ggml-quants.h
             ggml/src/llamafile/sgemm.cpp

convert_lora_to_gguf.py

Lines changed: 3 additions & 3 deletions
@@ -230,7 +230,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
+        description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file")
     parser.add_argument(
         "--outfile", type=Path,
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
@@ -257,11 +257,11 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--base", type=Path, required=True,
-        help="directory containing base model file",
+        help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required",
     )
     parser.add_argument(
         "lora_path", type=Path,
-        help="directory containing LoRA adapter file",
+        help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
     )
 
     return parser.parse_args()

examples/server/server.cpp

Lines changed: 13 additions & 24 deletions
@@ -726,12 +726,12 @@ struct server_context {
         return nullptr;
     }
 
-    server_slot * get_available_slot(const std::string & prompt) {
+    server_slot * get_available_slot(const server_task & task) {
         server_slot * ret = nullptr;
 
         // find the slot that has at least n% prompt similarity
-        if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
-            int max_lcp_len = 0;
+        if (ret == nullptr && slot_prompt_similarity != 0.0f) {
+            int max_lcs_len = 0;
             float similarity = 0;
 
             for (server_slot & slot : slots) {
@@ -741,25 +741,25 @@ struct server_context {
                 }
 
                 // skip the slot if it does not contains cached tokens
-                if (slot.prompt_tokens.empty()) {
+                if (slot.cache_tokens.empty()) {
                     continue;
                 }
 
-                // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-                int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
+                // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
+                int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
 
-                // fraction of the common substring length compared to the current slot's prompt length
-                similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
+                // fraction of the common subsequence length compared to the current slot's prompt length
+                similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
 
                 // select the current slot if the criteria match
-                if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
-                    max_lcp_len = lcp_len;
+                if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
+                    max_lcs_len = lcs_len;
                     ret = &slot;
                 }
             }
 
             if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
+                SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
             }
         }
 
@@ -1515,18 +1515,7 @@ struct server_context {
                 {
                     const int id_slot = json_value(task.data, "id_slot", -1);
 
-                    server_slot * slot;
-
-                    if (id_slot != -1) {
-                        slot = get_slot_by_id(id_slot);
-                    } else {
-                        std::string prompt;
-                        if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
-                            prompt = json_value(task.data, "prompt", std::string());
-                        }
-
-                        slot = get_available_slot(prompt);
-                    }
+                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
 
                     if (slot == nullptr) {
                         // if no slot is available, we defer this task for processing later
@@ -3260,7 +3249,7 @@ int main(int argc, char ** argv) {
         ctx_server.queue_tasks.terminate();
     };
 
-    LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
 
     ctx_server.queue_tasks.start_loop();
 
examples/server/utils.hpp

Lines changed: 47 additions & 5 deletions
@@ -439,18 +439,60 @@ static std::string gen_chatcmplid() {
 // other common utils
 //
 
-static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
+static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
     size_t i;
     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
 
     return i;
 }
 
-static size_t longest_common_prefix(const std::string & a, const std::string & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
 
-    return i;
+    // get the lengths of the input sequences
+    int a_len = a.size();
+    int b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    int max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<int> prev_row(b_len + 1, 0);
+    std::vector<int> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (int i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (int j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
 }
 
 static bool ends_with(const std::string & str, const std::string & suffix) {
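
For reference, this helper feeds the slot selection in server.cpp above: the shared token count between a slot's cached prompt and the incoming task is divided by the cached prompt length and compared against the `slot_prompt_similarity` threshold. The snippet below is a minimal standalone sketch of that calculation; the token type is simplified to `int`, and the sample values, threshold, and `main` are illustrative only, not part of this commit.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// stand-in for llama_tokens (a vector of llama_token) so the sketch compiles on its own
using tokens = std::vector<int>;

// mirrors the helper added to utils.hpp above, using two rolling rows instead of a full DP matrix
static size_t longest_common_subsequence(const tokens & a, const tokens & b) {
    if (a.empty() || b.empty()) {
        return 0;
    }
    size_t max_length = 0;
    std::vector<int> prev_row(b.size() + 1, 0);
    std::vector<int> curr_row(b.size() + 1, 0);
    for (size_t i = 1; i <= a.size(); i++) {
        for (size_t j = 1; j <= b.size(); j++) {
            if (a[i - 1] == b[j - 1]) {
                curr_row[j] = (i == 1 || j == 1) ? 1 : prev_row[j - 1] + 1;
                max_length = std::max(max_length, (size_t) curr_row[j]);
            } else {
                curr_row[j] = 0; // counter is reset on mismatch
            }
        }
        prev_row = curr_row;
    }
    return max_length;
}

int main() {
    // hypothetical cached prompt of a slot vs. an incoming prompt
    const tokens cache_tokens  = {10, 20, 30, 40};
    const tokens prompt_tokens = {10, 20, 30, 99};

    const size_t lcs_len    = longest_common_subsequence(cache_tokens, prompt_tokens);
    const float  similarity = (float) lcs_len / cache_tokens.size();

    // with an example threshold of 0.5 this slot would be eligible for reuse
    const float slot_prompt_similarity = 0.5f;
    printf("lcs_len = %zu, similarity = %.2f, reuse = %s\n",
           lcs_len, similarity, similarity > slot_prompt_similarity ? "yes" : "no");
    return 0;
}
```

In the server itself this check runs once per slot, and among the slots that clear the threshold the one with the largest `lcs_len` is selected, as shown in the `get_available_slot` hunk above.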
examples/simple-chat/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+set(TARGET llama-simple-chat)
+add_executable(${TARGET} simple-chat.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/simple-chat/README.md

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+# llama.cpp/example/simple-chat
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
+
+```bash
+./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
+...
examples/simple-chat/simple-chat.cpp

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}

ggml-opencl.cpp

Lines changed: 0 additions & 1 deletion
@@ -2095,7 +2095,6 @@ static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
 }
 
 static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
-    /* .get_name = */ ggml_backend_opencl_buffer_get_name,
     /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
     /* .get_base = */ ggml_backend_opencl_buffer_get_base,
     /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
