
Commit 36c2f38

Merge branch 'master' into llamacli-tools
2 parents 3437080 + 3ec9fd4

19 files changed: +300, -92 lines changed

.github/workflows/close-issue.yml

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"

AUTHORS

Lines changed: 82 additions & 1 deletion
Large diffs are not rendered by default.

README.md

Lines changed: 1 addition & 0 deletions

@@ -136,6 +136,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
+- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)

common/arg.cpp

Lines changed: 17 additions & 4 deletions

@@ -1465,15 +1465,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--list-devices"},
         "print list of available devices and exit",
         [](common_params &) {
-            printf("Available devices:\n");
+            std::vector<ggml_backend_dev_t> rpc_devices;
+            std::vector<ggml_backend_dev_t> all_devices;
             for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                 auto * dev = ggml_backend_dev_get(i);
                 if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-                    size_t free, total;
-                    ggml_backend_dev_memory(dev, &free, &total);
-                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_devices.push_back(dev);
+                    } else {
+                        all_devices.push_back(dev);
+                    }
                 }
             }
+            // insert RPC devices in front
+            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
+            printf("Available devices:\n");
+            for (size_t i = 0; i < all_devices.size(); ++i) {
+                auto * dev = all_devices[i];
+                size_t free, total;
+                ggml_backend_dev_memory(dev, &free, &total);
+                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            }
             exit(0);
         }
     ));
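The reordering above amounts to a stable partition of the device list: RPC devices are collected separately and spliced back in at the front, ahead of the local GPUs, with relative order otherwise preserved. A standalone sketch of the same idea (illustrative only, not taken from the commit; the device names are made up):

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> devices = {"CUDA0", "RPC[192.168.0.2:50052]", "CUDA1"};
    // std::stable_partition moves elements matching the predicate to the front
    // while keeping relative order, which is the effect the commit gets with
    // two vectors and an insert() at the beginning.
    std::stable_partition(devices.begin(), devices.end(), [](const std::string & d) {
        return d.rfind("RPC", 0) == 0; // treat names starting with "RPC" as RPC devices
    });
    for (const auto & d : devices) {
        printf("%s\n", d.c_str());
    }
    return 0;
}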

common/chat.cpp

Lines changed: 1 addition & 1 deletion

@@ -365,7 +365,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
     return data;
 }
 static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
-    static std::regex response_regex("<\\|START_RESPONSE\\|>(.*?)<\\|END_RESPONSE\\|>");
+    static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
     static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
     std::smatch match;
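The motivation for this change: with std::regex's default ECMAScript grammar, '.' does not match newline characters, so the old (.*?) group failed on responses that span multiple lines, while a character class such as [\s\S] matches any character including newlines. A small standalone check (illustrative, not part of the commit):

#include <cstdio>
#include <regex>
#include <string>

int main() {
    const std::string input = "<|START_RESPONSE|>line one\nline two<|END_RESPONSE|>";

    // old pattern: '.' cannot cross the newline, so the search fails
    const std::regex old_re("<\\|START_RESPONSE\\|>(.*?)<\\|END_RESPONSE\\|>");
    // new-style pattern: [\s\S] matches any character, including '\n'
    const std::regex new_re("<\\|START_RESPONSE\\|>([\\s\\S]*?)<\\|END_RESPONSE\\|>");

    std::smatch m;
    printf("old pattern matches: %s\n", std::regex_search(input, m, old_re) ? "yes" : "no");
    printf("new pattern matches: %s\n", std::regex_search(input, m, new_re) ? "yes" : "no");
    return 0;
}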

common/common.cpp

Lines changed: 38 additions & 20 deletions

@@ -1932,11 +1932,19 @@ std::string common_chat_format_example(const common_chat_templates & tmpl, bool
     return common_chat_apply_template(tmpl, msgs, true, use_jinja);
 }
 
+#define CHATML_TEMPLATE_SRC \
+    "{%- for message in messages -%}\n" \
+    "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+    "{%- endfor -%}\n" \
+    "{%- if add_generation_prompt -%}\n" \
+    "  {{- '<|im_start|>assistant\n' -}}\n" \
+    "{%- endif -%}"
+
 common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
 {
-    auto vocab = llama_model_get_vocab(model);
-    std::string default_template_src = chat_template_override;
-    std::string template_tool_use_src = chat_template_override;
+    std::string default_template_src;
+    std::string template_tool_use_src;
+
     bool has_explicit_template = !chat_template_override.empty();
     if (chat_template_override.empty()) {
         auto str = llama_model_chat_template(model, /* name */ nullptr);

@@ -1949,21 +1957,21 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model
             template_tool_use_src = str;
             has_explicit_template = true;
         }
+    } else {
+        default_template_src = chat_template_override;
     }
     if (default_template_src.empty() || default_template_src == "chatml") {
         if (!template_tool_use_src.empty()) {
             default_template_src = template_tool_use_src;
         } else {
-            default_template_src = R"(
-                {%- for message in messages -%}
-                    {{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>\n" -}}
-                {%- endfor -%}
-                {%- if add_generation_prompt -%}
-                    {{- "<|im_start|>assistant\n" -}}
-                {%- endif -%}
-            )";
+            default_template_src = CHATML_TEMPLATE_SRC;
         }
     }
+    std::string token_bos;
+    std::string token_eos;
+    // TODO: update logic that adds BOS and EOS tokens to the tokenized prompt, in favour of the template.
+#if 0
+    auto vocab = llama_model_get_vocab(model);
     const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
         if (token == LLAMA_TOKEN_NULL) {
             if (default_template_src.find(jinja_variable_name) != std::string::npos

@@ -1975,15 +1983,25 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model
             return common_token_to_piece(vocab, token, true);
         }
     };
-    auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
-    auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
-    return {
-        has_explicit_template,
-        std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
-        template_tool_use_src.empty()
-            ? nullptr
-            : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos)
-    };
+    token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+    token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+#endif
+    try {
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+            template_tool_use_src.empty()
+                ? nullptr
+                : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+        };
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+        return {
+            has_explicit_template,
+            std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+            nullptr,
+        };
+    }
 }
 
 //
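CHATML_TEMPLATE_SRC now serves both as the default template and as the fallback when minja fails to parse the model's own template. For a rough feel of what it expands to, here is a hand-rolled equivalent for a two-message conversation (illustrative only; it bypasses minja entirely and the messages are made up):

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
    const std::vector<std::pair<std::string, std::string>> messages = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello"},
    };
    std::string prompt;
    for (const auto & m : messages) {
        // mirrors: '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n'
        prompt += "<|im_start|>" + m.first + "\n" + m.second + "<|im_end|>\n";
    }
    // mirrors the add_generation_prompt branch of the template
    prompt += "<|im_start|>assistant\n";
    printf("%s", prompt.c_str());
    return 0;
}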

examples/batched.swift/Sources/main.swift

Lines changed: 10 additions & 5 deletions

@@ -31,6 +31,11 @@ defer {
     llama_model_free(model)
 }
 
+guard let vocab = llama_model_get_vocab(model) else {
+    print("Failed to get vocab")
+    exit(1)
+}
+
 var tokens = tokenize(text: prompt, add_bos: true)
 
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)

@@ -41,7 +46,7 @@ context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
 
-let context = llama_new_context_with_model(model, context_params)
+let context = llama_init_from_model(model, context_params)
 guard context != nil else {
     print("Failed to initialize context")
     exit(1)

@@ -141,7 +146,7 @@ while n_cur <= n_len {
         let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
 
         // is it an end of stream? -> mark the stream as finished
-        if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
             i_batch[i] = -1
             // print("")
             if n_parallel > 1 {

@@ -207,7 +212,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
     let n_tokens = utf8Count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
     for i in 0 ..< tokenCount {
         swiftTokens.append(tokens[Int(i)])

@@ -218,12 +223,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
+    let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
         let check = llama_token_to_piece(
-            model,
+            vocab,
            token,
            &result,
            Int32(result.count),
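The Swift changes above track the llama.cpp C API split between llama_model and llama_vocab: contexts are now created with llama_init_from_model, and tokenization and end-of-generation checks take a vocab handle. A minimal C++ sketch of the same call pattern (a sketch under the assumption that a model was already loaded elsewhere; error handling omitted):

#include <vector>

#include "llama.h"

// Sketch only: shows the vocab-based calls the Swift examples switched to.
void vocab_api_sketch(llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // llama_init_from_model() replaces llama_new_context_with_model()
    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);

    // tokenization now takes the vocab, not the model
    const char * text = "Hello";
    std::vector<llama_token> tokens(64);
    const int32_t n = llama_tokenize(vocab, text, 5, tokens.data(), (int32_t) tokens.size(),
                                     /*add_special*/ true, /*parse_special*/ false);

    // the end-of-generation check is also vocab-based now
    if (n > 0 && llama_vocab_is_eog(vocab, tokens[n - 1])) {
        // last token ends the generation
    }

    llama_free(ctx);
}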

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

Lines changed: 7 additions & 5 deletions

@@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
     private var model: OpaquePointer
     private var context: OpaquePointer
+    private var vocab: OpaquePointer
     private var sampling: UnsafeMutablePointer<llama_sampler>
     private var batch: llama_batch
     private var tokens_list: [llama_token]

@@ -47,6 +48,7 @@ actor LlamaContext {
         self.sampling = llama_sampler_chain_init(sparams)
         llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
         llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
+        vocab = llama_model_get_vocab(model)
     }
 
     deinit {

@@ -79,7 +81,7 @@ actor LlamaContext {
         ctx_params.n_threads = Int32(n_threads)
         ctx_params.n_threads_batch = Int32(n_threads)
 
-        let context = llama_new_context_with_model(model, ctx_params)
+        let context = llama_init_from_model(model, ctx_params)
         guard let context else {
             print("Could not load context!")
             throw LlamaError.couldNotInitializeContext

@@ -151,7 +153,7 @@ actor LlamaContext {
 
         new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
 
-        if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
             print("\n")
             is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])

@@ -297,7 +299,7 @@ actor LlamaContext {
         let utf8Count = text.utf8.count
         let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
         let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
 
         var swiftTokens: [llama_token] = []
         for i in 0..<tokenCount {

@@ -316,15 +318,15 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
+        let nTokens = llama_token_to_piece(vocab, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
             newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
+            let nNewTokens = llama_token_to_piece(vocab, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
Binary file not shown (-1 Bytes).

examples/server/server.cpp

Lines changed: 9 additions & 3 deletions

@@ -3353,6 +3353,8 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
         return;
     }
 
+    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
+
     LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
     LOG_DBG("request: %s\n", req.body.c_str());

@@ -3439,9 +3441,13 @@ int main(int argc, char ** argv) {
             message = "Unknown Exception";
         }
 
-        json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
-        LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
-        res_error(res, formatted_error);
+        try {
+            json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+            LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
+            res_error(res, formatted_error);
+        } catch (const std::exception & e) {
+            LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str());
+        }
     });
 
     svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) {
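The nested try/catch matters because the error path itself can throw. One plausible trigger (an assumption for illustration, not taken from the server code): nlohmann::json::dump() uses the strict error handler by default and throws when the exception message contains bytes that are not valid UTF-8. A standalone sketch:

#include <cstdio>
#include <string>

#include <nlohmann/json.hpp>

int main() {
    nlohmann::json j;
    j["message"] = std::string("\xFF\xFE not valid utf-8");
    try {
        // dump() throws nlohmann::json::type_error (316) on invalid UTF-8 by default
        printf("%s\n", j.dump().c_str());
    } catch (const std::exception & e) {
        // with the commit's outer catch, the server logs this instead of letting
        // the exception escape the exception handler
        printf("dump() threw: %s\n", e.what());
    }
    return 0;
}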
