
Commit 715f24a

Author: prima
Merge remote-tracking branch 'origin/concedo_experimental' into remoteManagement
2 parents: f97a844 + 979e211

38 files changed: +1716 / -598 lines

.clang-format

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ AllowShortIfStatementsOnASingleLine: Never
 AllowShortLambdasOnASingleLine: Inline
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
-BinPackArguments: false
+BinPackArguments: true
 BinPackParameters: false # OnePerLine
 BitFieldColonSpacing: Both
 BreakBeforeBraces: Custom # Attach
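
For illustration, a minimal C++ sketch (hypothetical function and argument names, not from this commit) of what flipping BinPackArguments from false to true means: with false, clang-format gives each argument of a wrapped call its own line; with true, it packs as many arguments per line as fit within the column limit.

```cpp
// Hypothetical call, for illustration only.
// BinPackArguments: false -- one argument per line once the call has to wrap:
do_something(first_argument,
             second_argument,
             third_argument,
             fourth_argument);

// BinPackArguments: true -- arguments are packed up to the column limit:
do_something(first_argument, second_argument,
             third_argument, fourth_argument);
```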

common/arg.cpp

Lines changed: 10 additions & 10 deletions

@@ -1550,11 +1550,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-fa", "--flash-attn"}, "FA",
         string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
         [](common_params & params, const std::string & value) {
-            if (value == "on" || value == "enabled") {
+            if (value == "on" || value == "enabled" || value == "1") {
                 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
-            } else if (value == "off" || value == "disabled") {
+            } else if (value == "off" || value == "disabled" || value == "0") {
                 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
-            } else if (value == "auto") {
+            } else if (value == "auto" || value == "-1") {
                 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
             } else {
                 throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
@@ -2964,20 +2964,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
-    add_opt(common_arg(
-        {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--props"},
         string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
         [](common_params & params) {
             params.endpoint_props = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+    add_opt(common_arg(
+        {"--slots"},
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--no-slots"},
         "disables slots monitoring endpoint",

common/common.h

Lines changed: 1 addition & 1 deletion

@@ -440,7 +440,7 @@ struct common_params {

     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
-    bool endpoint_slots = false;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
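
Because endpoint_slots now defaults to true, the server's slots monitoring endpoint is exposed unless --no-slots is passed. A minimal sketch of checking it, assuming llama-server is running on the default localhost:8080:

```sh
# GET /slots returns JSON describing each processing slot (default host/port assumed)
curl --silent http://localhost:8080/slots
```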

common/sampling.cpp

Lines changed: 23 additions & 2 deletions

@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {

 // helpers

-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }

 llama_token common_sampler_last(const struct common_sampler * gsmpl) {

common/sampling.h

Lines changed: 3 additions & 1 deletion

@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers

 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);

 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
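
A minimal caller-side sketch of the new signature (not part of this commit; it assumes gsmpl is a valid common_sampler on which common_sampler_sample has already been called):

```cpp
// ask for sorted candidates; with do_sort == true the array is sorted by
// descending probability and res->sorted is set on return
llama_token_data_array * cur_p = common_sampler_get_candidates(gsmpl, /*do_sort=*/true);

// inspect the top few candidates
for (size_t i = 0; i < cur_p->size && i < 3; ++i) {
    LOG_DBG("cand %zu: token %d, p = %.3f\n", i, cur_p->data[i].id, cur_p->data[i].p);
}

// the token selected before sorting is still referenced by cur_p->selected
const llama_token chosen = cur_p->data[cur_p->selected].id;
```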

common/speculative.cpp

Lines changed: 1 addition & 1 deletion

@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(

     common_sampler_sample(smpl, ctx_dft, 0, true);

-    const auto * cur_p = common_sampler_get_candidates(smpl);
+    const auto * cur_p = common_sampler_get_candidates(smpl, true);

     for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
         LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",

convert_hf_to_gguf.py

Lines changed: 0 additions & 4 deletions

@@ -302,10 +302,6 @@ def prepare_tensors(self):
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()

-                # if data ends up empty, it means data_torch was a scalar tensor -> restore
-                if len(data.shape) == 0:
-                    data = data_torch.numpy()
-
                 n_dims = len(data.shape)
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

environment-nocuda.yaml

Lines changed: 1 addition & 0 deletions

@@ -17,5 +17,6 @@ dependencies:
   - ocl-icd-system
   - libvulkan-loader
   - tk=*=xft_*
+  - psutil
   - pip:
     - customtkinter

environment.yaml

Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@ dependencies:
   - ocl-icd-system
   - libvulkan-loader
   - tk=*=xft_*
+  - psutil
   - pip:
     - customtkinter
     - pdfplumber

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+#!/bin/bash
+curl --request POST \
+    --url http://localhost:8080/embedding \
+    --header "Content-Type: application/json" \
+    --data '{"input": "Hello world today"}' \
+    --silent
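
A possible follow-up to the new script, assuming jq is available locally (not part of the commit): pipe the same request through jq to pretty-print the JSON response.

```sh
curl --request POST \
    --url http://localhost:8080/embedding \
    --header "Content-Type: application/json" \
    --data '{"input": "Hello world today"}' \
    --silent | jq .
```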
