
Commit 1d72fa4

Merge branch 'master' into cisc/server-draft-threads

2 parents b66cdce + c24f4e2


57 files changed: +3136 −2141 lines

.devops/cloud-v-pipeline

Lines changed: 0 additions & 22 deletions
This file was deleted.

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
name: Build on RISCV Linux Machine by Cloud-V
on:
  workflow_dispatch:
  workflow_call:

jobs:
  bianbu-riscv64-native: # Bianbu 2.2
    runs-on: self-hosted

    steps:
      - name: Install prerequisites
        run: |
          sudo apt-get update || true
          sudo apt-get install -y libatomic1
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo apt-get update || true
          sudo apt-get install -y --no-install-recommends \
            build-essential \
            gcc-14-riscv64-linux-gnu \
            g++-14-riscv64-linux-gnu \
            cmake

      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
            -DLLAMA_BUILD_TOOLS=ON \
            -DLLAMA_BUILD_TESTS=OFF \
            -DCMAKE_SYSTEM_NAME=Linux \
            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)

.github/workflows/copilot-setup-steps.yml

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
name: "Copilot Setup Steps"

# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
  workflow_dispatch:
  push:
    paths:
      - .github/workflows/copilot-setup-steps.yml
  pull_request:
    paths:
      - .github/workflows/copilot-setup-steps.yml

jobs:
  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
  copilot-setup-steps:
    runs-on: ubuntu-latest

    # Set the permissions to the lowest permissions possible needed for your steps.
    # Copilot will be given its own token for its operations.
    permissions:
      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
      contents: read

    # You can define any steps you want, and they will run before the agent starts.
    # If you do not check out your code, Copilot will do this for you.
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: ccache
        uses: hendrikmuhs/[email protected]
        with:
          key: copilot-setup-steps
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/activate
          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
          pip install flake8 pyright
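
For local experimentation, roughly the same environment can be reproduced outside of Actions. A hedged sketch assuming an Ubuntu-like host; it follows the workflow's commands, except that the virtual environment is entered with `source`, the usual way to activate it in an interactive shell:

# install build dependencies (same packages as the workflow's Dependencies step)
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev

# create and activate the Python environment, then install the requirements
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
pip install flake8 pyright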

README.md

Lines changed: 1 addition & 1 deletion
@@ -240,7 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Infrastructure</summary>

-- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
common/arg.cpp

Lines changed: 66 additions & 37 deletions
@@ -749,6 +749,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 // utils
 //

+// Helper function to parse tensor buffer override strings
+static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto * buft = ggml_backend_dev_buffer_type(dev);
+        if (buft) {
+            buft_list[ggml_backend_buft_name(buft)] = buft;
+        }
+    }
+
+    for (const auto & override : string_split<std::string>(value, ',')) {
+        std::string::size_type pos = override.find('=');
+        if (pos == std::string::npos) {
+            throw std::invalid_argument("invalid value");
+        }
+        std::string tensor_name = override.substr(0, pos);
+        std::string buffer_type = override.substr(pos + 1);
+
+        if (buft_list.find(buffer_type) == buft_list.end()) {
+            printf("Available buffer types:\n");
+            for (const auto & it : buft_list) {
+                printf(" %s\n", ggml_backend_buft_name(it.second));
+            }
+            throw std::invalid_argument("unknown buffer type");
+        }
+        // keep strings alive and avoid leaking memory by storing them in a static vector
+        static std::list<std::string> buft_overrides;
+        buft_overrides.push_back(tensor_name);
+        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+    }
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
@@ -993,6 +1026,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }

+    if (!params.speculative.tensor_buft_overrides.empty()) {
+        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -2349,40 +2386,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
-            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
-            if (buft_list.empty()) {
-                // enumerate all the devices and add their buffer types to the list
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    auto * dev = ggml_backend_dev_get(i);
-                    auto * buft = ggml_backend_dev_buffer_type(dev);
-                    if (buft) {
-                        buft_list[ggml_backend_buft_name(buft)] = buft;
-                    }
-                }
-            }
-
-            for (const auto & override : string_split<std::string>(value, ',')) {
-                std::string::size_type pos = override.find('=');
-                if (pos == std::string::npos) {
-                    throw std::invalid_argument("invalid value");
-                }
-                std::string tensor_name = override.substr(0, pos);
-                std::string buffer_type = override.substr(pos + 1);
-
-                if (buft_list.find(buffer_type) == buft_list.end()) {
-                    printf("Available buffer types:\n");
-                    for (const auto & it : buft_list) {
-                        printf(" %s\n", ggml_backend_buft_name(it.second));
-                    }
-                    throw std::invalid_argument("unknown buffer type");
-                }
-                // keep strings alive and avoid leaking memory by storing them in a static vector
-                static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(tensor_name);
-                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
-            }
+            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
+            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2405,6 +2417,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_env("LLAMA_ARG_N_CPU_MOE"));
+    add_opt(common_arg(
+        {"--cpu-moe-draft", "-cmoed"},
+        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
+        [](common_params & params) {
+            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    add_opt(common_arg(
+        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                static std::list<std::string> buft_overrides_draft;
+                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -2949,11 +2982,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
        "(default: auto)",
        [](common_params & params, const std::string & value) {
-            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
-            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
-            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
-            else { throw std::invalid_argument("invalid value"); }
+            params.reasoning_format = common_reasoning_format_from_name(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
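
Taken together, the new --override-tensor-draft/-otd, --cpu-moe-draft/-cmoed and --n-cpu-moe-draft/-ncmoed options mirror the existing main-model flags for the speculative draft model. A hedged usage sketch, not part of this commit: the model paths are placeholders, -m/-md are assumed to be the usual model and draft-model flags, and the available buffer type names (such as CPU) depend on which backends the build enumerates, as listed by the unknown-buffer-type error path above.

# keep the draft model's MoE expert weights on the CPU during speculative decoding
./build/bin/llama-server -m models/main.gguf -md models/draft.gguf --cpu-moe-draft

# or override specific draft-model tensors by pattern, routing them to the CPU buffer type
./build/bin/llama-server -m models/main.gguf -md models/draft.gguf \
    -otd "blk\.([0-9]+)\.ffn_(up|down|gate)_exps=CPU"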

common/chat.cpp

Lines changed: 24 additions & 0 deletions
@@ -552,6 +552,17 @@ common_chat_templates_ptr common_chat_templates_init(
            default_template_src = CHATML_TEMPLATE_SRC;
        }
    }
+
+    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
+    if (default_template_src.find("<|channel|>") != std::string::npos
+        // search for the error message and patch it
+        && default_template_src.find("in message.content or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
+            "{%- if false %}");
+    }
+
    std::string token_bos = bos_token_override;
    std::string token_eos = eos_token_override;
    bool add_bos = false;
@@ -625,6 +636,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
    }
 }

+common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
+    if (format == "none") {
+        return COMMON_REASONING_FORMAT_NONE;
+    } else if (format == "auto") {
+        return COMMON_REASONING_FORMAT_AUTO;
+    } else if (format == "deepseek") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK;
+    } else if (format == "deepseek-legacy") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+    }
+    throw std::runtime_error("Unknown reasoning format: " + format);
+}
+
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
    std::string arguments;
    if (builder.is_partial()) {
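
As a usage note (an illustrative sketch, not part of the diff): the names accepted by common_reasoning_format_from_name are the same strings documented for the reasoning-format option in the common/arg.cpp hunk above, so the format can also be selected through that option's LLAMA_ARG_THINK environment variable; the model path below is a placeholder.

# illustrative only; the model path is a placeholder
LLAMA_ARG_THINK=deepseek ./build/bin/llama-server -m models/main.gguf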

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -191,6 +191,7 @@ std::string common_chat_format_example(

 const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);
+common_reasoning_format common_reasoning_format_from_name(const std::string & format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -202,6 +202,7 @@ struct common_params_speculative {
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f; // minimum speculative decoding probability (greedy)
    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
