
Commit 239af52

Author: prima
Merge remote-tracking branch 'origin/concedo_experimental' into remoteManagement
Parents: 35525c7 + c6f7603


65 files changed: +3201 / -1779 lines
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+name: Build on RISCV Linux Machine by Cloud-V
+on:
+  workflow_dispatch:
+  workflow_call:
+
+jobs:
+  bianbu-riscv64-native: # Bianbu 2.2
+    runs-on: self-hosted
+
+    steps:
+      - name: Install prerequisites
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y libatomic1
+      - uses: actions/checkout@v4
+      - name: Setup Riscv
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y --no-install-recommends \
+            build-essential \
+            gcc-14-riscv64-linux-gnu \
+            g++-14-riscv64-linux-gnu \
+            cmake
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_SYSTEM_NAME=Linux \
+            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
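The workflow runs on manual dispatch (workflow_dispatch) or when called from another workflow (workflow_call). A minimal sketch of a manual trigger with the GitHub CLI, assuming the workflow file lives under .github/workflows/ (the exact file name is not shown on this page and is a placeholder here):

    # trigger the RISC-V cross-compile job on the self-hosted runner
    gh workflow run riscv-native.yml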
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+name: "Copilot Setup Steps"
+
+# Automatically run the setup steps when they are changed to allow for easy validation, and
+# allow manual testing through the repository's "Actions" tab
+on:
+  workflow_dispatch:
+  push:
+    paths:
+      - .github/workflows/copilot-setup-steps.yml
+  pull_request:
+    paths:
+      - .github/workflows/copilot-setup-steps.yml
+
+jobs:
+  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
+  copilot-setup-steps:
+    runs-on: ubuntu-latest
+
+    # Set the permissions to the lowest permissions possible needed for your steps.
+    # Copilot will be given its own token for its operations.
+    permissions:
+      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
+      contents: read
+
+    # You can define any steps you want, and they will run before the agent starts.
+    # If you do not check out your code, Copilot will do this for you.
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/[email protected]
+        with:
+          key: copilot-setup-steps
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m venv .venv
+          .venv/bin/activate
+          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
+          pip install flake8 pyright

android_install.sh

Lines changed: 11 additions & 6 deletions
@@ -19,10 +19,11 @@ elif [ -t 0 ]; then
     # Running interactively
     echo "[1] - Proceed to install and launch with default options - no model is loaded by default, but can be downloaded in the UI"
     echo "[2] - Proceed to install and not run."
-    echo "[3] - Select existing model to load (Requires already installed)"
-    echo "[4] - Exit script"
+    echo "[3] - Download GGUF model from web URL (Requires already installed)"
+    echo "[4] - Load existing GGUF model from disk (Requires already installed)"
+    echo "[5] - Exit script"
     echo "--------------------------------------------"
-    read -p "Enter your choice [1-4]: " choice
+    read -p "Enter your choice [1-5]: " choice
 else
     # Non-interactive, default to choice 1
     echo "Defaulting to normal install and model download. Run script interactively for other options. Install will start in 3 seconds."
@@ -38,10 +39,10 @@ else
 fi
 
 # handle user choice
-if [ "$choice" = "4" ]; then
+if [ "$choice" = "5" ]; then
     echo "Exiting script. Goodbye!"
     exit 0
-elif [ "$choice" = "3" ]; then
+elif [ "$choice" = "4" ]; then
     echo "[*] Searching for .gguf model files in $SCRIPT_DIR..."
     MODEL_FILES=$(find "$SCRIPT_DIR" -type f -maxdepth 1 -name "*.gguf" 2>/dev/null)
     if [ -z "$MODEL_FILES" ]; then
@@ -65,7 +66,11 @@ elif [ "$choice" = "3" ]; then
     echo "Now launching with model $SELECTED_MODEL"
     python koboldcpp.py --model $SELECTED_MODEL
     exit 0
-
+elif [ "$choice" = "3" ]; then
+    read -r -p "Please input FULL URL of model you wish to download and run: " SELECTED_MODEL
+    echo "Starting download of model $SELECTED_MODEL"
+    python koboldcpp.py --model $SELECTED_MODEL
+    exit 0
 elif [ "$choice" = "2" ]; then
     echo "[*] Install without model download..."
     INSTALL_MODEL=false
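The new menu option [3] ends up running the same command as the load-from-disk option, only with a web URL passed to --model instead of a local path. A sketch of the equivalent direct invocation (the URL is a placeholder):

    python koboldcpp.py --model https://example.com/some-model.gguf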

common/arg.cpp

Lines changed: 124 additions & 40 deletions
@@ -751,6 +751,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 // utils
 //
 
+// Helper function to parse tensor buffer override strings
+static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto * buft = ggml_backend_dev_buffer_type(dev);
+        if (buft) {
+            buft_list[ggml_backend_buft_name(buft)] = buft;
+        }
+    }
+
+    for (const auto & override : string_split<std::string>(value, ',')) {
+        std::string::size_type pos = override.find('=');
+        if (pos == std::string::npos) {
+            throw std::invalid_argument("invalid value");
+        }
+        std::string tensor_name = override.substr(0, pos);
+        std::string buffer_type = override.substr(pos + 1);
+
+        if (buft_list.find(buffer_type) == buft_list.end()) {
+            printf("Available buffer types:\n");
+            for (const auto & it : buft_list) {
+                printf("  %s\n", ggml_backend_buft_name(it.second));
+            }
+            throw std::invalid_argument("unknown buffer type");
+        }
+        // keep strings alive and avoid leaking memory by storing them in a static vector
+        static std::list<std::string> buft_overrides;
+        buft_overrides.push_back(tensor_name);
+        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+    }
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
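The helper accepts a comma-separated list of <tensor name pattern>=<buffer type> pairs, where the pattern is matched against tensor names and the buffer type must be one of the names enumerated from the available backends. A usage sketch with the -ot flag that calls it (model path is a placeholder; valid buffer type names such as CPU or CUDA0 depend on how the binary was built):

    # keep the expert tensors of layers 20-39 in host memory, everything else on the default backend
    llama-server -m model.gguf -ngl 99 \
      -ot 'blk\.(2[0-9]|3[0-9])\.ffn_.*_exps=CPU'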
@@ -995,6 +1028,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
+    if (!params.speculative.tensor_buft_overrides.empty()) {
+        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -1203,6 +1240,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             common_params_print_completion(ctx_arg);
             exit(0);
         }
+        params.lr.init();
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -1471,6 +1509,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--swa-checkpoints"}, "N",
+        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+        [](common_params & params, int value) {
+            params.n_swa_checkpoints = value;
+        }
+    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2351,40 +2397,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
-            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
-            if (buft_list.empty()) {
-                // enumerate all the devices and add their buffer types to the list
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    auto * dev = ggml_backend_dev_get(i);
-                    auto * buft = ggml_backend_dev_buffer_type(dev);
-                    if (buft) {
-                        buft_list[ggml_backend_buft_name(buft)] = buft;
-                    }
-                }
-            }
-
-            for (const auto & override : string_split<std::string>(value, ',')) {
-                std::string::size_type pos = override.find('=');
-                if (pos == std::string::npos) {
-                    throw std::invalid_argument("invalid value");
-                }
-                std::string tensor_name = override.substr(0, pos);
-                std::string buffer_type = override.substr(pos + 1);
-
-                if (buft_list.find(buffer_type) == buft_list.end()) {
-                    printf("Available buffer types:\n");
-                    for (const auto & it : buft_list) {
-                        printf("  %s\n", ggml_backend_buft_name(it.second));
-                    }
-                    throw std::invalid_argument("unknown buffer type");
-                }
-                // keep strings alive and avoid leaking memory by storing them in a static vector
-                static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(tensor_name);
-                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
-            }
+            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
+            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2407,6 +2428,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
+    add_opt(common_arg(
+        {"--cpu-moe-draft", "-cmoed"},
+        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
+        [](common_params & params) {
+            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    add_opt(common_arg(
+        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                static std::list<std::string> buft_overrides_draft;
+                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -2657,7 +2699,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2951,11 +2993,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
-            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
-            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
-            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
-            else { throw std::invalid_argument("invalid value"); }
+            params.reasoning_format = common_reasoning_format_from_name(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
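Since the option keeps its LLAMA_ARG_THINK environment variable, the name lookup can also be exercised without the flag; a sketch (model path is a placeholder; the accepted names, per the replaced branch chain, are none, auto, deepseek and deepseek-legacy):

    LLAMA_ARG_THINK=deepseek llama-server -m model.gguf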
@@ -3136,7 +3174,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-tbd", "--threads-batch-draft"}, "N",
         "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3146,7 +3184,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-Cd", "--cpu-mask-draft"}, "M",
         "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3539,5 +3577,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
 
+    add_opt(
+        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
+            string_format(
+                "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
+                (double) params.lr.lr0),
+            [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+            string_format(
+                "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+                (double) params.lr.lr_min),
+            [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
+            string_format(
+                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
+                (double) params.lr.decay_epochs),
+            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        { "-wd", "--weight-decay" }, "WD",
+        string_format(
+            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
+            (double) params.lr.wd),
+        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
+        string_format("fraction of data to use as validation set for training (default: %.2g).",
+            (double) params.val_split),
+        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
+        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
+        [](common_params & params, const std::string & name) {
+            params.optimizer = common_opt_get_optimizer(name.c_str());
+            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+            }
+        })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
     return ctx_arg;
 }
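A sketch that exercises the new finetune-only options together (the llama-finetune binary name, the -f data file flag, and the paths are assumptions; defaults come from params.lr as printed by the help text above):

    llama-finetune -m base-model.gguf -f train-data.txt \
      -opt sgd -lr 1e-4 -lr-min 1e-5 -decay-epochs 10 \
      -wd 1e-9 -val-split 0.05 -epochs 4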
