
Commit 239af52

Author: prima
Merge remote-tracking branch 'origin/concedo_experimental' into remoteManagement
Parents: 35525c7 + c6f7603


65 files changed: +3201 / -1779 lines
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+name: Build on RISCV Linux Machine by Cloud-V
+on:
+  workflow_dispatch:
+  workflow_call:
+
+jobs:
+  bianbu-riscv64-native: # Bianbu 2.2
+    runs-on: self-hosted
+
+    steps:
+      - name: Install prerequisites
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y libatomic1
+      - uses: actions/checkout@v4
+      - name: Setup Riscv
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y --no-install-recommends \
+            build-essential \
+            gcc-14-riscv64-linux-gnu \
+            g++-14-riscv64-linux-gnu \
+            cmake
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_SYSTEM_NAME=Linux \
+            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
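The workflow runs on manual dispatch (workflow_dispatch) or when called from another workflow (workflow_call). A minimal sketch of a manual trigger with the GitHub CLI, assuming the workflow file lives under .github/workflows/ (the exact file name is not shown on this page and is a placeholder here):

    # trigger the RISC-V cross-compile job on the self-hosted runner
    gh workflow run riscv-native.yml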
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+name: "Copilot Setup Steps"
+
+# Automatically run the setup steps when they are changed to allow for easy validation, and
+# allow manual testing through the repository's "Actions" tab
+on:
+  workflow_dispatch:
+  push:
+    paths:
+      - .github/workflows/copilot-setup-steps.yml
+  pull_request:
+    paths:
+      - .github/workflows/copilot-setup-steps.yml
+
+jobs:
+  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
+  copilot-setup-steps:
+    runs-on: ubuntu-latest
+
+    # Set the permissions to the lowest permissions possible needed for your steps.
+    # Copilot will be given its own token for its operations.
+    permissions:
+      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
+      contents: read
+
+    # You can define any steps you want, and they will run before the agent starts.
+    # If you do not check out your code, Copilot will do this for you.
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: hendrikmuhs/[email protected]
+        with:
+          key: copilot-setup-steps
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m venv .venv
+          .venv/bin/activate
+          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
+          pip install flake8 pyright

android_install.sh

Lines changed: 11 additions & 6 deletions
@@ -19,10 +19,11 @@ elif [ -t 0 ]; then
     # Running interactively
     echo "[1] - Proceed to install and launch with default options - no model is loaded by default, but can be downloaded in the UI"
     echo "[2] - Proceed to install and not run."
-    echo "[3] - Select existing model to load (Requires already installed)"
-    echo "[4] - Exit script"
+    echo "[3] - Download GGUF model from web URL (Requires already installed)"
+    echo "[4] - Load existing GGUF model from disk (Requires already installed)"
+    echo "[5] - Exit script"
     echo "--------------------------------------------"
-    read -p "Enter your choice [1-4]: " choice
+    read -p "Enter your choice [1-5]: " choice
 else
     # Non-interactive, default to choice 1
     echo "Defaulting to normal install and model download. Run script interactively for other options. Install will start in 3 seconds."
@@ -38,10 +39,10 @@ else
 fi
 
 # handle user choice
-if [ "$choice" = "4" ]; then
+if [ "$choice" = "5" ]; then
     echo "Exiting script. Goodbye!"
     exit 0
-elif [ "$choice" = "3" ]; then
+elif [ "$choice" = "4" ]; then
     echo "[*] Searching for .gguf model files in $SCRIPT_DIR..."
     MODEL_FILES=$(find "$SCRIPT_DIR" -type f -maxdepth 1 -name "*.gguf" 2>/dev/null)
     if [ -z "$MODEL_FILES" ]; then
@@ -65,7 +66,11 @@ elif [ "$choice" = "3" ]; then
     echo "Now launching with model $SELECTED_MODEL"
     python koboldcpp.py --model $SELECTED_MODEL
     exit 0
-
+elif [ "$choice" = "3" ]; then
+    read -r -p "Please input FULL URL of model you wish to download and run: " SELECTED_MODEL
+    echo "Starting download of model $SELECTED_MODEL"
+    python koboldcpp.py --model $SELECTED_MODEL
+    exit 0
 elif [ "$choice" = "2" ]; then
     echo "[*] Install without model download..."
     INSTALL_MODEL=false
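The new menu option [3] ends up running the same command as the load-from-disk option, only with a web URL passed to --model instead of a local path. A sketch of the equivalent direct invocation (the URL is a placeholder):

    python koboldcpp.py --model https://example.com/some-model.gguf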

common/arg.cpp

Lines changed: 124 additions & 40 deletions
@@ -751,6 +751,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 // utils
 //
 
+// Helper function to parse tensor buffer override strings
+static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto * buft = ggml_backend_dev_buffer_type(dev);
+        if (buft) {
+            buft_list[ggml_backend_buft_name(buft)] = buft;
+        }
+    }
+
+    for (const auto & override : string_split<std::string>(value, ',')) {
+        std::string::size_type pos = override.find('=');
+        if (pos == std::string::npos) {
+            throw std::invalid_argument("invalid value");
+        }
+        std::string tensor_name = override.substr(0, pos);
+        std::string buffer_type = override.substr(pos + 1);
+
+        if (buft_list.find(buffer_type) == buft_list.end()) {
+            printf("Available buffer types:\n");
+            for (const auto & it : buft_list) {
+                printf("  %s\n", ggml_backend_buft_name(it.second));
+            }
+            throw std::invalid_argument("unknown buffer type");
+        }
+        // keep strings alive and avoid leaking memory by storing them in a static vector
+        static std::list<std::string> buft_overrides;
+        buft_overrides.push_back(tensor_name);
+        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+    }
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
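The helper accepts a comma-separated list of <tensor name pattern>=<buffer type> pairs, where the pattern is matched against tensor names and the buffer type must be one of the names enumerated from the available backends. A usage sketch with the -ot flag that calls it (model path is a placeholder; valid buffer type names such as CPU or CUDA0 depend on how the binary was built):

    # keep the expert tensors of layers 20-39 in host memory, everything else on the default backend
    llama-server -m model.gguf -ngl 99 \
      -ot 'blk\.(2[0-9]|3[0-9])\.ffn_.*_exps=CPU'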
@@ -995,6 +1028,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
+    if (!params.speculative.tensor_buft_overrides.empty()) {
+        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -1203,6 +1240,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             common_params_print_completion(ctx_arg);
             exit(0);
         }
+        params.lr.init();
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -1471,6 +1509,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--swa-checkpoints"}, "N",
+        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+        [](common_params & params, int value) {
+            params.n_swa_checkpoints = value;
+        }
+    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2351,40 +2397,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
-            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
-            if (buft_list.empty()) {
-                // enumerate all the devices and add their buffer types to the list
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    auto * dev = ggml_backend_dev_get(i);
-                    auto * buft = ggml_backend_dev_buffer_type(dev);
-                    if (buft) {
-                        buft_list[ggml_backend_buft_name(buft)] = buft;
-                    }
-                }
-            }
-
-            for (const auto & override : string_split<std::string>(value, ',')) {
-                std::string::size_type pos = override.find('=');
-                if (pos == std::string::npos) {
-                    throw std::invalid_argument("invalid value");
-                }
-                std::string tensor_name = override.substr(0, pos);
-                std::string buffer_type = override.substr(pos + 1);
-
-                if (buft_list.find(buffer_type) == buft_list.end()) {
-                    printf("Available buffer types:\n");
-                    for (const auto & it : buft_list) {
-                        printf("  %s\n", ggml_backend_buft_name(it.second));
-                    }
-                    throw std::invalid_argument("unknown buffer type");
-                }
-                // keep strings alive and avoid leaking memory by storing them in a static vector
-                static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(tensor_name);
-                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
-            }
+            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
+            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2407,6 +2428,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
+    add_opt(common_arg(
+        {"--cpu-moe-draft", "-cmoed"},
+        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
+        [](common_params & params) {
+            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    add_opt(common_arg(
+        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                static std::list<std::string> buft_overrides_draft;
+                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -2657,7 +2699,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2951,11 +2993,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
-            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
-            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
-            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
-            else { throw std::invalid_argument("invalid value"); }
+            params.reasoning_format = common_reasoning_format_from_name(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
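Since the option keeps its LLAMA_ARG_THINK environment variable, the name lookup can also be exercised without the flag; a sketch (model path is a placeholder; the accepted names, per the replaced branch chain, are none, auto, deepseek and deepseek-legacy):

    LLAMA_ARG_THINK=deepseek llama-server -m model.gguf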
@@ -3136,7 +3174,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-tbd", "--threads-batch-draft"}, "N",
         "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3146,7 +3184,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-Cd", "--cpu-mask-draft"}, "M",
         "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3539,5 +3577,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
 
+    add_opt(
+        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
+            string_format(
+                "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
+                (double) params.lr.lr0),
+            [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+            string_format(
+                "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+                (double) params.lr.lr_min),
+            [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
+            string_format(
+                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
+                (double) params.lr.decay_epochs),
+            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        { "-wd", "--weight-decay" }, "WD",
+        string_format(
+            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
+            (double) params.lr.wd),
+        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
+        string_format("fraction of data to use as validation set for training (default: %.2g).",
+            (double) params.val_split),
+        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
+        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
+        [](common_params & params, const std::string & name) {
+            params.optimizer = common_opt_get_optimizer(name.c_str());
+            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+            }
+        })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
     return ctx_arg;
 }
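A sketch that exercises the new finetune-only options together (the llama-finetune binary name, the -f data file flag, and the paths are assumptions; defaults come from params.lr as printed by the help text above):

    llama-finetune -m base-model.gguf -f train-data.txt \
      -opt sgd -lr 1e-4 -lr-min 1e-5 -decay-epochs 10 \
      -wd 1e-9 -val-split 0.05 -epochs 4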
