From e3edebee0616481dc46c892b9acade851c46ac90 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 10:23:00 -0400 Subject: [PATCH 1/9] Add comprehensive LoRA adapter support to llamafile server Implements full LoRA (Low-Rank Adaptation) adapter support compatible with llama.cpp, enabling fine-tuning capabilities in llamafile server mode. Features: - Multiple LoRA adapter support with individual scaling factors - New command-line flags: --lora, --lora-scaled, --lora-base - Automatic memory mapping disabling for LoRA compatibility - Per-slot adapter application during initialization - Clean resource management and cleanup on shutdown Changes: - flags.cpp: Add LoRA flag parsing and global adapter management - prog.cpp: Implement adapter loading, validation, and cleanup - slot.cpp/slot.h: Add slot-level adapter application logic - llamafile.h: Define LoRA adapter data structures and constants - README.md: Add comprehensive LoRA usage documentation - RELEASE.md: Document new LoRA features for release notes The implementation follows llama.cpp patterns for maximum compatibility and provides a solid foundation for advanced fine-tuning workflows. Tested with Llama 3 8B + LoRA adapters, supporting both single and multiple adapter configurations with custom scaling factors. Resolves #697 --- .gitignore | 1 + README.md | 32 ++++++++++++++++++++ RELEASE.md | 26 ++++++++++++++++ llamafile/flags.cpp | 53 ++++++++++++++++++++++++++++++++ llamafile/llamafile.h | 12 ++++++++ llamafile/server/prog.cpp | 63 +++++++++++++++++++++++++++++++++++++-- llamafile/server/slot.cpp | 19 ++++++++++++ llamafile/server/slot.h | 8 +++++ 8 files changed, 212 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index d928dde4e9..16feca060d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ /trace.json /*.log +/.models \ No newline at end of file diff --git a/README.md b/README.md index e19d0a6014..25f1962314 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,38 @@ llamafile --server --v2 --help llamafile --server --v2 ``` +## LoRA Adapter Support + +Llamafile supports LoRA (Low-Rank Adaptation) adapters, allowing you to fine-tune models with adapter layers applied on top of the base model. This is compatible with adapters created for llama.cpp. 
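+
+Once the server is running with one or more adapters (see the usage examples below), inference goes through the same OpenAI-compatible endpoints as usual, with the adapters applied transparently on top of the base model. As a rough sanity check — the port is llamafile's default `8080`, and the prompt and `model` value are only examples:
+
+```bash
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "LLaMA_CPP",
+    "messages": [{"role": "user", "content": "Say something in the style this adapter was trained for."}]
+  }'
+```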
+ +### Using LoRA Adapters + +To use LoRA adapters with llamafile server, use the `--lora` and `--lora-scaled` flags: + +```bash +# Single adapter with default scale (1.0) +llamafile -m base_model.gguf --lora adapter.gguf --server + +# Single adapter with custom scale +llamafile -m base_model.gguf --lora-scaled adapter.gguf 0.8 --server + +# Multiple adapters with different scales +llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server +``` + +### LoRA Adapter Flags + +- `--lora [FNAME]`: Apply a LoRA adapter from the specified file (default scale: 1.0) +- `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor +- `--lora-base [FNAME]`: Optional base model for LoRA adapter (usually not needed) + +### Important Notes + +- LoRA adapters are automatically applied to each inference slot when they start +- Memory mapping (`mmap`) is automatically disabled when using LoRA adapters for compatibility +- Multiple adapters can be combined by using multiple `--lora` and `--lora-scaled` flags +- The base model specified by `-m` serves as the foundation for the LoRA adapter(s) + ## Other example llamafiles We also provide example llamafiles for other models, so you can easily diff --git a/RELEASE.md b/RELEASE.md index 76a1c5a1ae..b7fb2634e4 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,6 +2,32 @@ There are a few steps in making a Llamafile release which will be detailed in this document. +## What's New in This Release + +### LoRA Adapter Support + +This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, enabling fine-tuning capabilities compatible with llama.cpp. Key features include: + +- **Multiple LoRA Adapter Support**: Load and apply multiple LoRA adapters simultaneously with individual scaling factors +- **Server Integration**: Full integration with the llamafile server (`--server` mode) for LoRA-enhanced inference +- **Compatible Flags**: + - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0) + - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with custom scaling factor + - `--lora-base [FNAME]`: Optional base model for LoRA adapter (advanced use cases) +- **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility +- **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across server lifecycle + +Example usage: +```bash +# Single adapter with default scale +llamafile -m base_model.gguf --lora adapter.gguf --server + +# Multiple adapters with different scales +llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server +``` + +This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows. + The two primary artifacts of the release are the `llamafile-.zip` and the binaries for the GitHub release. 
## Release Process diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp index c0e3bb3b74..37e25cb4d0 100644 --- a/llamafile/flags.cpp +++ b/llamafile/flags.cpp @@ -65,6 +65,12 @@ const char *FLAG_model = nullptr; const char *FLAG_prompt = nullptr; const char *FLAG_url_prefix = ""; const char *FLAG_www_root = "/zip/www"; +const char *FLAG_lora = nullptr; +const char *FLAG_lora_base = nullptr; + +// Multiple LoRA adapters support +struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS] = {0}; +int FLAG_lora_adapters_count = 0; double FLAG_token_rate = 1; float FLAG_decay_growth = .01; float FLAG_frequency_penalty = 0; @@ -385,6 +391,53 @@ void llamafile_get_flags(int argc, char **argv) { continue; } + ////////////////////////////////////////////////////////////////////// + // LoRA flags + + if (!strcmp(flag, "--lora")) { + if (i == argc) + missing("--lora"); + if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) { + error("too many LoRA adapters (max 8)"); + } + FLAG_lora_adapters[FLAG_lora_adapters_count].path = argv[i++]; + FLAG_lora_adapters[FLAG_lora_adapters_count].scale = 1.0f; + FLAG_lora_adapters_count++; + + // Keep FLAG_lora for backward compatibility + if (!FLAG_lora) { + FLAG_lora = FLAG_lora_adapters[0].path; + } + continue; + } + + if (!strcmp(flag, "--lora-scaled")) { + if (i == argc) + missing("--lora-scaled"); + const char* lora_adapter = argv[i++]; + if (i == argc) + missing("--lora-scaled scale value"); + if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) { + error("too many LoRA adapters (max 8)"); + } + FLAG_lora_adapters[FLAG_lora_adapters_count].path = lora_adapter; + FLAG_lora_adapters[FLAG_lora_adapters_count].scale = atof(argv[i++]); + FLAG_lora_adapters_count++; + + // Keep FLAG_lora for backward compatibility + if (!FLAG_lora) { + FLAG_lora = FLAG_lora_adapters[0].path; + } + continue; + } + + if (!strcmp(flag, "--lora-base")) { + if (i == argc) + missing("--lora-base"); + FLAG_lora_base = argv[i++]; + continue; + } + ////////////////////////////////////////////////////////////////////// // model flags diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h index b74dda60dd..c5fd428439 100644 --- a/llamafile/llamafile.h +++ b/llamafile/llamafile.h @@ -36,6 +36,18 @@ extern const char *FLAG_prompt; extern const char *FLAG_url_prefix; extern const char *FLAG_www_root; extern double FLAG_token_rate; +extern const char *FLAG_lora; +extern const char *FLAG_lora_base; + +// LoRA adapter info structure to match llama.cpp +struct llamafile_lora_adapter_info { + const char* path; + float scale; +}; + +#define MAX_LORA_ADAPTERS 8 +extern struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS]; +extern int FLAG_lora_adapters_count; extern float FLAG_decay_growth; extern float FLAG_frequency_penalty; extern float FLAG_presence_penalty; diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp index bd6e6b6a24..237be622de 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -29,6 +29,31 @@ #include #include +// Global LoRA adapter storage for multiple adapters +#define MAX_LORA_ADAPTERS 8 +struct lora_adapter_container { + struct llama_lora_adapter* adapter; + float scale; +}; + +static struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {0}; +static int g_lora_adapters_count = 0; + +// Function to get the first global LoRA adapter for backward compatibility +extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { + return g_lora_adapters_count > 0 ? 
g_lora_adapters[0].adapter : nullptr; +} + +// Function to get all LoRA adapters and their count +extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { + int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; + for (int i = 0; i < count; i++) { + adapters[i] = g_lora_adapters[i].adapter; + scales[i] = g_lora_adapters[i].scale; + } + return count; +} + namespace lf { namespace server { @@ -69,6 +94,8 @@ main(int argc, char* argv[]) FLAG_log_disable = true; // load model + // --lora implies --no-mmap (as per llama.cpp server) + bool use_mmap = FLAG_mmap && (FLAG_lora_adapters_count == 0); llama_model_params mparams = { .n_gpu_layers = FLAG_n_gpu_layers, .split_mode = (enum llama_split_mode)FLAG_split_mode, @@ -79,8 +106,8 @@ main(int argc, char* argv[]) .progress_callback_user_data = nullptr, .kv_overrides = nullptr, .vocab_only = false, - .use_mmap = true, - .use_mlock = false, + .use_mmap = use_mmap, + .use_mlock = FLAG_mlock, .check_tensors = false, }; llama_model* model = llama_load_model_from_file(FLAG_model, mparams); @@ -89,6 +116,30 @@ main(int argc, char* argv[]) exit(1); } + // load LoRA adapters if specified + if (FLAG_lora_adapters_count > 0) { + SLOG("loading %d LoRA adapter(s)", FLAG_lora_adapters_count); + for (int i = 0; i < FLAG_lora_adapters_count; i++) { + SLOG("loading LoRA adapter %d from %s with scale %.2f", i + 1, + FLAG_lora_adapters[i].path, FLAG_lora_adapters[i].scale); + g_lora_adapters[i].adapter = llama_lora_adapter_init(model, FLAG_lora_adapters[i].path); + g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; + if (!g_lora_adapters[i].adapter) { + fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, FLAG_lora_adapters[i].path); + // Cleanup previously loaded adapters + for (int j = 0; j < i; j++) { + if (g_lora_adapters[j].adapter) { + llama_lora_adapter_free(g_lora_adapters[j].adapter); + } + } + llama_free_model(model); + exit(1); + } + g_lora_adapters_count++; + } + SLOG("all LoRA adapters loaded successfully"); + } + // create slots Slots* slots = new Slots(model); if (!slots->start(FLAG_slots)) { @@ -120,6 +171,14 @@ main(int argc, char* argv[]) g_server->close(); delete g_server; delete slots; + + // Cleanup LoRA adapters + for (int i = 0; i < g_lora_adapters_count; i++) { + if (g_lora_adapters[i].adapter) { + llama_lora_adapter_free(g_lora_adapters[i].adapter); + } + } + llama_free_model(model); tokenbucket_destroy(); time_destroy(); diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index c57ca2541c..d11d9bdf4e 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -125,6 +125,25 @@ Slot::start() system_fingerprint_ = generate_system_fingerprint(&cparams); if (!(ctx_ = llama_new_context_with_model(model_, cparams))) return false; + + // Apply LoRA adapters if available + struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; + float scales[MAX_LORA_ADAPTERS]; + int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); + + if (adapter_count > 0) { + SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); + for (int i = 0; i < adapter_count; i++) { + if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { + SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); + llama_free(ctx_); + ctx_ = nullptr; + return false; + } + SLOG("applied LoRA adapter %d to slot #%d with scale %.2f", i + 1, id_, scales[i]); + } + } + if (FLAG_mmproj) if 
(!(clip_ctx_ = clip_model_load(FLAG_mmproj, FLAG_verbose))) return false; diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h index 7fdd7bf881..e8816c9009 100644 --- a/llamafile/server/slot.h +++ b/llamafile/server/slot.h @@ -23,11 +23,19 @@ #include #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) +#define MAX_LORA_ADAPTERS 8 struct llama_context; struct llama_model; +struct llama_lora_adapter; struct clip_ctx; +// Function to get the global LoRA adapter +extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter(); + +// Function to get multiple LoRA adapters with their scales +extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters); + namespace lf { namespace server { From dc8a203dceac8b570e25af5d87cb37b0595026a0 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 10:56:35 -0400 Subject: [PATCH 2/9] fixes scale printing in server log --- llamafile/server/prog.cpp | 6 ++++-- llamafile/server/slot.cpp | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp index 237be622de..6377cabc78 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -120,8 +120,10 @@ main(int argc, char* argv[]) if (FLAG_lora_adapters_count > 0) { SLOG("loading %d LoRA adapter(s)", FLAG_lora_adapters_count); for (int i = 0; i < FLAG_lora_adapters_count; i++) { - SLOG("loading LoRA adapter %d from %s with scale %.2f", i + 1, - FLAG_lora_adapters[i].path, FLAG_lora_adapters[i].scale); + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); + SLOG("loading LoRA adapter %d from %s with scale %s", i + 1, + FLAG_lora_adapters[i].path, scale_buf); g_lora_adapters[i].adapter = llama_lora_adapter_init(model, FLAG_lora_adapters[i].path); g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; if (!g_lora_adapters[i].adapter) { diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index d11d9bdf4e..55138417b1 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -140,7 +140,9 @@ Slot::start() ctx_ = nullptr; return false; } - SLOG("applied LoRA adapter %d to slot #%d with scale %.2f", i + 1, id_, scales[i]); + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); + SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); } } From e3288bcba9d294cbe83601a251fca8a7e85c5ffc Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:29:47 -0400 Subject: [PATCH 3/9] adds multi-lora hot-swapping functionality with --lora-init-without-apply (mirroring llama.cpp functionality) --- README.md | 193 ++++++---- RELEASE.md | 29 +- llamafile/flags.cpp | 8 +- llamafile/llamafile.h | 3 +- llamafile/server/client.cpp | 2 + llamafile/server/client.h | 22 ++ llamafile/server/lora_adapters.cpp | 325 +++++++++++++++++ llamafile/server/main.1 | 25 ++ llamafile/server/main.1.asc | 561 +++++++++++++++-------------- llamafile/server/prog.cpp | 44 ++- 10 files changed, 842 insertions(+), 370 deletions(-) create mode 100644 llamafile/server/lora_adapters.cpp diff --git a/README.md b/README.md index 25f1962314..3c6cfb127f 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ ever leaves your computer. 2. Open your computer's terminal. 3. If you're using macOS, Linux, or BSD, you'll need to grant permission -for your computer to execute this new file. (You only need to do this -once.) + for your computer to execute this new file. 
(You only need to do this + once.) ```sh chmod +x llava-v1.5-7b-q4.llamafile @@ -48,10 +48,10 @@ chmod +x llava-v1.5-7b-q4.llamafile ``` 6. Your browser should open automatically and display a chat interface. -(If it doesn't, just open your browser and point it at http://localhost:8080) + (If it doesn't, just open your browser and point it at http://localhost:8080) 7. When you're done chatting, return to your terminal and hit -`Control-C` to shut down llamafile. + `Control-C` to shut down llamafile. **Having trouble? See the "Gotchas" section below.** @@ -103,25 +103,25 @@ The response that's printed should look like the following: ```json { - "choices" : [ - { - "finish_reason" : "stop", - "index" : 0, - "message" : { - "content" : "There once was a programmer named Mike\nWho wrote code that would often choke\nHe used try and except\nTo handle each step\nAnd his program ran without any hike.", - "role" : "assistant" - } + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "There once was a programmer named Mike\nWho wrote code that would often choke\nHe used try and except\nTo handle each step\nAnd his program ran without any hike.", + "role": "assistant" } - ], - "created" : 1704199256, - "id" : "chatcmpl-Dt16ugf3vF8btUZj9psG7To5tc4murBU", - "model" : "LLaMA_CPP", - "object" : "chat.completion", - "usage" : { - "completion_tokens" : 38, - "prompt_tokens" : 78, - "total_tokens" : 116 - } + } + ], + "created": 1704199256, + "id": "chatcmpl-Dt16ugf3vF8btUZj9psG7To5tc4murBU", + "model": "LLaMA_CPP", + "object": "chat.completion", + "usage": { + "completion_tokens": 38, + "prompt_tokens": 78, + "total_tokens": 116 + } } ``` @@ -201,39 +201,77 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor - `--lora-base [FNAME]`: Optional base model for LoRA adapter (usually not needed) +### Dynamic LoRA Adapter Management (Hot-Swapping) + +When running llamafile in server mode, you can dynamically adjust LoRA adapter scales during inference without restarting the server. This enables hot-swapping of adapter configurations on the fly. 
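+
+Because the server re-applies adapters to its inference slots with the scales you post, setting an adapter's scale to `0` effectively switches it off without unloading it, and a later positive scale turns it back on. For example — this assumes two adapters were loaded at startup with IDs `0` and `1`, using the endpoint described below:
+
+```bash
+# Keep the first adapter at full strength, disable the second one
+curl -X POST http://localhost:8080/lora-adapters \
+  -H "Content-Type: application/json" \
+  -d '[{"id": 0, "scale": 1.0}, {"id": 1, "scale": 0.0}]'
+```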
+ +#### API Endpoints + +- **GET** `/lora-adapters`: Returns current LoRA adapter configuration +- **POST** `/lora-adapters`: Updates LoRA adapter scales with JSON payload + +#### Examples + +**View current LoRA adapter configuration:** + +```bash +curl http://localhost:8080/lora-adapters +``` + +**Update LoRA adapter scales:** + +```bash +curl -X POST http://localhost:8080/lora-adapters \ + -H "Content-Type: application/json" \ + -d '[ + {"id": 0, "scale": 0.8}, + {"id": 1, "scale": 1.2} + ]' +``` + +The API returns JSON responses with the current adapter configuration: + +```json +[ + { "id": 0, "path": "adapter1.gguf", "scale": 0.8 }, + { "id": 1, "path": "adapter2.gguf", "scale": 1.2 } +] +``` + ### Important Notes - LoRA adapters are automatically applied to each inference slot when they start - Memory mapping (`mmap`) is automatically disabled when using LoRA adapters for compatibility - Multiple adapters can be combined by using multiple `--lora` and `--lora-scaled` flags - The base model specified by `-m` serves as the foundation for the LoRA adapter(s) +- Scale changes via the API take effect immediately for new inference requests +- Hot-swapping allows real-time fine-tuning of model behavior without server restart ## Other example llamafiles We also provide example llamafiles for other models, so you can easily try out llamafile with different kinds of LLMs. -| Model | Size | License | llamafile | other quants | -| --- | --- | --- | --- | --- | -| LLaMA 3.2 1B Instruct | 1.11 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-1B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile) | -| LLaMA 3.2 3B Instruct | 2.62 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-3B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/Llama-3.2-3B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile) | -| LLaMA 3.1 8B Instruct | 5.23 GB | [LLaMA 3.1](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.1-8B-Instruct.Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile) | -| Gemma 3 1B Instruct | 1.32 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-1b-it.Q6\_K.llamafile](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile/resolve/main/google_gemma-3-1b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile) | -| Gemma 3 4B Instruct | 3.50 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-4b-it.Q6\_K.llamafile](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile/resolve/main/google_gemma-3-4b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile) | -| Gemma 3 12B Instruct | 7.61 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-12b-it.Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile/resolve/main/google_gemma-3-12b-it-Q4_K_M.llamafile?download=true) | [See HF 
repo](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile) | -| QwQ 32B | 7.61 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen\_QwQ-32B-Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/QwQ-32B-llamafile/resolve/main/Qwen_QwQ-32B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/QwQ-32B-llamafile) | -| R1 Distill Qwen 14B | 9.30 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Qwen-14B-Q4\_K\_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile)| -| R1 Distill Llama 8B | 5.23 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Llama-8B-Q4\_K\_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile)| -| LLaVA 1.5 | 3.97 GB | [LLaMA 2](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) | [llava-v1.5-7b-q4.llamafile](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/llava-v1.5-7b-q4.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile) | -| Mistral-7B-Instruct v0.3| 4.42 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mistral-7b-instruct-v0.3.Q4\_0.llamafile](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile/resolve/main/Mistral-7B-Instruct-v0.3.Q4_0.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile) | -| Granite 3.2 8B Instruct | 5.25 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [granite-3.2-8b-instruct-Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile/resolve/main/granite-3.2-8b-instruct-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile) | -| Phi-3-mini-4k-instruct | 7.67 GB | [Apache 2.0](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/blob/main/LICENSE) | [Phi-3-mini-4k-instruct.F16.llamafile](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/resolve/main/Phi-3-mini-4k-instruct.F16.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile) | -| Mixtral-8x7B-Instruct | 30.03 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mixtral-8x7b-instruct-v0.1.Q5\_K\_M.llamafile](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile) | -| OLMo-7B | 5.68 GB | [Apache 2.0](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/blob/main/LICENSE) | [OLMo-7B-0424.Q6\_K.llamafile](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/resolve/main/OLMo-7B-0424.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile) | -| *Text Embedding Models* | | | | | -| E5-Mistral-7B-Instruct | 5.16 GB | [MIT](https://choosealicense.com/licenses/mit/) | [e5-mistral-7b-instruct-Q5_K_M.llamafile](https://huggingface.co/Mozilla/e5-mistral-7b-instruct/resolve/main/e5-mistral-7b-instruct-Q5_K_M.llamafile?download=true) | [See HF 
repo](https://huggingface.co/Mozilla/e5-mistral-7b-instruct) | -| mxbai-embed-large-v1 | 0.7 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mxbai-embed-large-v1-f16.llamafile](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile/resolve/main/mxbai-embed-large-v1-f16.llamafile?download=true) | [See HF Repo](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile) | - +| Model | Size | License | llamafile | other quants | +| ------------------------ | -------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | +| LLaMA 3.2 1B Instruct | 1.11 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-1B-Instruct.Q6_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile) | +| LLaMA 3.2 3B Instruct | 2.62 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-3B-Instruct.Q6_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/Llama-3.2-3B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile) | +| LLaMA 3.1 8B Instruct | 5.23 GB | [LLaMA 3.1](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.1-8B-Instruct.Q4_K_M.llamafile](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile) | +| Gemma 3 1B Instruct | 1.32 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-1b-it.Q6_K.llamafile](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile/resolve/main/google_gemma-3-1b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile) | +| Gemma 3 4B Instruct | 3.50 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-4b-it.Q6_K.llamafile](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile/resolve/main/google_gemma-3-4b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile) | +| Gemma 3 12B Instruct | 7.61 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-12b-it.Q4_K_M.llamafile](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile/resolve/main/google_gemma-3-12b-it-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile) | +| QwQ 32B | 7.61 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen_QwQ-32B-Q4_K_M.llamafile](https://huggingface.co/Mozilla/QwQ-32B-llamafile/resolve/main/Qwen_QwQ-32B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/QwQ-32B-llamafile) | +| R1 Distill Qwen 14B | 9.30 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Qwen-14B-Q4_K_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.llamafile?download=true) | [See HF 
repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile) | +| R1 Distill Llama 8B | 5.23 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Llama-8B-Q4_K_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile) | +| LLaVA 1.5 | 3.97 GB | [LLaMA 2](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) | [llava-v1.5-7b-q4.llamafile](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/llava-v1.5-7b-q4.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile) | +| Mistral-7B-Instruct v0.3 | 4.42 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mistral-7b-instruct-v0.3.Q4_0.llamafile](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile/resolve/main/Mistral-7B-Instruct-v0.3.Q4_0.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile) | +| Granite 3.2 8B Instruct | 5.25 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [granite-3.2-8b-instruct-Q4_K_M.llamafile](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile/resolve/main/granite-3.2-8b-instruct-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile) | +| Phi-3-mini-4k-instruct | 7.67 GB | [Apache 2.0](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/blob/main/LICENSE) | [Phi-3-mini-4k-instruct.F16.llamafile](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/resolve/main/Phi-3-mini-4k-instruct.F16.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile) | +| Mixtral-8x7B-Instruct | 30.03 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile) | +| OLMo-7B | 5.68 GB | [Apache 2.0](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/blob/main/LICENSE) | [OLMo-7B-0424.Q6_K.llamafile](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/resolve/main/OLMo-7B-0424.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile) | +| _Text Embedding Models_ | | | | | +| E5-Mistral-7B-Instruct | 5.16 GB | [MIT](https://choosealicense.com/licenses/mit/) | [e5-mistral-7b-instruct-Q5_K_M.llamafile](https://huggingface.co/Mozilla/e5-mistral-7b-instruct/resolve/main/e5-mistral-7b-instruct-Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/e5-mistral-7b-instruct) | +| mxbai-embed-large-v1 | 0.7 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mxbai-embed-large-v1-f16.llamafile](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile/resolve/main/mxbai-embed-large-v1-f16.llamafile?download=true) | [See HF Repo](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile) | Here is an example for the Mistral command-line llamafile: @@ -243,9 +281,9 @@ Here is an example for the Mistral command-line llamafile: And here is an example for WizardCoder-Python command-line llamafile: -```sh +````sh ./wizardcoder-python-13b.llamafile --temp 0 -e -r '```\n' 
-p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n' -``` +```` And here's an example for the LLaVA command-line llamafile: @@ -278,38 +316,38 @@ This is all accomplished by combining llama.cpp with Cosmopolitan Libc, which provides some useful capabilities: 1. llamafiles can run on multiple CPU microarchitectures. We -added runtime dispatching to llama.cpp that lets new Intel systems use -modern CPU features without trading away support for older computers. + added runtime dispatching to llama.cpp that lets new Intel systems use + modern CPU features without trading away support for older computers. 2. llamafiles can run on multiple CPU architectures. We do -that by concatenating AMD64 and ARM64 builds with a shell script that -launches the appropriate one. Our file format is compatible with WIN32 -and most UNIX shells. It's also able to be easily converted (by either -you or your users) to the platform-native format, whenever required. + that by concatenating AMD64 and ARM64 builds with a shell script that + launches the appropriate one. Our file format is compatible with WIN32 + and most UNIX shells. It's also able to be easily converted (by either + you or your users) to the platform-native format, whenever required. 3. llamafiles can run on six OSes (macOS, Windows, Linux, -FreeBSD, OpenBSD, and NetBSD). If you make your own llama files, you'll -only need to build your code once, using a Linux-style toolchain. The -GCC-based compiler we provide is itself an Actually Portable Executable, -so you can build your software for all six OSes from the comfort of -whichever one you prefer most for development. + FreeBSD, OpenBSD, and NetBSD). If you make your own llama files, you'll + only need to build your code once, using a Linux-style toolchain. The + GCC-based compiler we provide is itself an Actually Portable Executable, + so you can build your software for all six OSes from the comfort of + whichever one you prefer most for development. 4. The weights for an LLM can be embedded within the llamafile. -We added support for PKZIP to the GGML library. This lets uncompressed -weights be mapped directly into memory, similar to a self-extracting -archive. It enables quantized weights distributed online to be prefixed -with a compatible version of the llama.cpp software, thereby ensuring -its originally observed behaviors can be reproduced indefinitely. + We added support for PKZIP to the GGML library. This lets uncompressed + weights be mapped directly into memory, similar to a self-extracting + archive. It enables quantized weights distributed online to be prefixed + with a compatible version of the llama.cpp software, thereby ensuring + its originally observed behaviors can be reproduced indefinitely. 5. Finally, with the tools included in this project you can create your -*own* llamafiles, using any compatible model weights you want. You can -then distribute these llamafiles to other people, who can easily make -use of them regardless of what kind of computer they have. + _own_ llamafiles, using any compatible model weights you want. You can + then distribute these llamafiles to other people, who can easily make + use of them regardless of what kind of computer they have. ## Using llamafile with external weights Even though our example llamafiles have the weights built-in, you don't -*have* to use llamafile that way. Instead, you can download *just* the +_have_ to use llamafile that way. 
Instead, you can download _just_ the llamafile software (without any weights included) from our releases page. You can then use it alongside any external weights you may have on hand. External weights are particularly useful for Windows users because they @@ -326,7 +364,6 @@ curl -L -o mistral.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1 Windows users may need to change `./llamafile.exe` to `.\llamafile.exe` when running the above command. - ## Gotchas and troubleshooting On any platform, if your llamafile process is immediately killed, check @@ -341,13 +378,12 @@ If you use zsh and have trouble running llamafile, try saying `sh -c ./llamafile`. This is due to a bug that was fixed in zsh 5.9+. The same is the case for Python `subprocess`, old versions of Fish, etc. - #### Mac error "... cannot be opened because the developer cannot be verified" 1. Immediately launch System Settings, then go to Privacy & Security. llamafile should be listed at the bottom, with a button to Allow. -2. If not, then change your command in the Terminal to be `sudo spctl --master-disable; [llama launch command]; sudo spctl --master-enable`. This is because `--master-disable` disables _all_ checking, so you need to turn it back on after quitting llama. +2. If not, then change your command in the Terminal to be `sudo spctl --master-disable; [llama launch command]; sudo spctl --master-enable`. This is because `--master-disable` disables _all_ checking, so you need to turn it back on after quitting llama. -### Linux +### Linux On some Linux systems, you might get errors relating to `run-detectors` or WINE. This is due to `binfmt_misc` registrations. You can fix that by @@ -362,6 +398,7 @@ sudo sh -c "echo ':APE-jart:M::jartsr::/usr/bin/ape:' >/proc/sys/fs/binfmt_misc/ ``` ### Windows + As mentioned above, on Windows you may need to rename your llamafile by adding `.exe` to the filename. @@ -438,8 +475,8 @@ systems, llamafile extracts a small loader program named `ape` to `$TMPDIR/.llamafile` or `~/.ape-1.9` which is used to map your model into memory. -[1] Darwin kernel versions 15.6+ *should* be supported, but we currently - have no way of testing that. +[1] Darwin kernel versions 15.6+ _should_ be supported, but we currently +have no way of testing that. ## Supported CPUs @@ -496,7 +533,7 @@ On Linux, NVIDIA users will need to install the CUDA SDK (ideally using the shell script installer) and ROCm users need to install the HIP SDK. They're detected by looking to see if `nvcc` or `hipcc` are on the PATH. -If you have both an AMD GPU *and* an NVIDIA GPU in your machine, then +If you have both an AMD GPU _and_ an NVIDIA GPU in your machine, then you may need to qualify which one you want used, by passing either `--gpu amd` or `--gpu nvidia`. @@ -521,12 +558,12 @@ Here's an example of how to generate code for a libc function using the llama.cpp command line interface, utilizing WizardCoder-Python-13B weights: -```sh +````sh llamafile \ -m wizardcoder-python-13b-v1.0.Q8_0.gguf \ --temp 0 -r '}\n' -r '```\n' \ -e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n' -``` +```` Here's a similar example that instead utilizes Mistral-7B-Instruct weights for prose composition: @@ -680,12 +717,13 @@ commands will display that information when passing the `--help` flag. ## Running llamafile with models downloaded by third-party applications -This section answers the question *"I already have a model downloaded locally by application X, can I use it with llamafile?"*. 
The general answer is "yes, as long as those models are locally stored in GGUF format" but its implementation can be more or less hacky depending on the application. A few examples (tested on a Mac) follow. +This section answers the question _"I already have a model downloaded locally by application X, can I use it with llamafile?"_. The general answer is "yes, as long as those models are locally stored in GGUF format" but its implementation can be more or less hacky depending on the application. A few examples (tested on a Mac) follow. ### LM Studio + [LM Studio](https://lmstudio.ai/) stores downloaded models in `~/.cache/lm-studio/models`, in subdirectories with the same name of the models (following HuggingFace's `account_name/model_name` format), with the same filename you saw when you chose to download the file. - So if you have downloaded e.g. the `llama-2-7b.Q2_K.gguf` file for `TheBloke/Llama-2-7B-GGUF`, you can run llamafile as follows: +So if you have downloaded e.g. the `llama-2-7b.Q2_K.gguf` file for `TheBloke/Llama-2-7B-GGUF`, you can run llamafile as follows: ``` cd ~/.cache/lm-studio/models/TheBloke/Llama-2-7B-GGUF @@ -698,7 +736,7 @@ When you download a new model with [ollama](https://ollama.com), all its metadat The manifest maps each file related to the model (e.g. GGUF weights, license, prompt template, etc) to a sha256 digest. The digest corresponding to the element whose `mediaType` is `application/vnd.ollama.image.model` is the one referring to the model's GGUF file. -Each sha256 digest is also used as a filename in the `~/.ollama/models/blobs` directory (if you look into that directory you'll see *only* those sha256-* filenames). This means you can directly run llamafile by passing the sha256 digest as the model filename. So if e.g. the `llama3:latest` GGUF file digest is `sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29`, you can run llamafile as follows: +Each sha256 digest is also used as a filename in the `~/.ollama/models/blobs` directory (if you look into that directory you'll see _only_ those sha256-\* filenames). This means you can directly run llamafile by passing the sha256 digest as the model filename. So if e.g. the `llama3:latest` GGUF file digest is `sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29`, you can run llamafile as follows: ``` cd ~/.ollama/models/blobs @@ -847,5 +885,4 @@ should that be desired. The llamafile logo on this page was generated with the assistance of DALLĀ·E 3. - [![Star History Chart](https://api.star-history.com/svg?repos=Mozilla-Ocho/llamafile&type=Date)](https://star-history.com/#Mozilla-Ocho/llamafile&Date) diff --git a/RELEASE.md b/RELEASE.md index b7fb2634e4..c3b727fa4f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -9,24 +9,35 @@ There are a few steps in making a Llamafile release which will be detailed in th This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, enabling fine-tuning capabilities compatible with llama.cpp. 
Key features include: - **Multiple LoRA Adapter Support**: Load and apply multiple LoRA adapters simultaneously with individual scaling factors +- **Dynamic Hot-Swapping API**: Adjust LoRA adapter scales in real-time during inference without restarting the server - **Server Integration**: Full integration with the llamafile server (`--server` mode) for LoRA-enhanced inference -- **Compatible Flags**: +- **REST API Endpoints**: + - `GET /lora-adapters`: View current adapter configuration + - `POST /lora-adapters`: Update adapter scales dynamically +- **Compatible Flags**: - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with custom scaling factor - `--lora-base [FNAME]`: Optional base model for LoRA adapter (advanced use cases) - **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility +- **Thread-Safe Operations**: Hot-swapping includes proper mutex locking for concurrent access safety - **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across server lifecycle Example usage: + ```bash # Single adapter with default scale llamafile -m base_model.gguf --lora adapter.gguf --server # Multiple adapters with different scales llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server + +# Dynamic scale adjustment via API +curl -X POST http://localhost:8080/lora-adapters \ + -H "Content-Type: application/json" \ + -d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1.2}]' ``` -This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows. +This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows with real-time adaptation capabilities. The two primary artifacts of the release are the `llamafile-.zip` and the binaries for the GitHub release. @@ -36,13 +47,13 @@ Note: Step 2 and 3 are only needed if you are making a new release of the ggml-c 1. Update the version number in `version.h` 2. Build the ggml-cuda.so and ggml-rocm.so shared libraries on Linux. You need to do this for Llamafile and LocalScore. Llamafile uses TINYBLAS as a default and LocalScore uses CUBLAS as a default for CUDA. - - For Llamafile you can do this by running the script `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively. - - For LocalScore you can do this by running the script `./localscore/cuda.sh`. - - The files will be built and placed your home directory. + - For Llamafile you can do this by running the script `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively. + - For LocalScore you can do this by running the script `./localscore/cuda.sh`. + - The files will be built and placed your home directory. 3. Build the ggml-cuda.dll and ggml-rocm.dll shared libraries on Windows. You need to do this for Llamafile and LocalScore. - - You can do this by running the script `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively. - - For LocalScore you can do this by running the script `./localscore/cuda.bat`. - - The files will be built and placed in the `build/release` directory. + - You can do this by running the script `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively. + - For LocalScore you can do this by running the script `./localscore/cuda.bat`. + - The files will be built and placed in the `build/release` directory. 4. 
Build the project with `make -j8` 5. Install the built project to your /usr/local/bin directory with `sudo make install PREFIX=/usr/local` @@ -152,4 +163,4 @@ You can use the script to create the appropriately named binaries: `./llamafile/release.sh -v -s -d ` -Make sure to move the llamafile-.zip file to the as well, and you are good to release after you've tested. \ No newline at end of file +Make sure to move the llamafile-.zip file to the as well, and you are good to release after you've tested. diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp index 37e25cb4d0..17fa46e229 100644 --- a/llamafile/flags.cpp +++ b/llamafile/flags.cpp @@ -53,6 +53,7 @@ bool FLAG_tinyblas = false; bool FLAG_trace = false; bool FLAG_unsecure = false; bool FLAG_v2 = false; +bool FLAG_lora_init_without_apply = false; const char *FLAG_chat_template = ""; const char *FLAG_db = nullptr; const char *FLAG_db_startup_sql = "PRAGMA journal_mode=WAL;" @@ -66,7 +67,6 @@ const char *FLAG_prompt = nullptr; const char *FLAG_url_prefix = ""; const char *FLAG_www_root = "/zip/www"; const char *FLAG_lora = nullptr; -const char *FLAG_lora_base = nullptr; // Multiple LoRA adapters support struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS] = {0}; @@ -431,10 +431,8 @@ void llamafile_get_flags(int argc, char **argv) { continue; } - if (!strcmp(flag, "--lora-base")) { - if (i == argc) - missing("--lora-base"); - FLAG_lora_base = argv[i++]; + if (!strcmp(flag, "--lora-init-without-apply")) { + FLAG_lora_init_without_apply = true; continue; } diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h index c5fd428439..7226ec051d 100644 --- a/llamafile/llamafile.h +++ b/llamafile/llamafile.h @@ -24,6 +24,7 @@ extern bool FLAG_trace; extern bool FLAG_trap; extern bool FLAG_unsecure; extern bool FLAG_v2; +extern bool FLAG_lora_init_without_apply; extern const char *FLAG_chat_template; extern const char *FLAG_db; extern const char *FLAG_db_startup_sql; @@ -37,11 +38,11 @@ extern const char *FLAG_url_prefix; extern const char *FLAG_www_root; extern double FLAG_token_rate; extern const char *FLAG_lora; -extern const char *FLAG_lora_base; // LoRA adapter info structure to match llama.cpp struct llamafile_lora_adapter_info { const char* path; + const char* name; // Model/adapter name for identification float scale; }; diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp index e142a5a219..364348a54b 100644 --- a/llamafile/server/client.cpp +++ b/llamafile/server/client.cpp @@ -705,6 +705,8 @@ Client::dispatcher() return slotz(); if (p1 == "flagz") return flagz(); + if (p1 == "lora-adapters") + return lora_adapters(); #if 0 // TODO: implement frontend for database diff --git a/llamafile/server/client.h b/llamafile/server/client.h index b9e00da41b..74d1314e62 100644 --- a/llamafile/server/client.h +++ b/llamafile/server/client.h @@ -35,6 +35,11 @@ SlicesEqualCase(S, strlen(S), HeaderData(H), HeaderLength(H)) struct llama_model; +struct llama_lora_adapter; + +namespace jt { +struct Json; +} namespace lf { namespace server { @@ -121,6 +126,11 @@ struct Client bool slotz() __wur; bool flagz() __wur; + bool lora_adapters() __wur; + bool handle_apply_adapters(jt::Json&) __wur; + bool handle_load_adapter(jt::Json&) __wur; + bool handle_clear_adapters() __wur; + bool handle_upstream_lora_apply(jt::Json&) __wur; bool db_chat(int64_t) __wur; bool db_chats() __wur; bool db_message(int64_t) __wur; @@ -129,3 +139,15 @@ struct Client } // namespace server } // namespace lf + +// Global LoRA adapter storage - 
extern declarations (outside namespace to match definitions in prog.cpp) +#define MAX_LORA_ADAPTERS 8 +struct lora_adapter_container { + struct llama_lora_adapter* adapter; + float scale; + std::string name; // Model/adapter name for identification + bool applied; // Whether this adapter is currently applied to slots +}; + +extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +extern int g_lora_adapters_count; diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp new file mode 100644 index 0000000000..35e55198df --- /dev/null +++ b/llamafile/server/lora_adapters.cpp @@ -0,0 +1,325 @@ +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// Copyright 2024 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "client.h" +#include "llama.cpp/llama.h" +#include "llamafile/json.h" +#include "llamafile/llamafile.h" +#include "llamafile/server/log.h" +#include "llamafile/server/server.h" +#include "llamafile/server/worker.h" +#include "llamafile/server/slots.h" +#include "llamafile/server/slot.h" +#include + +using jt::Json; + +// External declarations for global LoRA adapter storage from prog.cpp (outside namespace) +// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h +extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +extern int g_lora_adapters_count; + +namespace lf { +namespace server { + +bool +Client::lora_adapters() +{ + // Support both GET and POST methods + if (msg_.method == kHttpGet) { + // GET: Return current adapter configuration (upstream llama.cpp format) + Json json; + json.setArray(); + std::vector& json_array = json.getArray(); + + for (int i = 0; i < g_lora_adapters_count; i++) { + Json adapter; + adapter.setObject(); + adapter["id"] = i; + adapter["path"] = g_lora_adapters[i].name; // Use name as path for now + adapter["scale"] = g_lora_adapters[i].scale; + json_array.push_back(adapter); + } + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, json.toString()); + + } else if (msg_.method == kHttpPost) { + // POST: Apply LoRA adapters by ID and scale (upstream llama.cpp format) + + // Validate content type + if (!HasHeader(kHttpContentType) || + !IsMimeType(HeaderData(kHttpContentType), + HeaderLength(kHttpContentType), + "application/json")) { + return send_error(400, "Content-Type must be application/json"); + } + + // Read the payload + if (!read_payload()) + return false; + + // Parse JSON payload - expecting an array of {id, scale} objects + auto [status, json] = Json::parse(std::string(payload_)); + if (status != Json::success) + return send_error(400, Json::StatusToString(status)); + if (!json.isArray()) + return send_error(400, "Request body must be an array"); + + // Apply the LoRA configuration + return handle_upstream_lora_apply(json); + + } else { + 
return send_error(405, "Method Not Allowed"); + } +} + +bool +Client::handle_apply_adapters(Json& json) +{ + // Get active slots and apply current adapters to them + if (g_lora_adapters_count == 0) { + Json response; + response["success"] = false; + response["message"] = "No adapters loaded to apply"; + + char* p = append_http_response_message(obuf_.p, 400); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Apply adapters to all slots via the server + // Note: This would require coordination with the slot management system + SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); + + Json response; + response["success"] = true; + response["message"] = "Adapters applied to active slots"; + response["adapters_applied"] = g_lora_adapters_count; + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_load_adapter(Json& json) +{ + // Load a new adapter from file + if (!json.contains("path")) { + return send_error(400, "Missing 'path' field for load operation"); + } + + std::string adapter_path = json["path"].getString(); + float scale = json.contains("scale") ? json["scale"].getNumber() : 1.0f; + + // Check if we have room for more adapters + if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { + Json response; + response["success"] = false; + response["message"] = "Maximum number of adapters already loaded"; + response["max_adapters"] = MAX_LORA_ADAPTERS; + + char* p = append_http_response_message(obuf_.p, 400); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Check if file exists + if (!std::filesystem::exists(adapter_path)) { + Json response; + response["success"] = false; + response["message"] = "Adapter file not found: " + adapter_path; + + char* p = append_http_response_message(obuf_.p, 404); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Load the adapter + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("loading LoRA adapter from %s with scale %s", adapter_path.c_str(), scale_buf); + + struct llama_lora_adapter* adapter = llama_lora_adapter_init(model_, adapter_path.c_str()); + if (!adapter) { + Json response; + response["success"] = false; + response["message"] = "Failed to load adapter from " + adapter_path; + + char* p = append_http_response_message(obuf_.p, 500); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Store the adapter + int index = g_lora_adapters_count; + g_lora_adapters[index].adapter = adapter; + g_lora_adapters[index].scale = scale; + g_lora_adapters_count++; + + SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); + + Json response; + response["success"] = true; + response["message"] = "Adapter loaded successfully"; + response["index"] = index; + response["path"] = adapter_path; + response["scale"] = scale; + response["total_adapters"] = g_lora_adapters_count; + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_clear_adapters() +{ + // Clear all loaded adapters + SLOG("clearing all %d LoRA adapter(s)", 
g_lora_adapters_count); + + for (int i = 0; i < g_lora_adapters_count; i++) { + if (g_lora_adapters[i].adapter) { + llama_lora_adapter_free(g_lora_adapters[i].adapter); + g_lora_adapters[i].adapter = nullptr; + g_lora_adapters[i].scale = 0.0f; + } + } + + int cleared_count = g_lora_adapters_count; + g_lora_adapters_count = 0; + + SLOG("cleared %d LoRA adapter(s)", cleared_count); + + Json response; + response["success"] = true; + response["message"] = "All adapters cleared"; + response["cleared_count"] = cleared_count; + response["remaining_count"] = 0; + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_upstream_lora_apply(Json& json) +{ + // Handle upstream llama.cpp LoRA API format: array of {id, scale} objects + std::vector& json_array = json.getArray(); + SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); + + // First, reset all adapter scales to 0.0 (disabled) + for (int i = 0; i < g_lora_adapters_count; i++) { + g_lora_adapters[i].applied = false; + } + + // Process each entry in the array + for (size_t i = 0; i < json_array.size(); i++) { + Json& entry = json_array[i]; + + if (!entry.isObject()) { + return send_error(400, "Each entry must be an object with 'id' and 'scale' fields"); + } + + if (!entry.contains("id") || !entry.contains("scale")) { + return send_error(400, "Each entry must have 'id' and 'scale' fields"); + } + + int id = entry["id"].getNumber(); + float scale = entry["scale"].getNumber(); + + // Validate ID range + if (id < 0 || id >= g_lora_adapters_count) { + return send_error(400, "Invalid adapter ID"); + } + + // Update the adapter configuration + g_lora_adapters[id].scale = scale; + g_lora_adapters[id].applied = (scale > 0.0f); + + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("set LoRA adapter %d ('%s') scale to %s", + id, g_lora_adapters[id].name.c_str(), scale_buf); + } + + // Re-apply LoRA adapters to all active slots with updated scales + SLOG("re-applying LoRA adapters to all active slots"); + Slots* slots = worker_->server_->slots_; + + // Lock the slots to prevent concurrent access during LoRA re-application + pthread_mutex_lock(&slots->lock_); + + for (size_t i = 0; i < slots->slots_.size(); ++i) { + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("re-applying LoRA adapters to slot #%d", slot->id_); + + // Clear existing LoRA adapters from this context + llama_lora_adapter_clear(slot->ctx_); + + // Use the same approach as slot initialization: get all adapters via the function + struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; + float scales[MAX_LORA_ADAPTERS]; + int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); + + SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); + + // Re-apply all adapters with their current scales + for (int j = 0; j < adapter_count; ++j) { + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); + SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); + if (scales[j] > 0.0f) { + if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { + SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); + } else { + SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); + } + } else { + SLOG("skipping LoRA adapter %d (scale %s <= 
0)", j, scale_buf); + } + } + } + } + + pthread_mutex_unlock(&slots->lock_); + SLOG("finished re-applying LoRA adapters to all slots"); + + // Return updated adapter configuration + Json response; + response.setArray(); + std::vector& response_array = response.getArray(); + for (int i = 0; i < g_lora_adapters_count; i++) { + Json adapter; + adapter.setObject(); + adapter["id"] = i; + adapter["path"] = g_lora_adapters[i].name; + adapter["scale"] = g_lora_adapters[i].scale; + response_array.push_back(adapter); + } + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +} // namespace server +} // namespace lf diff --git a/llamafile/server/main.1 b/llamafile/server/main.1 index e5d01adc2a..60c6d3adb7 100644 --- a/llamafile/server/main.1 +++ b/llamafile/server/main.1 @@ -29,6 +29,23 @@ recommended that you run multiple instances of llamafiler behind a reverse proxy such as NGINX or Redbean. .It Fl mm Ar FNAME , Fl Fl mmproj Ar FNAME Path of vision model weights. +.It Fl Fl lora Ar FNAME +Path to LoRA adapter weights. This flag may be repeated to load multiple +LoRA adapters. Each adapter will be applied with a default scale of 1.0. +The base model specified by +.Fl m +will be used as the foundation for all LoRA adaptations. +.It Fl Fl lora-scaled Ar FNAME Ar SCALE +Path to LoRA adapter weights with custom scaling factor. The +.Ar SCALE +parameter is a floating point number that controls the strength of the +LoRA adaptation (e.g., 0.5 for half strength, 1.5 for enhanced strength). +This flag may be repeated to load multiple scaled LoRA adapters. +.It Fl Fl lora-init-without-apply +Load LoRA adapters at startup without automatically applying them. When +this flag is used, adapters are initialized but not active until +explicitly applied via the API. This is useful for dynamic LoRA adapter +management through the HTTP endpoints. .It Fl Fl db Ar FILE Specifies path of sqlite3 database. .Pp @@ -215,6 +232,14 @@ Here's an example of how you might start this server: .Pp .Dl "llamafiler -m all-MiniLM-L6-v2.F32.gguf" .Pp +Here's how to start with a LoRA adapter: +.Pp +.Dl "llamafiler -m base_model.gguf --lora adapter.gguf" +.Pp +Here's how to use multiple LoRA adapters with custom scaling: +.Pp +.Dl "llamafiler -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.8" +.Pp Here's how to send a tokenization request: .Pp .Dl "curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world" diff --git a/llamafile/server/main.1.asc b/llamafile/server/main.1.asc index ab99e21913..b12ee187eb 100644 --- a/llamafile/server/main.1.asc +++ b/llamafile/server/main.1.asc @@ -1,269 +1,292 @@ -LLAMAFILER(1) General Commands Manual LLAMAFILER(1) - -NAME - llamafiler — fast reliable large language model server - -SYNOPSIS - llamafiler -m model.gguf [flags...] - -DESCRIPTION - llamafiler llamafiler is an HTTP server for Large Language Models - (LLMs). It includes a web GUI for both chatbot and text completion. It - can be your OpenAI API compatible embeddings / completions / chat com‐ - pletions server. It's able to more intelligently recycle context win‐ - dows across multiple slots serving multiple clients. - -OPTIONS - The following options are available: - - --version - Print version and exit. - - -h, --help - Show help message and exit. - - -m FNAME, --model FNAME - Path of GGUF model weights. Each server process is currently - limited to serving only one model. 
If you need to host multiple - models, then it's recommended that you run multiple instances - of llamafiler behind a reverse proxy such as NGINX or Redbean. - - -mm FNAME, --mmproj FNAME - Path of vision model weights. - - --db FILE - Specifies path of sqlite3 database. - - The default is ~/.llamafile/llamafile.sqlite3 - - -ngl N, --gpu-layers N, --n-gpu-layers N - Specifies number of layers to offload to GPU. - - This flag must be passed in order to use GPU on systems with - NVIDIA or AMD GPUs. If you're confident that you have enough - VRAM, then you can pass -ngl 999 to enable full offloading, - since this number is automatically downtuned to however many - number of layers the model has. If VRAM is limited, then the - --verbose flag may be passed to learn how many layers the model - has, e.g. 35, which can then be down-tuned until the out of - memory error goes away. - - On Apple Silicon systems with Metal, GPU offloading is enabled - by default. Since these GPUs use unified memory, they're - treated as having a single layer; therefore, using values - higher than 1 will be treated as 1. You can pass -ngl 0 to dis‐ - able GPU offloading and run in CPU mode on Apple Metal systems. - - -l HOSTPORT, --listen HOSTPORT - Specifies the local [HOST:]PORT on which the HTTP server should - listen. By default this is 0.0.0.0:8080 which means llamafiler - will bind to port 8080 on every locally available IPv4 network - interface. This option may currently only be specified once. - - -c TOKENS, --ctx-size TOKENS - Specifies context size. This specifies how long a completion - can get before it runs out of space. It defaults to 8k which - means 8192 tokens. Many models support a larger context size, - like 128k, but that'll need much more RAM or VRAM per slot. If - this value is larger than the trained context size of the - model, it'll be tuned down to the maximum. If this value is 0 - or negative, the maximum number of tokens will be used. - - -s COUNT, --slots COUNT - Specifies how many slots to maintain. This defaults to 1. Slots - are used by chat completions requests. When such a request - comes in, the client needs to take control of a slot. When the - completion is finished, the slot is relinquished back to the - server. HTTP clients will wait for a slot to be relinquished if - none are available. Tuning this parameter to nicely fit avail‐ - able RAM or VRAM can help you manage your server resources, and - control how much completion parallelism can happen. Please - note that --ctx-size has a strong influence on how many slots - can be created. - - --decay-delay INT - Number of seconds a context window slot needs to be inactive - before the system starts to strongly consider giving it to - other clients. The default is 300 which is five minutes. - - --decay-growth FLOAT - Sets slot decay growth factor. Context window slots are as‐ - signed in a least recently used fashion, based on the formula - age + e sup {growth * (age - delay)} - - -p TEXT, --prompt TEXT, --system-prompt TEXT - Specifies system prompt. This value is passed along to the web - frontend. - - --no-display-prompt - Hide system prompt from web user interface. - - --nologo - Hide llamafile logo icon from web ui. - - --url-prefix URLPREFIX - Specifies a URL prefix (subdirectory) under which the HTTP - server will make the API accessible, e.g. /lamafiler. Useful - when running llamafiler behind a reverse proxy such as NGINX or - Redbean. By default, this is set to / (root). 
- - --verbose - Enable logging of diagnostic information. This flag is useful - for learning more about the model and hardware. It can also be - helpful for troubleshooting errors. We currently recommend that - this flag be avoided in production since the llama.cpp logger - may disrupt thread cancelation. - - -w N, --workers N - Number of HTTP client handling threads. - - --trust CIDR - Adds a network to the trusted network list. This argument is - specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By - default, all clients are untrusted, which means they're subject - to token bucket throttling, and additional security precautions - that may cause request handling to go slightly slower. There‐ - fore this flag is important to use if you want to accurately - benchmark llamafiler, since the server will otherwise see the - benchmark as a DDOS and deprioritize its traffic accordingly. - - --ip-header STR - If this flag is passed a value, e.g. X-Forwarded-For, then any - trusted may send this header to your llamafile server to let it - know what the true effective client IPv4 address actually is. - After this happens the default security restrictions, e.g. to‐ - ken bucket, will be measured and applied against that IPv4 ad‐ - dress and its adjacent networks. - - --token-rate N - Specifies how many times per second a token is dropped in each - bucket. This setting is used to define a limitation on how - many TCP connects and HTTP messages each chunk of the IPv4 ad‐ - dress space is permitted to send to llamafiler over a sustained - period of time. The default token rate is 1, which means that, - on a long enough timeline, a class-C network will be depriori‐ - tized if it sends more than one request per second. No real - penalty actually applies though until the server runs out of - resources, e.g. HTTP request workers. - - --token-burst N - Specifies how many HTTP requests and TCP connects a given slice - of the IPv4 address space is permitted to send within a short - period of time, before token bucket restrictions kick in, and - cause the client to be deprioritized. By default, this value is - set to 100. It may be tuned to any value between 1 and 127 in‐ - clusive. - - --token-cidr N - Specifies IPv4 address space granularity of token bucket algo‐ - rithm, in network bits. By default, this value is set to 24 - which means individual IPv4 addresses are viewed as being rep‐ - resentative members of a class-C network, or in other words, - each group of 256 IPv4 addresses is lumped together. If one IP - in the group does something bad, then bad things happen to all - the other IPv4 addresses in that granule. This number may be - set to any integer between 3 and 32 inclusive. Specifying a - higher number will trade away system memory to increase network - specificity. For example, using 32 means that 4 billion indi‐ - vidual token buckets will be created. By default, a background - thread drops one token in each bucket every second, so that - could potentially be a lot of busy work. A value of three means - that everyone on the Internet who talks to your server will - have to fight over only eight token buckets in total. - - --unsecure - Disables sandboxing. By default, llamafiler puts itself in a - SECCOMP BPF sandbox, so that even if your server gets hacked in - the worst possible way (some kind of C++ memory bug) then - there's very little damage an attacker will be able to do. 
This - works by restricting system calls using Cosmopolitan Libc's im‐ - plementation of pledge() which is currently only supported on - Linux (other OSes will simply be unsecured by default). The - pledge security policy that's used by default is "stdio anet" - which means that only relatively harmless system calls like - read(), write(), and accept() are allowed once the server has - finished initializing. It's not possible for remotely executed - code to do things like launch subprocesses, read or write to - the filesystem, or initiate a new connection to a server. - - -k N, --keepalive N - Specifies the TCP keepalive interval in seconds. This value is - passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're - supported by the host operating system. If this value is - greater than 0, then the the SO_KEEPALIVE and TCP_NODELAY op‐ - tions are enabled on network sockets, if supported by the host - operating system. The default keepalive is 5. - - --http-obuf-size N - Size of HTTP output buffer size, in bytes. Default is 1048576. - - --http-ibuf-size N - Size of HTTP input buffer size, in bytes. Default is 1048576. - - --chat-template NAME - Specifies or overrides chat template for model. - - Normally the GGUF metadata tokenizer.chat_template will specify - this value for instruct models. This flag may be used to either - override the chat template, or specify one when the GGUF meta‐ - data field is absent, which effectively forces the web ui to - enable chatbot mode. - - Supported chat template names are: chatml, llama2, llama3, mis‐ - tral (alias for llama2), phi3, zephyr, monarch, gemma, gemma2 - (alias for gemma), orion, openchat, vicuna, vicuna-orca, - deepseek, command-r, chatglm3, chatglm4, minicpm, deepseek2, or - exaone3. - - It is also possible to pass the jinja2 template itself to this - argument. Since llamafiler doesn't currently support jinja2, a - heuristic will be used to guess which of the above templates - the template represents. - - --completion-mode - Forces web ui to operate in completion mode, rather than chat - mode. Normally the web ui chooses its mode based on the GGUF - metadata. Base models normally don't define tokenizer.chat_tem‐ - plate whereas instruct models do. If it's a base model, then - the web ui will automatically use completion mode only, without - needing to specify this flag. This flag is useful in cases - where a prompt template is defined by the gguf, but it is de‐ - sirable for the chat interface to be disabled. - - --db-startup-sql CODE - Specifies SQL code that should be executed whenever connecting - to the SQLite database. The default is the following code, - which enables the write-ahead log. - - PRAGMA journal_mode=WAL; - PRAGMA synchronous=NORMAL; - - --reserve-tokens N - Percent of context window to reserve for predicted tokens. When - the server runs out of context window, old chat messages will - be forgotten until this percent of the context is empty. The - default is 15%. If this is specified as a floating point num‐ - ber, e.g. 0.15, then it'll be multiplied by 100 to get the per‐ - cent. - -EXAMPLES - Here's an example of how you might start this server: - - llamafiler -m all-MiniLM-L6-v2.F32.gguf - - Here's how to send a tokenization request: - - curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world - - Here's how to send a embedding request: - - curl -v http://127.0.0.1:8080/embedding?content=hello+world - -DOCUMENTATION - Read our Markdown documentation for additional help and tutorials. 
See - llamafile/server/doc/index.md in the source repository on GitHub. - -SEE ALSO - llamafile(1), whisperfile(1) - -Mozilla Ocho November 30, 2024 LLAMAFILER(1) +LLAMAFILER(1) General Commands Manual LLAMAFILER(1) + +NNAAMMEE + llllaammaaffiilleerr – fast reliable large language model server + +SSYYNNOOPPSSIISS + llllaammaaffiilleerr --mm _m_o_d_e_l_._g_g_u_f [flags...] + +DDEESSCCRRIIPPTTIIOONN + llllaammaaffiilleerr llamafiler is an HTTP server for Large Language Models (LLMs). + It includes a web GUI for both chatbot and text completion. It can be your + OpenAI API compatible embeddings / completions / chat completions server. + It's able to more intelligently recycle context windows across multiple + slots serving multiple clients. + +OOPPTTIIOONNSS + The following options are available: + + ----vveerrssiioonn + Print version and exit. + + --hh, ----hheellpp + Show help message and exit. + + --mm _F_N_A_M_E, ----mmooddeell _F_N_A_M_E + Path of GGUF model weights. Each server process is currently + limited to serving only one model. If you need to host multiple + models, then it's recommended that you run multiple instances of + llamafiler behind a reverse proxy such as NGINX or Redbean. + + --mmmm _F_N_A_M_E, ----mmmmpprroojj _F_N_A_M_E + Path of vision model weights. + + ----lloorraa _F_N_A_M_E + Path to LoRA adapter weights. This flag may be repeated to load + multiple LoRA adapters. Each adapter will be applied with a default + scale of 1.0. The base model specified by --mm will be used as the + foundation for all LoRA adaptations. + + ----lloorraa--ssccaalleedd _F_N_A_M_E _S_C_A_L_E + Path to LoRA adapter weights with custom scaling factor. The _S_C_A_L_E + parameter is a floating point number that controls the strength of + the LoRA adaptation (e.g., 0.5 for half strength, 1.5 for enhanced + strength). This flag may be repeated to load multiple scaled LoRA + adapters. + + ----lloorraa--iinniitt--wwiitthhoouutt--aappppllyy + Load LoRA adapters at startup without automatically applying them. + When this flag is used, adapters are initialized but not active + until explicitly applied via the API. This is useful for dynamic + LoRA adapter management through the HTTP endpoints. + + ----ddbb _F_I_L_E + Specifies path of sqlite3 database. + + The default is _~_/_._l_l_a_m_a_f_i_l_e_/_l_l_a_m_a_f_i_l_e_._s_q_l_i_t_e_3 + + --nnggll _N, ----ggppuu--llaayyeerrss _N, ----nn--ggppuu--llaayyeerrss _N + Specifies number of layers to offload to GPU. + + This flag must be passed in order to use GPU on systems with NVIDIA + or AMD GPUs. If you're confident that you have enough VRAM, then + you can pass --nnggll _9_9_9 to enable full offloading, since this number + is automatically downtuned to however many number of layers the + model has. If VRAM is limited, then the ----vveerrbboossee flag may be + passed to learn how many layers the model has, e.g. 35, which can + then be down-tuned until the out of memory error goes away. + + On Apple Silicon systems with Metal, GPU offloading is enabled by + default. Since these GPUs use unified memory, they're treated as + having a single layer; therefore, using values higher than 1 will + be treated as 1. You can pass --nnggll _0 to disable GPU offloading and + run in CPU mode on Apple Metal systems. + + --ll _H_O_S_T_P_O_R_T, ----lliisstteenn _H_O_S_T_P_O_R_T + Specifies the local [HOST:]PORT on which the HTTP server should + listen. 
By default this is 0.0.0.0:8080 which means llamafiler + will bind to port 8080 on every locally available IPv4 network + interface. This option may currently only be specified once. + + --cc _T_O_K_E_N_S, ----ccttxx--ssiizzee _T_O_K_E_N_S + Specifies context size. This specifies how long a completion can + get before it runs out of space. It defaults to 8k which means 8192 + tokens. Many models support a larger context size, like 128k, but + that'll need much more RAM or VRAM per slot. If this value is + larger than the trained context size of the model, it'll be tuned + down to the maximum. If this value is 0 or negative, the maximum + number of tokens will be used. + + --ss _C_O_U_N_T, ----sslloottss _C_O_U_N_T + Specifies how many slots to maintain. This defaults to 1. Slots are + used by chat completions requests. When such a request comes in, + the client needs to take control of a slot. When the completion is + finished, the slot is relinquished back to the server. HTTP clients + will wait for a slot to be relinquished if none are available. + Tuning this parameter to nicely fit available RAM or VRAM can help + you manage your server resources, and control how much completion + parallelism can happen. Please note that ----ccttxx--ssiizzee has a strong + influence on how many slots can be created. + + ----ddeeccaayy--ddeellaayy _I_N_T + Number of seconds a context window slot needs to be inactive before + the system starts to strongly consider giving it to other clients. + The default is 300 which is five minutes. + + ----ddeeccaayy--ggrroowwtthh _F_L_O_A_T + Sets slot decay growth factor. Context window slots are assigned in + a least recently used fashion, based on the formula _a_g_e + _e^(_g_r_o_w_t_h + * (_a_g_e āˆ’ _d_e_l_a_y)) + + --pp _T_E_X_T, ----pprroommpptt _T_E_X_T, ----ssyysstteemm--pprroommpptt _T_E_X_T + Specifies system prompt. This value is passed along to the web + frontend. + + ----nnoo--ddiissppllaayy--pprroommpptt + Hide system prompt from web user interface. + + ----nnoollooggoo + Hide llamafile logo icon from web ui. + + ----uurrll--pprreeffiixx _U_R_L_P_R_E_F_I_X + Specifies a URL prefix (subdirectory) under which the HTTP server + will make the API accessible, e.g. /lamafiler. Useful when running + llamafiler behind a reverse proxy such as NGINX or Redbean. By + default, this is set to / (root). + + ----vveerrbboossee + Enable logging of diagnostic information. This flag is useful for + learning more about the model and hardware. It can also be helpful + for troubleshooting errors. We currently recommend that this flag + be avoided in production since the llama.cpp logger may disrupt + thread cancelation. + + --ww _N, ----wwoorrkkeerrss _N + Number of HTTP client handling threads. + + ----ttrruusstt _C_I_D_R + Adds a network to the trusted network list. This argument is + specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By + default, all clients are untrusted, which means they're subject to + token bucket throttling, and additional security precautions that + may cause request handling to go slightly slower. Therefore this + flag is important to use if you want to accurately benchmark + llamafiler, since the server will otherwise see the benchmark as a + DDOS and deprioritize its traffic accordingly. + + ----iipp--hheeaaddeerr _S_T_R + If this flag is passed a value, e.g. X-Forwarded-For, then any + trusted may send this header to your llamafile server to let it + know what the true effective client IPv4 address actually is. 
After + this happens the default security restrictions, e.g. token bucket, + will be measured and applied against that IPv4 address and its + adjacent networks. + + ----ttookkeenn--rraattee _N + Specifies how many times per second a token is dropped in each + bucket. This setting is used to define a limitation on how many + TCP connects and HTTP messages each chunk of the IPv4 address space + is permitted to send to llamafiler over a sustained period of time. + The default token rate is 1, which means that, on a long enough + timeline, a class-C network will be deprioritized if it sends more + than one request per second. No real penalty actually applies + though until the server runs out of resources, e.g. HTTP request + workers. + + ----ttookkeenn--bbuurrsstt _N + Specifies how many HTTP requests and TCP connects a given slice of + the IPv4 address space is permitted to send within a short period + of time, before token bucket restrictions kick in, and cause the + client to be deprioritized. By default, this value is set to 100. + It may be tuned to any value between 1 and 127 inclusive. + + ----ttookkeenn--cciiddrr _N + Specifies IPv4 address space granularity of token bucket algorithm, + in network bits. By default, this value is set to 24 which means + individual IPv4 addresses are viewed as being representative + members of a class-C network, or in other words, each group of 256 + IPv4 addresses is lumped together. If one IP in the group does + something bad, then bad things happen to all the other IPv4 + addresses in that granule. This number may be set to any integer + between 3 and 32 inclusive. Specifying a higher number will trade + away system memory to increase network specificity. For example, + using 32 means that 4 billion individual token buckets will be + created. By default, a background thread drops one token in each + bucket every second, so that could potentially be a lot of busy + work. A value of three means that everyone on the Internet who + talks to your server will have to fight over only eight token + buckets in total. + + ----uunnsseeccuurree + Disables sandboxing. By default, llamafiler puts itself in a + SECCOMP BPF sandbox, so that even if your server gets hacked in the + worst possible way (some kind of C++ memory bug) then there's very + little damage an attacker will be able to do. This works by + restricting system calls using Cosmopolitan Libc's implementation + of pledge() which is currently only supported on Linux (other OSes + will simply be unsecured by default). The pledge security policy + that's used by default is "stdio anet" which means that only + relatively harmless system calls like read(), write(), and accept() + are allowed once the server has finished initializing. It's not + possible for remotely executed code to do things like launch + subprocesses, read or write to the filesystem, or initiate a new + connection to a server. + + --kk _N, ----kkeeeeppaalliivvee _N + Specifies the TCP keepalive interval in seconds. This value is + passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're + supported by the host operating system. If this value is greater + than 0, then the the SO_KEEPALIVE and TCP_NODELAY options are + enabled on network sockets, if supported by the host operating + system. The default keepalive is 5. + + ----hhttttpp--oobbuuff--ssiizzee _N + Size of HTTP output buffer size, in bytes. Default is 1048576. + + ----hhttttpp--iibbuuff--ssiizzee _N + Size of HTTP input buffer size, in bytes. Default is 1048576. 
+ + ----cchhaatt--tteemmppllaattee _N_A_M_E + Specifies or overrides chat template for model. + + Normally the GGUF metadata tokenizer.chat_template will specify + this value for instruct models. This flag may be used to either + override the chat template, or specify one when the GGUF metadata + field is absent, which effectively forces the web ui to enable + chatbot mode. + + Supported chat template names are: chatml, llama2, llama3, mistral + (alias for llama2), phi3, zephyr, monarch, gemma, gemma2 (alias for + gemma), orion, openchat, vicuna, vicuna-orca, deepseek, command-r, + chatglm3, chatglm4, minicpm, deepseek2, or exaone3. + + It is also possible to pass the jinja2 template itself to this + argument. Since llamafiler doesn't currently support jinja2, a + heuristic will be used to guess which of the above templates the + template represents. + + ----ccoommpplleettiioonn--mmooddee + Forces web ui to operate in completion mode, rather than chat mode. + Normally the web ui chooses its mode based on the GGUF metadata. + Base models normally don't define tokenizer.chat_template whereas + instruct models do. If it's a base model, then the web ui will + automatically use completion mode only, without needing to specify + this flag. This flag is useful in cases where a prompt template is + defined by the gguf, but it is desirable for the chat interface to + be disabled. + + ----ddbb--ssttaarrttuupp--ssqqll _C_O_D_E + Specifies SQL code that should be executed whenever connecting to + the SQLite database. The default is the following code, which + enables the write-ahead log. + + PRAGMA journal_mode=WAL; + PRAGMA synchronous=NORMAL; + + ----rreesseerrvvee--ttookkeennss _N + Percent of context window to reserve for predicted tokens. When the + server runs out of context window, old chat messages will be + forgotten until this percent of the context is empty. The default + is 15%. If this is specified as a floating point number, e.g. 0.15, + then it'll be multiplied by 100 to get the percent. + +EEXXAAMMPPLLEESS + Here's an example of how you might start this server: + + llamafiler -m all-MiniLM-L6-v2.F32.gguf + + Here's how to start with a LoRA adapter: + + llamafiler -m base_model.gguf --lora adapter.gguf + + Here's how to use multiple LoRA adapters with custom scaling: + + llamafiler -m base_model.gguf --lora adapter1.gguf --lora-scaled + adapter2.gguf 0.8 + + Here's how to send a tokenization request: + + curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world + + Here's how to send a embedding request: + + curl -v http://127.0.0.1:8080/embedding?content=hello+world + +DDOOCCUUMMEENNTTAATTIIOONN + Read our Markdown documentation for additional help and tutorials. See + llamafile/server/doc/index.md in the source repository on GitHub. 
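Building on the EXAMPLES above, the new handlers in llamafile/server/lora_adapters.cpp also expose runtime (hot-swap) control over adapter scales in the upstream llama.cpp format, an array of {id, scale} objects. The following is a rough usage sketch, assuming the handlers are routed at /lora-adapters (as the "use /lora-adapters API to apply" startup log message suggests) and that the server listens on the default port 8080; the exact route registration is not shown in this excerpt.

```bash
# List adapters loaded at startup; the server replies with an array
# of {id, path, scale} objects built from the global adapter table.
curl -s http://127.0.0.1:8080/lora-adapters

# Hot-swap: set adapter 0 to half strength and zero out adapter 1.
# The handler validates each id against the loaded adapters, updates
# the scales, re-applies adapters to every active slot, and returns
# the updated configuration as JSON.
curl -s -X POST http://127.0.0.1:8080/lora-adapters \
  -H "Content-Type: application/json" \
  -d '[{"id": 0, "scale": 0.5}, {"id": 1, "scale": 0.0}]'
```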
+ +SSEEEE AALLSSOO + llamafile(1), whisperfile(1) + +Mozilla Ocho November 30, 2024 Mozilla Ocho diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp index 6377cabc78..a21c809614 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -31,13 +31,17 @@ // Global LoRA adapter storage for multiple adapters #define MAX_LORA_ADAPTERS 8 +#include struct lora_adapter_container { struct llama_lora_adapter* adapter; float scale; + std::string name; // Model/adapter name for identification + bool applied; // Whether this adapter is currently applied to slots }; -static struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {0}; -static int g_lora_adapters_count = 0; +// Make these externally accessible for HTTP endpoint +struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; +int g_lora_adapters_count = 0; // Function to get the first global LoRA adapter for backward compatibility extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { @@ -118,16 +122,35 @@ main(int argc, char* argv[]) // load LoRA adapters if specified if (FLAG_lora_adapters_count > 0) { - SLOG("loading %d LoRA adapter(s)", FLAG_lora_adapters_count); + const char* apply_mode = FLAG_lora_init_without_apply ? "without applying" : "and applying"; + SLOG("loading %d LoRA adapter(s) %s", FLAG_lora_adapters_count, apply_mode); + for (int i = 0; i < FLAG_lora_adapters_count; i++) { char scale_buf[32]; snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); - SLOG("loading LoRA adapter %d from %s with scale %s", i + 1, - FLAG_lora_adapters[i].path, scale_buf); - g_lora_adapters[i].adapter = llama_lora_adapter_init(model, FLAG_lora_adapters[i].path); + + // Generate model name from filename + const char* path = FLAG_lora_adapters[i].path; + const char* filename = strrchr(path, '/'); + filename = filename ? 
filename + 1 : path; + + // Remove file extension for cleaner name + std::string model_name(filename); + size_t dot_pos = model_name.find_last_of('.'); + if (dot_pos != std::string::npos) { + model_name = model_name.substr(0, dot_pos); + } + + SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, + model_name.c_str(), path, scale_buf); + + g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; + g_lora_adapters[i].name = model_name; + g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set + if (!g_lora_adapters[i].adapter) { - fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, FLAG_lora_adapters[i].path); + fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); // Cleanup previously loaded adapters for (int j = 0; j < i; j++) { if (g_lora_adapters[j].adapter) { @@ -139,7 +162,12 @@ main(int argc, char* argv[]) } g_lora_adapters_count++; } - SLOG("all LoRA adapters loaded successfully"); + + if (FLAG_lora_init_without_apply) { + SLOG("all LoRA adapters loaded successfully but not applied (use /lora-adapters API to apply)"); + } else { + SLOG("all LoRA adapters loaded and applied successfully"); + } } // create slots From 78a3b7632201995565340a91580039e9d4b86e6b Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:31:42 -0400 Subject: [PATCH 4/9] hk --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index c3b727fa4f..9a1519948c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -17,7 +17,7 @@ This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, - **Compatible Flags**: - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with custom scaling factor - - `--lora-base [FNAME]`: Optional base model for LoRA adapter (advanced use cases) + - `--lora-init-without-apply`: Load LoRA adapters without applying (lora hot-swapping) - **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility - **Thread-Safe Operations**: Hot-swapping includes proper mutex locking for concurrent access safety - **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across server lifecycle From 95f6887e3f62d329bb9b8254f23e014b76205cfd Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:40:23 -0400 Subject: [PATCH 5/9] moar hk --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c6cfb127f..2b56dcf7a0 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. - `--lora [FNAME]`: Apply a LoRA adapter from the specified file (default scale: 1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor -- `--lora-base [FNAME]`: Optional base model for LoRA adapter (usually not needed) +- `--lora-init-without-apply [FNAME]`: Load LoRA adapters without applying (lora hot-swapping) ### Dynamic LoRA Adapter Management (Hot-Swapping) From 069024d862e46cd01f9df8c2b3b76523a5f19fc1 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:41:08 -0400 Subject: [PATCH 6/9] hk... 
sorry --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b56dcf7a0..259b462ecb 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. - `--lora [FNAME]`: Apply a LoRA adapter from the specified file (default scale: 1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor -- `--lora-init-without-apply [FNAME]`: Load LoRA adapters without applying (lora hot-swapping) +- `--lora-init-without-apply`: Load LoRA adapters without applying (lora hot-swapping) ### Dynamic LoRA Adapter Management (Hot-Swapping) From 09632b7be9456993982dc75457917fab1efe97b9 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Wed, 13 Aug 2025 14:51:18 -0400 Subject: [PATCH 7/9] fix: add intelligent slot refresh for LoRA adapter updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Removes redundant code by deferring to llama.cpp for lora structures - Add Slot::mark_for_refresh() to flag slots for context refresh after LoRA changes - Integrate needs_refresh_ flag and logic into Slot class and prefill() method - Update LoRA adapter API handlers to call mark_for_refresh() after applying or updating adapters - Ensure system prompts and context are preserved using slot’s intelligent prefill mechanism - Remove naive KV cache clearing logic in favor of slot-managed refresh - Improves runtime LoRA scale update reliability --- .vscode/c_cpp_properties.json | 60 ++++ .vscode/launch.json | 43 +++ .vscode/settings.json | 38 +++ .vscode/tasks.json | 175 ++++++++++ diff.txt | 512 +++++++++++++++++++++++++++++ llamafile/server/client.h | 12 +- llamafile/server/lora_adapters.cpp | 139 ++++---- llamafile/server/prog.cpp | 57 ++-- llamafile/server/slot.cpp | 49 +-- llamafile/server/slot.h | 3 +- 10 files changed, 944 insertions(+), 144 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 diff.txt diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000000..d9cbe110f5 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,60 @@ +{ + // Simplified IntelliSense config without compile_commands.json + // Uses broad include/define coverage; may be less exact per-file. + "env": { + "cosmoccPath": "${workspaceFolder}/.cosmocc" + }, + "configurations": [ + { + "name": "Cosmopolitan", + "compilerPath": "${cosmoccPath}/3.9.7/bin/cosmocc", + "intelliSenseMode": "clang-x64", + "cppStandard": "gnu++23", + "cStandard": "gnu11", + "defines": [ + "GGML_MULTIPLATFORM", + "LLAMAFILE_DEBUG", + "_LIBCPP_HAS_NO_XLOCALE", + "_LIBCPP_HAS_MUSL_LIBC" + ], + "compilerArgs": [ + "-j8" + // "-std=gnu++23", + // "-Wall", + // "-Wextra", + // Force C++ mode and reassert libc++ include even if driver fallback fails + // "-nostdinc++", + // "-I${cosmoccPath}/3.9.7/include/c++/v1" + ], + "includePath": [ + "${workspaceFolder}", + "${workspaceFolder}/llamafile", + "${workspaceFolder}/llama.cpp", + "${workspaceFolder}/whisper.cpp", + "${workspaceFolder}/stable-diffusion.cpp", + "${workspaceFolder}/localscore", + "${workspaceFolder}/third_party", + "${cosmoccPath}/include", + "${cosmoccPath}/3.9.7/include", + "${cosmoccPath}/3.9.7/include/c++/v1" + ], + "forcedInclude": [ + // Normalizes some Cosmopolitan integral typedefs early. 
+ "${cosmoccPath}/include/libc/integral/normalize.inc" + ], + "browse": { + "path": [ + "${workspaceFolder}", + "${workspaceFolder}/llamafile", + "${workspaceFolder}/llama.cpp", + "${workspaceFolder}/whisper.cpp", + "${workspaceFolder}/stable-diffusion.cpp", + "${workspaceFolder}/localscore", + "${workspaceFolder}/third_party" + ], + "limitSymbolsToIncludedHeaders": false + } + } + ], + "version": 4 +} diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000..c50ac4842c --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,43 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Server (llamafiler)", + "type": "cppdbg", + "request": "launch", + "MIMode": "lldb", + "program": "${workspaceFolder}/o/opt/llamafile/server/main", + "args": [], + "preLaunchTask": "Build (fast)", + "stopAtEntry": false + }, + { + "name": "Debug Llama CLI (llamafile)", + "type": "cppdbg", + "request": "launch", + "MIMode": "lldb", + "program": "${workspaceFolder}/o/opt/llama.cpp/main/main", + "args": [], + "preLaunchTask": "Build (fast)", + "stopAtEntry": false + }, + { + "name": "Debug Quantize Tool", + "type": "cppdbg", + "request": "launch", + "MIMode": "lldb", + "program": "${workspaceFolder}/o/opt/llama.cpp/quantize/quantize", + "args": [], + "preLaunchTask": "Build (fast)", + "stopAtEntry": false + }, + { + "name": "Attach to PID", + "type": "cppdbg", + "request": "attach", + "MIMode": "lldb", + "processId": "${command:pickProcess}", + "program": "${workspaceFolder}/o/opt/llamafile/server/main" + } + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..0948ad7be5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,38 @@ +{ + // Core C/C++ extension behavior + "C_Cpp.default.configurationProvider": "ms-vscode.cpptools", // use c_cpp_properties.json + "C_Cpp.intelliSenseEngine": "default", + "C_Cpp.errorSquiggles": "disabled", + "C_Cpp.autoAddFileAssociations": false, + "C_Cpp.default.browse.limitSymbolsToIncludedHeaders": false, + "C_Cpp.workspaceParsingPriority": "highest", + "C_Cpp.loggingLevel": "Warning", + // Speed: avoid re-indexing node_modules or build/artifacts if present + "files.watcherExclude": { + "**/o/**": true, + "**/.git/**": true, + "**/.cosmocc/**": true + }, + "search.exclude": { + "o": true, + "**/o/**": true + }, + // Formatting / style (adjust to project preference) + "editor.formatOnSave": false, + "C_Cpp.formatting": "disabled", + // Diagnostics tuning: treat missing headers as warnings (since we simplified config) + "C_Cpp.codeAnalysis.clangTidy.enabled": false, + "C_Cpp.intelliSenseCacheSize": 512, + // Optional UI niceties + "C_Cpp.enhancedColorization": "enabled", + "C_Cpp.dimInactiveRegions": true, + // File associations + "files.associations": { + "*.cpp": "cpp", + "*.c": "c", + "*.h": "c", + "*.hpp": "cpp", + "*.mk": "makefile", + "BUILD.mk": "makefile" + } +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000000..894f7a924f --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,175 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Build (fast)", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "problemMatcher": [ + "$gcc" + ], + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Rebuild (clean + all)", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + 
"clean", + "&&", + "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "-j8" + ], + "problemMatcher": [ + "$gcc" + ], + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Build vmathf_test", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8", + "o/opt/llamafile/vmathf_test" + ], + "problemMatcher": [ + "$gcc" + ], + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Run vmathf_test", + "type": "shell", + "command": "o/opt/llamafile/vmathf_test", + "dependsOn": "Build vmathf_test", + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Clean", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "clean" + ], + "group": "build", + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Watch (incremental)", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "isBackground": true, + "problemMatcher": [ + { + "owner": "cpp", + "pattern": { + "regexp": "^(.*):(\\d+):(\\d+): (warning|error): (.*)$", + "file": 1, + "line": 2, + "column": 3, + "severity": 4, + "message": 5 + }, + "background": { + "activeOnStart": true, + "beginsPattern": "^.*Building.*$", + "endsPattern": "^.*(error|warning|linking).*$" + } + } + ], + "presentation": { + "reveal": "never", + "panel": "dedicated" + } + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + } + ] +} \ No newline at end of file diff --git a/diff.txt b/diff.txt new file mode 100644 index 0000000000..c6be66ce7e --- /dev/null +++ b/diff.txt @@ -0,0 +1,512 @@ +diff --git a/llamafile/server/client.h b/llamafile/server/client.h +index 74d1314e6..f82eed422 100644 +--- a/llamafile/server/client.h ++++ b/llamafile/server/client.h +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include "llama.cpp/common.h" + + #define HasHeader(H) (!!msg_.headers[H].a) + #define HeaderData(H) (ibuf_.p + msg_.headers[H].a) +@@ -141,13 +142,4 @@ struct Client + } // namespace lf + + // Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp) +-#define MAX_LORA_ADAPTERS 8 +-struct lora_adapter_container { +- struct llama_lora_adapter* adapter; +- float scale; +- std::string name; 
// Model/adapter name for identification +- bool applied; // Whether this adapter is currently applied to slots +-}; +- +-extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +-extern int g_lora_adapters_count; ++// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead +diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp +index 35e55198d..c048a2a4e 100644 +--- a/llamafile/server/lora_adapters.cpp ++++ b/llamafile/server/lora_adapters.cpp +@@ -17,6 +17,7 @@ + + #include "client.h" + #include "llama.cpp/llama.h" ++#include "llama.cpp/common.h" + #include "llamafile/json.h" + #include "llamafile/llamafile.h" + #include "llamafile/server/log.h" +@@ -29,9 +30,7 @@ + using jt::Json; + + // External declarations for global LoRA adapter storage from prog.cpp (outside namespace) +-// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h +-extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +-extern int g_lora_adapters_count; ++extern std::vector g_lora_adapters; + + namespace lf { + namespace server { +@@ -46,12 +45,12 @@ Client::lora_adapters() + json.setArray(); + std::vector& json_array = json.getArray(); + +- for (int i = 0; i < g_lora_adapters_count; i++) { ++ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { + Json adapter; + adapter.setObject(); +- adapter["id"] = i; +- adapter["path"] = g_lora_adapters[i].name; // Use name as path for now +- adapter["scale"] = g_lora_adapters[i].scale; ++ adapter["id"] = (int)i; ++ adapter["path"] = ::g_lora_adapters[i].path; ++ adapter["scale"] = ::g_lora_adapters[i].scale; + json_array.push_back(adapter); + } + +@@ -93,7 +92,7 @@ bool + Client::handle_apply_adapters(Json& json) + { + // Get active slots and apply current adapters to them +- if (g_lora_adapters_count == 0) { ++ if (::g_lora_adapters.empty()) { + Json response; + response["success"] = false; + response["message"] = "No adapters loaded to apply"; +@@ -103,14 +102,34 @@ Client::handle_apply_adapters(Json& json) + return send_response(obuf_.p, p, response.toString()); + } + +- // Apply adapters to all slots via the server +- // Note: This would require coordination with the slot management system +- SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); ++ // Apply adapters to all slots via the server using llama.cpp unified function ++ SLOG("applying %d LoRA adapter(s) to all active slots using llama.cpp unified function", ++ (int)::g_lora_adapters.size()); ++ ++ // Apply to all active slots ++ Slots* slots = worker_->server_->slots_; ++ pthread_mutex_lock(&slots->lock_); ++ ++ for (size_t i = 0; i < slots->slots_.size(); ++i) { ++ Slot* slot = slots->slots_[i].get(); ++ if (slot->ctx_) { ++ SLOG("applying LoRA adapters to slot #%d", slot->id_); ++ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); ++ ++ // CRITICAL: Mark slot for refresh to handle LoRA changes properly ++ // The slot's prefill() mechanism will intelligently preserve system prompts ++ // and only re-evaluate what's necessary when the next request comes in ++ slot->mark_for_refresh(); ++ SLOG("marked slot #%d for refresh after LoRA application", slot->id_); ++ } ++ } ++ ++ pthread_mutex_unlock(&slots->lock_); + + Json response; + response["success"] = true; + response["message"] = "Adapters applied to active slots"; +- response["adapters_applied"] = g_lora_adapters_count; ++ response["adapters_applied"] = (int)::g_lora_adapters.size(); + + char* p = 
append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); +@@ -128,18 +147,6 @@ Client::handle_load_adapter(Json& json) + std::string adapter_path = json["path"].getString(); + float scale = json.contains("scale") ? json["scale"].getNumber() : 1.0f; + +- // Check if we have room for more adapters +- if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { +- Json response; +- response["success"] = false; +- response["message"] = "Maximum number of adapters already loaded"; +- response["max_adapters"] = MAX_LORA_ADAPTERS; +- +- char* p = append_http_response_message(obuf_.p, 400); +- p = stpcpy(p, "Content-Type: application/json\r\n"); +- return send_response(obuf_.p, p, response.toString()); +- } +- + // Check if file exists + if (!std::filesystem::exists(adapter_path)) { + Json response; +@@ -167,11 +174,15 @@ Client::handle_load_adapter(Json& json) + return send_response(obuf_.p, p, response.toString()); + } + ++ // Create the adapter container ++ llama_lora_adapter_container adapter_container; ++ adapter_container.path = adapter_path; ++ adapter_container.scale = scale; ++ adapter_container.adapter = adapter; ++ + // Store the adapter +- int index = g_lora_adapters_count; +- g_lora_adapters[index].adapter = adapter; +- g_lora_adapters[index].scale = scale; +- g_lora_adapters_count++; ++ int index = (int)::g_lora_adapters.size(); ++ ::g_lora_adapters.push_back(adapter_container); + + SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); + +@@ -181,7 +192,7 @@ Client::handle_load_adapter(Json& json) + response["index"] = index; + response["path"] = adapter_path; + response["scale"] = scale; +- response["total_adapters"] = g_lora_adapters_count; ++ response["total_adapters"] = (int)::g_lora_adapters.size(); + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); +@@ -192,18 +203,16 @@ bool + Client::handle_clear_adapters() + { + // Clear all loaded adapters +- SLOG("clearing all %d LoRA adapter(s)", g_lora_adapters_count); ++ SLOG("clearing all %d LoRA adapter(s)", (int)::g_lora_adapters.size()); + +- for (int i = 0; i < g_lora_adapters_count; i++) { +- if (g_lora_adapters[i].adapter) { +- llama_lora_adapter_free(g_lora_adapters[i].adapter); +- g_lora_adapters[i].adapter = nullptr; +- g_lora_adapters[i].scale = 0.0f; ++ int cleared_count = (int)::g_lora_adapters.size(); ++ for (auto& la : ::g_lora_adapters) { ++ if (la.adapter) { ++ llama_lora_adapter_free(la.adapter); + } + } + +- int cleared_count = g_lora_adapters_count; +- g_lora_adapters_count = 0; ++ ::g_lora_adapters.clear(); + + SLOG("cleared %d LoRA adapter(s)", cleared_count); + +@@ -225,11 +234,6 @@ Client::handle_upstream_lora_apply(Json& json) + std::vector& json_array = json.getArray(); + SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); + +- // First, reset all adapter scales to 0.0 (disabled) +- for (int i = 0; i < g_lora_adapters_count; i++) { +- g_lora_adapters[i].applied = false; +- } +- + // Process each entry in the array + for (size_t i = 0; i < json_array.size(); i++) { + Json& entry = json_array[i]; +@@ -246,22 +250,21 @@ Client::handle_upstream_lora_apply(Json& json) + float scale = entry["scale"].getNumber(); + + // Validate ID range +- if (id < 0 || id >= g_lora_adapters_count) { ++ if (id < 0 || id >= (int)::g_lora_adapters.size()) { + return send_error(400, "Invalid adapter ID"); + } + + // Update the adapter configuration +- g_lora_adapters[id].scale = scale; +- 
g_lora_adapters[id].applied = (scale > 0.0f); ++ ::g_lora_adapters[id].scale = scale; + + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("set LoRA adapter %d ('%s') scale to %s", +- id, g_lora_adapters[id].name.c_str(), scale_buf); ++ id, ::g_lora_adapters[id].path.c_str(), scale_buf); + } + +- // Re-apply LoRA adapters to all active slots with updated scales +- SLOG("re-applying LoRA adapters to all active slots"); ++ // Re-apply LoRA adapters to all active slots with updated scales using llama.cpp unified function ++ SLOG("re-applying LoRA adapters to all active slots using llama.cpp unified function"); + Slots* slots = worker_->server_->slots_; + + // Lock the slots to prevent concurrent access during LoRA re-application +@@ -271,32 +274,13 @@ Client::handle_upstream_lora_apply(Json& json) + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("re-applying LoRA adapters to slot #%d", slot->id_); ++ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); + +- // Clear existing LoRA adapters from this context +- llama_lora_adapter_clear(slot->ctx_); +- +- // Use the same approach as slot initialization: get all adapters via the function +- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; +- float scales[MAX_LORA_ADAPTERS]; +- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); +- +- SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); +- +- // Re-apply all adapters with their current scales +- for (int j = 0; j < adapter_count; ++j) { +- char scale_buf[32]; +- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); +- SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); +- if (scales[j] > 0.0f) { +- if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { +- SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); +- } else { +- SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); +- } +- } else { +- SLOG("skipping LoRA adapter %d (scale %s <= 0)", j, scale_buf); +- } +- } ++ // CRITICAL: Mark slot for refresh to handle LoRA changes properly ++ // The slot's prefill() mechanism will intelligently preserve system prompts ++ // and only re-evaluate what's necessary when the next request comes in ++ slot->mark_for_refresh(); ++ SLOG("marked slot #%d for refresh after LoRA update", slot->id_); + } + } + +@@ -307,12 +291,13 @@ Client::handle_upstream_lora_apply(Json& json) + Json response; + response.setArray(); + std::vector& response_array = response.getArray(); +- for (int i = 0; i < g_lora_adapters_count; i++) { ++ ++ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { + Json adapter; + adapter.setObject(); +- adapter["id"] = i; +- adapter["path"] = g_lora_adapters[i].name; +- adapter["scale"] = g_lora_adapters[i].scale; ++ adapter["id"] = (int)i; ++ adapter["path"] = ::g_lora_adapters[i].path; ++ adapter["scale"] = ::g_lora_adapters[i].scale; + response_array.push_back(adapter); + } + +diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp +index a21c80961..89ae3f12d 100644 +--- a/llamafile/server/prog.cpp ++++ b/llamafile/server/prog.cpp +@@ -26,31 +26,21 @@ + #include "llamafile/server/tokenbucket.h" + #include "llamafile/server/utils.h" + #include "llamafile/version.h" ++#include "llama.cpp/common.h" + #include + #include + +-// Global LoRA adapter storage for multiple adapters +-#define MAX_LORA_ADAPTERS 8 +-#include +-struct lora_adapter_container { +- 
struct llama_lora_adapter* adapter; +- float scale; +- std::string name; // Model/adapter name for identification +- bool applied; // Whether this adapter is currently applied to slots +-}; +- +-// Make these externally accessible for HTTP endpoint +-struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; +-int g_lora_adapters_count = 0; ++// Global LoRA adapter storage using llama.cpp structures ++std::vector g_lora_adapters; + + // Function to get the first global LoRA adapter for backward compatibility + extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { +- return g_lora_adapters_count > 0 ? g_lora_adapters[0].adapter : nullptr; ++ return g_lora_adapters.empty() ? nullptr : g_lora_adapters[0].adapter; + } + + // Function to get all LoRA adapters and their count + extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { +- int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; ++ int count = std::min((int)g_lora_adapters.size(), max_adapters); + for (int i = 0; i < count; i++) { + adapters[i] = g_lora_adapters[i].adapter; + scales[i] = g_lora_adapters[i].scale; +@@ -129,38 +119,31 @@ main(int argc, char* argv[]) + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); + +- // Generate model name from filename ++ // Generate model name from filename for identification + const char* path = FLAG_lora_adapters[i].path; + const char* filename = strrchr(path, '/'); + filename = filename ? filename + 1 : path; + +- // Remove file extension for cleaner name +- std::string model_name(filename); +- size_t dot_pos = model_name.find_last_of('.'); +- if (dot_pos != std::string::npos) { +- model_name = model_name.substr(0, dot_pos); +- } +- + SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, +- model_name.c_str(), path, scale_buf); +- +- g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); +- g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; +- g_lora_adapters[i].name = model_name; +- g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set ++ filename, path, scale_buf); ++ ++ llama_lora_adapter_container adapter_container; ++ adapter_container.path = std::string(path); ++ adapter_container.scale = FLAG_lora_adapters[i].scale; ++ adapter_container.adapter = llama_lora_adapter_init(model, path); + +- if (!g_lora_adapters[i].adapter) { ++ if (!adapter_container.adapter) { + fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); + // Cleanup previously loaded adapters +- for (int j = 0; j < i; j++) { +- if (g_lora_adapters[j].adapter) { +- llama_lora_adapter_free(g_lora_adapters[j].adapter); ++ for (auto& la : g_lora_adapters) { ++ if (la.adapter) { ++ llama_lora_adapter_free(la.adapter); + } + } + llama_free_model(model); + exit(1); + } +- g_lora_adapters_count++; ++ g_lora_adapters.push_back(adapter_container); + } + + if (FLAG_lora_init_without_apply) { +@@ -203,9 +186,9 @@ main(int argc, char* argv[]) + delete slots; + + // Cleanup LoRA adapters +- for (int i = 0; i < g_lora_adapters_count; i++) { +- if (g_lora_adapters[i].adapter) { +- llama_lora_adapter_free(g_lora_adapters[i].adapter); ++ for (auto& la : g_lora_adapters) { ++ if (la.adapter) { ++ llama_lora_adapter_free(la.adapter); + } + } + +diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp +index 55138417b..a081d69df 100644 +--- a/llamafile/server/slot.cpp ++++ 
b/llamafile/server/slot.cpp +@@ -18,6 +18,7 @@ + #include "slot.h" + #include "llama.cpp/llava/clip.h" + #include "llama.cpp/llava/llava.h" ++#include "llama.cpp/common.h" + #include "llamafile/image.h" + #include "llamafile/llama.h" + #include "llamafile/llamafile.h" +@@ -32,6 +33,9 @@ + #include + #include + ++// External declaration for global LoRA adapter storage ++extern std::vector g_lora_adapters; ++ + namespace lf { + namespace server { + +@@ -79,7 +83,7 @@ Slot::describe_error(int err) + } + } + +-Slot::Slot(int id, llama_model* model) : id_(id), model_(model) ++Slot::Slot(int id, llama_model* model) : id_(id), model_(model), needs_refresh_(false) + { + dll_init(&elem_); + last_used_ = time(0); +@@ -126,24 +130,16 @@ Slot::start() + if (!(ctx_ = llama_new_context_with_model(model_, cparams))) + return false; + +- // Apply LoRA adapters if available +- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; +- float scales[MAX_LORA_ADAPTERS]; +- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); +- +- if (adapter_count > 0) { +- SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); +- for (int i = 0; i < adapter_count; i++) { +- if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { +- SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); +- llama_free(ctx_); +- ctx_ = nullptr; +- return false; +- } +- char scale_buf[32]; +- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); +- SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); +- } ++ // Apply LoRA adapters if available using llama.cpp's unified function ++ if (!::g_lora_adapters.empty() && !FLAG_lora_init_without_apply) { ++ SLOG("applying %d LoRA adapter(s) to slot #%d using llama.cpp unified function", ++ (int)::g_lora_adapters.size(), id_); ++ llama_lora_adapters_apply(ctx_, ::g_lora_adapters); ++ } else if (!::g_lora_adapters.empty() && FLAG_lora_init_without_apply) { ++ // When --lora-init-without-apply is set, explicitly clear any LoRA state ++ // to ensure no residual LoRA effects from model initialization ++ SLOG("clearing LoRA state for slot #%d (--lora-init-without-apply mode)", id_); ++ llama_lora_adapter_clear(ctx_); + } + + if (FLAG_mmproj) +@@ -314,6 +310,15 @@ Slot::prefill(const std::vector& atoms, const ProgressCallback& progress) + if (!ctx_) + return uninitialized; + ++ // Check if we need to refresh due to LoRA adapter changes ++ if (needs_refresh_) { ++ SLOG("Refreshing slot due to LoRA adapter changes"); ++ llama_kv_cache_clear(ctx_); ++ history_.clear(); ++ needs_refresh_ = false; ++ // Fall through to normal prefill logic with cleared state ++ } ++ + // handle special case of empty prefill + if (atoms.empty()) { + llama_kv_cache_clear(ctx_); +@@ -458,5 +463,11 @@ Slot::dump(std::string* result) + } + } + ++void ++Slot::mark_for_refresh() ++{ ++ needs_refresh_ = true; ++} ++ + } // namespace server + } // namespace lf +diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h +index e8816c900..104aa7623 100644 +--- a/llamafile/server/slot.h ++++ b/llamafile/server/slot.h +@@ -23,7 +23,6 @@ + #include + + #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) +-#define MAX_LORA_ADAPTERS 8 + + struct llama_context; + struct llama_model; +@@ -66,6 +65,7 @@ struct Slot + llama_context* ctx_ = nullptr; + std::vector history_; + std::string system_fingerprint_; ++ bool needs_refresh_ = false; + + ~Slot(); + Slot(int, llama_model*); +@@ -79,6 +79,7 @@ struct Slot + int prefill(const std::vector&, const 
ProgressCallback& = nullptr); + void tokenize(std::vector*, std::string_view, bool); + void dump(std::string*); ++ void mark_for_refresh(); + }; + + } // namespace server diff --git a/llamafile/server/client.h b/llamafile/server/client.h index 74d1314e62..f82eed4225 100644 --- a/llamafile/server/client.h +++ b/llamafile/server/client.h @@ -25,6 +25,7 @@ #include #include #include +#include "llama.cpp/common.h" #define HasHeader(H) (!!msg_.headers[H].a) #define HeaderData(H) (ibuf_.p + msg_.headers[H].a) @@ -141,13 +142,4 @@ struct Client } // namespace lf // Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp) -#define MAX_LORA_ADAPTERS 8 -struct lora_adapter_container { - struct llama_lora_adapter* adapter; - float scale; - std::string name; // Model/adapter name for identification - bool applied; // Whether this adapter is currently applied to slots -}; - -extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; -extern int g_lora_adapters_count; +// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp index 35e55198df..c048a2a4ef 100644 --- a/llamafile/server/lora_adapters.cpp +++ b/llamafile/server/lora_adapters.cpp @@ -17,6 +17,7 @@ #include "client.h" #include "llama.cpp/llama.h" +#include "llama.cpp/common.h" #include "llamafile/json.h" #include "llamafile/llamafile.h" #include "llamafile/server/log.h" @@ -29,9 +30,7 @@ using jt::Json; // External declarations for global LoRA adapter storage from prog.cpp (outside namespace) -// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h -extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; -extern int g_lora_adapters_count; +extern std::vector g_lora_adapters; namespace lf { namespace server { @@ -46,12 +45,12 @@ Client::lora_adapters() json.setArray(); std::vector& json_array = json.getArray(); - for (int i = 0; i < g_lora_adapters_count; i++) { + for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { Json adapter; adapter.setObject(); - adapter["id"] = i; - adapter["path"] = g_lora_adapters[i].name; // Use name as path for now - adapter["scale"] = g_lora_adapters[i].scale; + adapter["id"] = (int)i; + adapter["path"] = ::g_lora_adapters[i].path; + adapter["scale"] = ::g_lora_adapters[i].scale; json_array.push_back(adapter); } @@ -93,7 +92,7 @@ bool Client::handle_apply_adapters(Json& json) { // Get active slots and apply current adapters to them - if (g_lora_adapters_count == 0) { + if (::g_lora_adapters.empty()) { Json response; response["success"] = false; response["message"] = "No adapters loaded to apply"; @@ -103,14 +102,34 @@ Client::handle_apply_adapters(Json& json) return send_response(obuf_.p, p, response.toString()); } - // Apply adapters to all slots via the server - // Note: This would require coordination with the slot management system - SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); + // Apply adapters to all slots via the server using llama.cpp unified function + SLOG("applying %d LoRA adapter(s) to all active slots using llama.cpp unified function", + (int)::g_lora_adapters.size()); + + // Apply to all active slots + Slots* slots = worker_->server_->slots_; + pthread_mutex_lock(&slots->lock_); + + for (size_t i = 0; i < slots->slots_.size(); ++i) { + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("applying LoRA adapters to slot #%d", 
slot->id_); + llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); + + // CRITICAL: Mark slot for refresh to handle LoRA changes properly + // The slot's prefill() mechanism will intelligently preserve system prompts + // and only re-evaluate what's necessary when the next request comes in + slot->mark_for_refresh(); + SLOG("marked slot #%d for refresh after LoRA application", slot->id_); + } + } + + pthread_mutex_unlock(&slots->lock_); Json response; response["success"] = true; response["message"] = "Adapters applied to active slots"; - response["adapters_applied"] = g_lora_adapters_count; + response["adapters_applied"] = (int)::g_lora_adapters.size(); char* p = append_http_response_message(obuf_.p, 200); p = stpcpy(p, "Content-Type: application/json\r\n"); @@ -128,18 +147,6 @@ Client::handle_load_adapter(Json& json) std::string adapter_path = json["path"].getString(); float scale = json.contains("scale") ? json["scale"].getNumber() : 1.0f; - // Check if we have room for more adapters - if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { - Json response; - response["success"] = false; - response["message"] = "Maximum number of adapters already loaded"; - response["max_adapters"] = MAX_LORA_ADAPTERS; - - char* p = append_http_response_message(obuf_.p, 400); - p = stpcpy(p, "Content-Type: application/json\r\n"); - return send_response(obuf_.p, p, response.toString()); - } - // Check if file exists if (!std::filesystem::exists(adapter_path)) { Json response; @@ -167,11 +174,15 @@ Client::handle_load_adapter(Json& json) return send_response(obuf_.p, p, response.toString()); } + // Create the adapter container + llama_lora_adapter_container adapter_container; + adapter_container.path = adapter_path; + adapter_container.scale = scale; + adapter_container.adapter = adapter; + // Store the adapter - int index = g_lora_adapters_count; - g_lora_adapters[index].adapter = adapter; - g_lora_adapters[index].scale = scale; - g_lora_adapters_count++; + int index = (int)::g_lora_adapters.size(); + ::g_lora_adapters.push_back(adapter_container); SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); @@ -181,7 +192,7 @@ Client::handle_load_adapter(Json& json) response["index"] = index; response["path"] = adapter_path; response["scale"] = scale; - response["total_adapters"] = g_lora_adapters_count; + response["total_adapters"] = (int)::g_lora_adapters.size(); char* p = append_http_response_message(obuf_.p, 200); p = stpcpy(p, "Content-Type: application/json\r\n"); @@ -192,18 +203,16 @@ bool Client::handle_clear_adapters() { // Clear all loaded adapters - SLOG("clearing all %d LoRA adapter(s)", g_lora_adapters_count); + SLOG("clearing all %d LoRA adapter(s)", (int)::g_lora_adapters.size()); - for (int i = 0; i < g_lora_adapters_count; i++) { - if (g_lora_adapters[i].adapter) { - llama_lora_adapter_free(g_lora_adapters[i].adapter); - g_lora_adapters[i].adapter = nullptr; - g_lora_adapters[i].scale = 0.0f; + int cleared_count = (int)::g_lora_adapters.size(); + for (auto& la : ::g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); } } - int cleared_count = g_lora_adapters_count; - g_lora_adapters_count = 0; + ::g_lora_adapters.clear(); SLOG("cleared %d LoRA adapter(s)", cleared_count); @@ -225,11 +234,6 @@ Client::handle_upstream_lora_apply(Json& json) std::vector& json_array = json.getArray(); SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); - // First, reset all adapter scales to 0.0 (disabled) - for (int i = 0; i < 
g_lora_adapters_count; i++) { - g_lora_adapters[i].applied = false; - } - // Process each entry in the array for (size_t i = 0; i < json_array.size(); i++) { Json& entry = json_array[i]; @@ -246,22 +250,21 @@ Client::handle_upstream_lora_apply(Json& json) float scale = entry["scale"].getNumber(); // Validate ID range - if (id < 0 || id >= g_lora_adapters_count) { + if (id < 0 || id >= (int)::g_lora_adapters.size()) { return send_error(400, "Invalid adapter ID"); } // Update the adapter configuration - g_lora_adapters[id].scale = scale; - g_lora_adapters[id].applied = (scale > 0.0f); + ::g_lora_adapters[id].scale = scale; char scale_buf[32]; snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); SLOG("set LoRA adapter %d ('%s') scale to %s", - id, g_lora_adapters[id].name.c_str(), scale_buf); + id, ::g_lora_adapters[id].path.c_str(), scale_buf); } - // Re-apply LoRA adapters to all active slots with updated scales - SLOG("re-applying LoRA adapters to all active slots"); + // Re-apply LoRA adapters to all active slots with updated scales using llama.cpp unified function + SLOG("re-applying LoRA adapters to all active slots using llama.cpp unified function"); Slots* slots = worker_->server_->slots_; // Lock the slots to prevent concurrent access during LoRA re-application @@ -271,32 +274,13 @@ Client::handle_upstream_lora_apply(Json& json) Slot* slot = slots->slots_[i].get(); if (slot->ctx_) { SLOG("re-applying LoRA adapters to slot #%d", slot->id_); + llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); - // Clear existing LoRA adapters from this context - llama_lora_adapter_clear(slot->ctx_); - - // Use the same approach as slot initialization: get all adapters via the function - struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; - float scales[MAX_LORA_ADAPTERS]; - int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); - - SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); - - // Re-apply all adapters with their current scales - for (int j = 0; j < adapter_count; ++j) { - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); - SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); - if (scales[j] > 0.0f) { - if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { - SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); - } else { - SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); - } - } else { - SLOG("skipping LoRA adapter %d (scale %s <= 0)", j, scale_buf); - } - } + // CRITICAL: Mark slot for refresh to handle LoRA changes properly + // The slot's prefill() mechanism will intelligently preserve system prompts + // and only re-evaluate what's necessary when the next request comes in + slot->mark_for_refresh(); + SLOG("marked slot #%d for refresh after LoRA update", slot->id_); } } @@ -307,12 +291,13 @@ Client::handle_upstream_lora_apply(Json& json) Json response; response.setArray(); std::vector& response_array = response.getArray(); - for (int i = 0; i < g_lora_adapters_count; i++) { + + for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { Json adapter; adapter.setObject(); - adapter["id"] = i; - adapter["path"] = g_lora_adapters[i].name; - adapter["scale"] = g_lora_adapters[i].scale; + adapter["id"] = (int)i; + adapter["path"] = ::g_lora_adapters[i].path; + adapter["scale"] = ::g_lora_adapters[i].scale; response_array.push_back(adapter); } diff --git a/llamafile/server/prog.cpp 
b/llamafile/server/prog.cpp index a21c809614..89ae3f12d7 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -26,31 +26,21 @@ #include "llamafile/server/tokenbucket.h" #include "llamafile/server/utils.h" #include "llamafile/version.h" +#include "llama.cpp/common.h" #include #include -// Global LoRA adapter storage for multiple adapters -#define MAX_LORA_ADAPTERS 8 -#include -struct lora_adapter_container { - struct llama_lora_adapter* adapter; - float scale; - std::string name; // Model/adapter name for identification - bool applied; // Whether this adapter is currently applied to slots -}; - -// Make these externally accessible for HTTP endpoint -struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; -int g_lora_adapters_count = 0; +// Global LoRA adapter storage using llama.cpp structures +std::vector g_lora_adapters; // Function to get the first global LoRA adapter for backward compatibility extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { - return g_lora_adapters_count > 0 ? g_lora_adapters[0].adapter : nullptr; + return g_lora_adapters.empty() ? nullptr : g_lora_adapters[0].adapter; } // Function to get all LoRA adapters and their count extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { - int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; + int count = std::min((int)g_lora_adapters.size(), max_adapters); for (int i = 0; i < count; i++) { adapters[i] = g_lora_adapters[i].adapter; scales[i] = g_lora_adapters[i].scale; @@ -129,38 +119,31 @@ main(int argc, char* argv[]) char scale_buf[32]; snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); - // Generate model name from filename + // Generate model name from filename for identification const char* path = FLAG_lora_adapters[i].path; const char* filename = strrchr(path, '/'); filename = filename ? 
filename + 1 : path; - // Remove file extension for cleaner name - std::string model_name(filename); - size_t dot_pos = model_name.find_last_of('.'); - if (dot_pos != std::string::npos) { - model_name = model_name.substr(0, dot_pos); - } - SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, - model_name.c_str(), path, scale_buf); - - g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); - g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; - g_lora_adapters[i].name = model_name; - g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set + filename, path, scale_buf); + + llama_lora_adapter_container adapter_container; + adapter_container.path = std::string(path); + adapter_container.scale = FLAG_lora_adapters[i].scale; + adapter_container.adapter = llama_lora_adapter_init(model, path); - if (!g_lora_adapters[i].adapter) { + if (!adapter_container.adapter) { fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); // Cleanup previously loaded adapters - for (int j = 0; j < i; j++) { - if (g_lora_adapters[j].adapter) { - llama_lora_adapter_free(g_lora_adapters[j].adapter); + for (auto& la : g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); } } llama_free_model(model); exit(1); } - g_lora_adapters_count++; + g_lora_adapters.push_back(adapter_container); } if (FLAG_lora_init_without_apply) { @@ -203,9 +186,9 @@ main(int argc, char* argv[]) delete slots; // Cleanup LoRA adapters - for (int i = 0; i < g_lora_adapters_count; i++) { - if (g_lora_adapters[i].adapter) { - llama_lora_adapter_free(g_lora_adapters[i].adapter); + for (auto& la : g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); } } diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index 55138417b1..a081d69dfb 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -18,6 +18,7 @@ #include "slot.h" #include "llama.cpp/llava/clip.h" #include "llama.cpp/llava/llava.h" +#include "llama.cpp/common.h" #include "llamafile/image.h" #include "llamafile/llama.h" #include "llamafile/llamafile.h" @@ -32,6 +33,9 @@ #include #include +// External declaration for global LoRA adapter storage +extern std::vector g_lora_adapters; + namespace lf { namespace server { @@ -79,7 +83,7 @@ Slot::describe_error(int err) } } -Slot::Slot(int id, llama_model* model) : id_(id), model_(model) +Slot::Slot(int id, llama_model* model) : id_(id), model_(model), needs_refresh_(false) { dll_init(&elem_); last_used_ = time(0); @@ -126,24 +130,16 @@ Slot::start() if (!(ctx_ = llama_new_context_with_model(model_, cparams))) return false; - // Apply LoRA adapters if available - struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; - float scales[MAX_LORA_ADAPTERS]; - int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); - - if (adapter_count > 0) { - SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); - for (int i = 0; i < adapter_count; i++) { - if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { - SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); - llama_free(ctx_); - ctx_ = nullptr; - return false; - } - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); - SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); - } + // Apply LoRA adapters if available using llama.cpp's unified function + if (!::g_lora_adapters.empty() && !FLAG_lora_init_without_apply) { + 
SLOG("applying %d LoRA adapter(s) to slot #%d using llama.cpp unified function", + (int)::g_lora_adapters.size(), id_); + llama_lora_adapters_apply(ctx_, ::g_lora_adapters); + } else if (!::g_lora_adapters.empty() && FLAG_lora_init_without_apply) { + // When --lora-init-without-apply is set, explicitly clear any LoRA state + // to ensure no residual LoRA effects from model initialization + SLOG("clearing LoRA state for slot #%d (--lora-init-without-apply mode)", id_); + llama_lora_adapter_clear(ctx_); } if (FLAG_mmproj) @@ -314,6 +310,15 @@ Slot::prefill(const std::vector& atoms, const ProgressCallback& progress) if (!ctx_) return uninitialized; + // Check if we need to refresh due to LoRA adapter changes + if (needs_refresh_) { + SLOG("Refreshing slot due to LoRA adapter changes"); + llama_kv_cache_clear(ctx_); + history_.clear(); + needs_refresh_ = false; + // Fall through to normal prefill logic with cleared state + } + // handle special case of empty prefill if (atoms.empty()) { llama_kv_cache_clear(ctx_); @@ -458,5 +463,11 @@ Slot::dump(std::string* result) } } +void +Slot::mark_for_refresh() +{ + needs_refresh_ = true; +} + } // namespace server } // namespace lf diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h index e8816c9009..104aa7623c 100644 --- a/llamafile/server/slot.h +++ b/llamafile/server/slot.h @@ -23,7 +23,6 @@ #include #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) -#define MAX_LORA_ADAPTERS 8 struct llama_context; struct llama_model; @@ -66,6 +65,7 @@ struct Slot llama_context* ctx_ = nullptr; std::vector history_; std::string system_fingerprint_; + bool needs_refresh_ = false; ~Slot(); Slot(int, llama_model*); @@ -79,6 +79,7 @@ struct Slot int prefill(const std::vector&, const ProgressCallback& = nullptr); void tokenize(std::vector*, std::string_view, bool); void dump(std::string*); + void mark_for_refresh(); }; } // namespace server From f9204e755eee091f667ce7e365e6d2c0a2630849 Mon Sep 17 00:00:00 2001 From: loganpowell Date: Wed, 20 Aug 2025 14:00:15 -0400 Subject: [PATCH 8/9] removes .vscode setup --- .gitignore | 3 +- .vscode/c_cpp_properties.json | 60 ------------ .vscode/launch.json | 43 --------- .vscode/settings.json | 38 -------- .vscode/tasks.json | 175 ---------------------------------- RELEASE.md | 2 +- 6 files changed, 3 insertions(+), 318 deletions(-) delete mode 100644 .vscode/c_cpp_properties.json delete mode 100644 .vscode/launch.json delete mode 100644 .vscode/settings.json delete mode 100644 .vscode/tasks.json diff --git a/.gitignore b/.gitignore index 16feca060d..dd97ddc5b8 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ /trace.json /*.log -/.models \ No newline at end of file +/.models +/.vscode \ No newline at end of file diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json deleted file mode 100644 index d9cbe110f5..0000000000 --- a/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - // Simplified IntelliSense config without compile_commands.json - // Uses broad include/define coverage; may be less exact per-file. 
- "env": { - "cosmoccPath": "${workspaceFolder}/.cosmocc" - }, - "configurations": [ - { - "name": "Cosmopolitan", - "compilerPath": "${cosmoccPath}/3.9.7/bin/cosmocc", - "intelliSenseMode": "clang-x64", - "cppStandard": "gnu++23", - "cStandard": "gnu11", - "defines": [ - "GGML_MULTIPLATFORM", - "LLAMAFILE_DEBUG", - "_LIBCPP_HAS_NO_XLOCALE", - "_LIBCPP_HAS_MUSL_LIBC" - ], - "compilerArgs": [ - "-j8" - // "-std=gnu++23", - // "-Wall", - // "-Wextra", - // Force C++ mode and reassert libc++ include even if driver fallback fails - // "-nostdinc++", - // "-I${cosmoccPath}/3.9.7/include/c++/v1" - ], - "includePath": [ - "${workspaceFolder}", - "${workspaceFolder}/llamafile", - "${workspaceFolder}/llama.cpp", - "${workspaceFolder}/whisper.cpp", - "${workspaceFolder}/stable-diffusion.cpp", - "${workspaceFolder}/localscore", - "${workspaceFolder}/third_party", - "${cosmoccPath}/include", - "${cosmoccPath}/3.9.7/include", - "${cosmoccPath}/3.9.7/include/c++/v1" - ], - "forcedInclude": [ - // Normalizes some Cosmopolitan integral typedefs early. - "${cosmoccPath}/include/libc/integral/normalize.inc" - ], - "browse": { - "path": [ - "${workspaceFolder}", - "${workspaceFolder}/llamafile", - "${workspaceFolder}/llama.cpp", - "${workspaceFolder}/whisper.cpp", - "${workspaceFolder}/stable-diffusion.cpp", - "${workspaceFolder}/localscore", - "${workspaceFolder}/third_party" - ], - "limitSymbolsToIncludedHeaders": false - } - } - ], - "version": 4 -} diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index c50ac4842c..0000000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": "Debug Server (llamafiler)", - "type": "cppdbg", - "request": "launch", - "MIMode": "lldb", - "program": "${workspaceFolder}/o/opt/llamafile/server/main", - "args": [], - "preLaunchTask": "Build (fast)", - "stopAtEntry": false - }, - { - "name": "Debug Llama CLI (llamafile)", - "type": "cppdbg", - "request": "launch", - "MIMode": "lldb", - "program": "${workspaceFolder}/o/opt/llama.cpp/main/main", - "args": [], - "preLaunchTask": "Build (fast)", - "stopAtEntry": false - }, - { - "name": "Debug Quantize Tool", - "type": "cppdbg", - "request": "launch", - "MIMode": "lldb", - "program": "${workspaceFolder}/o/opt/llama.cpp/quantize/quantize", - "args": [], - "preLaunchTask": "Build (fast)", - "stopAtEntry": false - }, - { - "name": "Attach to PID", - "type": "cppdbg", - "request": "attach", - "MIMode": "lldb", - "processId": "${command:pickProcess}", - "program": "${workspaceFolder}/o/opt/llamafile/server/main" - } - ] -} diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 0948ad7be5..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - // Core C/C++ extension behavior - "C_Cpp.default.configurationProvider": "ms-vscode.cpptools", // use c_cpp_properties.json - "C_Cpp.intelliSenseEngine": "default", - "C_Cpp.errorSquiggles": "disabled", - "C_Cpp.autoAddFileAssociations": false, - "C_Cpp.default.browse.limitSymbolsToIncludedHeaders": false, - "C_Cpp.workspaceParsingPriority": "highest", - "C_Cpp.loggingLevel": "Warning", - // Speed: avoid re-indexing node_modules or build/artifacts if present - "files.watcherExclude": { - "**/o/**": true, - "**/.git/**": true, - "**/.cosmocc/**": true - }, - "search.exclude": { - "o": true, - "**/o/**": true - }, - // Formatting / style (adjust to project preference) - "editor.formatOnSave": false, - "C_Cpp.formatting": "disabled", - // Diagnostics 
tuning: treat missing headers as warnings (since we simplified config) - "C_Cpp.codeAnalysis.clangTidy.enabled": false, - "C_Cpp.intelliSenseCacheSize": 512, - // Optional UI niceties - "C_Cpp.enhancedColorization": "enabled", - "C_Cpp.dimInactiveRegions": true, - // File associations - "files.associations": { - "*.cpp": "cpp", - "*.c": "c", - "*.h": "c", - "*.hpp": "cpp", - "*.mk": "makefile", - "BUILD.mk": "makefile" - } -} diff --git a/.vscode/tasks.json b/.vscode/tasks.json deleted file mode 100644 index 894f7a924f..0000000000 --- a/.vscode/tasks.json +++ /dev/null @@ -1,175 +0,0 @@ -{ - "version": "2.0.0", - "tasks": [ - { - "label": "Build (fast)", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": { - "kind": "build", - "isDefault": true - }, - "problemMatcher": [ - "$gcc" - ], - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Rebuild (clean + all)", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "clean", - "&&", - "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "-j8" - ], - "problemMatcher": [ - "$gcc" - ], - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Build vmathf_test", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8", - "o/opt/llamafile/vmathf_test" - ], - "problemMatcher": [ - "$gcc" - ], - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Run vmathf_test", - "type": "shell", - "command": "o/opt/llamafile/vmathf_test", - "dependsOn": "Build vmathf_test", - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Clean", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "clean" - ], - "group": "build", - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Watch (incremental)", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "isBackground": true, - "problemMatcher": [ - { - "owner": "cpp", - "pattern": { - "regexp": "^(.*):(\\d+):(\\d+): (warning|error): (.*)$", - "file": 1, - "line": 2, - "column": 3, - "severity": 4, - "message": 5 - }, - "background": { - "activeOnStart": true, - "beginsPattern": "^.*Building.*$", - "endsPattern": "^.*(error|warning|linking).*$" - } - } - ], - "presentation": { - "reveal": "never", - "panel": "dedicated" - } - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": 
"/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - } - ] -} \ No newline at end of file diff --git a/RELEASE.md b/RELEASE.md index 9a1519948c..8af0a68a40 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -34,7 +34,7 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. # Dynamic scale adjustment via API curl -X POST http://localhost:8080/lora-adapters \ -H "Content-Type: application/json" \ - -d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1.2}]' + -d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1}]' ``` This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows with real-time adaptation capabilities. From 86cae3d8f6e087920642262957ebd34c4dde8b02 Mon Sep 17 00:00:00 2001 From: loganpowell Date: Wed, 20 Aug 2025 14:43:58 -0400 Subject: [PATCH 9/9] hk --- .gitignore | 4 +- diff.txt | 512 ----------------------------------------------------- 2 files changed, 2 insertions(+), 514 deletions(-) delete mode 100644 diff.txt diff --git a/.gitignore b/.gitignore index dd97ddc5b8..95709537cd 100644 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,5 @@ /trace.json /*.log -/.models -/.vscode \ No newline at end of file +.models +.vscode \ No newline at end of file diff --git a/diff.txt b/diff.txt deleted file mode 100644 index c6be66ce7e..0000000000 --- a/diff.txt +++ /dev/null @@ -1,512 +0,0 @@ -diff --git a/llamafile/server/client.h b/llamafile/server/client.h -index 74d1314e6..f82eed422 100644 ---- a/llamafile/server/client.h -+++ b/llamafile/server/client.h -@@ -25,6 +25,7 @@ - #include - #include - #include -+#include "llama.cpp/common.h" - - #define HasHeader(H) (!!msg_.headers[H].a) - #define HeaderData(H) (ibuf_.p + msg_.headers[H].a) -@@ -141,13 +142,4 @@ struct Client - } // namespace lf - - // Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp) --#define MAX_LORA_ADAPTERS 8 --struct lora_adapter_container { -- struct llama_lora_adapter* adapter; -- float scale; -- std::string name; // Model/adapter name for identification -- bool applied; // Whether this adapter is currently applied to slots --}; -- --extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; --extern int g_lora_adapters_count; -+// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead -diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp -index 35e55198d..c048a2a4e 100644 ---- a/llamafile/server/lora_adapters.cpp -+++ b/llamafile/server/lora_adapters.cpp -@@ -17,6 +17,7 @@ - - #include "client.h" - #include "llama.cpp/llama.h" -+#include "llama.cpp/common.h" - #include "llamafile/json.h" - #include "llamafile/llamafile.h" - #include "llamafile/server/log.h" -@@ -29,9 +30,7 @@ - using jt::Json; - - // External declarations for global LoRA adapter storage from prog.cpp (outside namespace) --// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h --extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; --extern int g_lora_adapters_count; -+extern std::vector g_lora_adapters; - - namespace lf { - namespace server { -@@ -46,12 +45,12 @@ Client::lora_adapters() - json.setArray(); - std::vector& json_array = 
json.getArray(); - -- for (int i = 0; i < g_lora_adapters_count; i++) { -+ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { - Json adapter; - adapter.setObject(); -- adapter["id"] = i; -- adapter["path"] = g_lora_adapters[i].name; // Use name as path for now -- adapter["scale"] = g_lora_adapters[i].scale; -+ adapter["id"] = (int)i; -+ adapter["path"] = ::g_lora_adapters[i].path; -+ adapter["scale"] = ::g_lora_adapters[i].scale; - json_array.push_back(adapter); - } - -@@ -93,7 +92,7 @@ bool - Client::handle_apply_adapters(Json& json) - { - // Get active slots and apply current adapters to them -- if (g_lora_adapters_count == 0) { -+ if (::g_lora_adapters.empty()) { - Json response; - response["success"] = false; - response["message"] = "No adapters loaded to apply"; -@@ -103,14 +102,34 @@ Client::handle_apply_adapters(Json& json) - return send_response(obuf_.p, p, response.toString()); - } - -- // Apply adapters to all slots via the server -- // Note: This would require coordination with the slot management system -- SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); -+ // Apply adapters to all slots via the server using llama.cpp unified function -+ SLOG("applying %d LoRA adapter(s) to all active slots using llama.cpp unified function", -+ (int)::g_lora_adapters.size()); -+ -+ // Apply to all active slots -+ Slots* slots = worker_->server_->slots_; -+ pthread_mutex_lock(&slots->lock_); -+ -+ for (size_t i = 0; i < slots->slots_.size(); ++i) { -+ Slot* slot = slots->slots_[i].get(); -+ if (slot->ctx_) { -+ SLOG("applying LoRA adapters to slot #%d", slot->id_); -+ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); -+ -+ // CRITICAL: Mark slot for refresh to handle LoRA changes properly -+ // The slot's prefill() mechanism will intelligently preserve system prompts -+ // and only re-evaluate what's necessary when the next request comes in -+ slot->mark_for_refresh(); -+ SLOG("marked slot #%d for refresh after LoRA application", slot->id_); -+ } -+ } -+ -+ pthread_mutex_unlock(&slots->lock_); - - Json response; - response["success"] = true; - response["message"] = "Adapters applied to active slots"; -- response["adapters_applied"] = g_lora_adapters_count; -+ response["adapters_applied"] = (int)::g_lora_adapters.size(); - - char* p = append_http_response_message(obuf_.p, 200); - p = stpcpy(p, "Content-Type: application/json\r\n"); -@@ -128,18 +147,6 @@ Client::handle_load_adapter(Json& json) - std::string adapter_path = json["path"].getString(); - float scale = json.contains("scale") ? 
json["scale"].getNumber() : 1.0f; - -- // Check if we have room for more adapters -- if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { -- Json response; -- response["success"] = false; -- response["message"] = "Maximum number of adapters already loaded"; -- response["max_adapters"] = MAX_LORA_ADAPTERS; -- -- char* p = append_http_response_message(obuf_.p, 400); -- p = stpcpy(p, "Content-Type: application/json\r\n"); -- return send_response(obuf_.p, p, response.toString()); -- } -- - // Check if file exists - if (!std::filesystem::exists(adapter_path)) { - Json response; -@@ -167,11 +174,15 @@ Client::handle_load_adapter(Json& json) - return send_response(obuf_.p, p, response.toString()); - } - -+ // Create the adapter container -+ llama_lora_adapter_container adapter_container; -+ adapter_container.path = adapter_path; -+ adapter_container.scale = scale; -+ adapter_container.adapter = adapter; -+ - // Store the adapter -- int index = g_lora_adapters_count; -- g_lora_adapters[index].adapter = adapter; -- g_lora_adapters[index].scale = scale; -- g_lora_adapters_count++; -+ int index = (int)::g_lora_adapters.size(); -+ ::g_lora_adapters.push_back(adapter_container); - - SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); - -@@ -181,7 +192,7 @@ Client::handle_load_adapter(Json& json) - response["index"] = index; - response["path"] = adapter_path; - response["scale"] = scale; -- response["total_adapters"] = g_lora_adapters_count; -+ response["total_adapters"] = (int)::g_lora_adapters.size(); - - char* p = append_http_response_message(obuf_.p, 200); - p = stpcpy(p, "Content-Type: application/json\r\n"); -@@ -192,18 +203,16 @@ bool - Client::handle_clear_adapters() - { - // Clear all loaded adapters -- SLOG("clearing all %d LoRA adapter(s)", g_lora_adapters_count); -+ SLOG("clearing all %d LoRA adapter(s)", (int)::g_lora_adapters.size()); - -- for (int i = 0; i < g_lora_adapters_count; i++) { -- if (g_lora_adapters[i].adapter) { -- llama_lora_adapter_free(g_lora_adapters[i].adapter); -- g_lora_adapters[i].adapter = nullptr; -- g_lora_adapters[i].scale = 0.0f; -+ int cleared_count = (int)::g_lora_adapters.size(); -+ for (auto& la : ::g_lora_adapters) { -+ if (la.adapter) { -+ llama_lora_adapter_free(la.adapter); - } - } - -- int cleared_count = g_lora_adapters_count; -- g_lora_adapters_count = 0; -+ ::g_lora_adapters.clear(); - - SLOG("cleared %d LoRA adapter(s)", cleared_count); - -@@ -225,11 +234,6 @@ Client::handle_upstream_lora_apply(Json& json) - std::vector& json_array = json.getArray(); - SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); - -- // First, reset all adapter scales to 0.0 (disabled) -- for (int i = 0; i < g_lora_adapters_count; i++) { -- g_lora_adapters[i].applied = false; -- } -- - // Process each entry in the array - for (size_t i = 0; i < json_array.size(); i++) { - Json& entry = json_array[i]; -@@ -246,22 +250,21 @@ Client::handle_upstream_lora_apply(Json& json) - float scale = entry["scale"].getNumber(); - - // Validate ID range -- if (id < 0 || id >= g_lora_adapters_count) { -+ if (id < 0 || id >= (int)::g_lora_adapters.size()) { - return send_error(400, "Invalid adapter ID"); - } - - // Update the adapter configuration -- g_lora_adapters[id].scale = scale; -- g_lora_adapters[id].applied = (scale > 0.0f); -+ ::g_lora_adapters[id].scale = scale; - - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); - SLOG("set LoRA adapter %d ('%s') scale to %s", -- id, g_lora_adapters[id].name.c_str(), 
scale_buf); -+ id, ::g_lora_adapters[id].path.c_str(), scale_buf); - } - -- // Re-apply LoRA adapters to all active slots with updated scales -- SLOG("re-applying LoRA adapters to all active slots"); -+ // Re-apply LoRA adapters to all active slots with updated scales using llama.cpp unified function -+ SLOG("re-applying LoRA adapters to all active slots using llama.cpp unified function"); - Slots* slots = worker_->server_->slots_; - - // Lock the slots to prevent concurrent access during LoRA re-application -@@ -271,32 +274,13 @@ Client::handle_upstream_lora_apply(Json& json) - Slot* slot = slots->slots_[i].get(); - if (slot->ctx_) { - SLOG("re-applying LoRA adapters to slot #%d", slot->id_); -+ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); - -- // Clear existing LoRA adapters from this context -- llama_lora_adapter_clear(slot->ctx_); -- -- // Use the same approach as slot initialization: get all adapters via the function -- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; -- float scales[MAX_LORA_ADAPTERS]; -- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); -- -- SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); -- -- // Re-apply all adapters with their current scales -- for (int j = 0; j < adapter_count; ++j) { -- char scale_buf[32]; -- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); -- SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); -- if (scales[j] > 0.0f) { -- if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { -- SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); -- } else { -- SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); -- } -- } else { -- SLOG("skipping LoRA adapter %d (scale %s <= 0)", j, scale_buf); -- } -- } -+ // CRITICAL: Mark slot for refresh to handle LoRA changes properly -+ // The slot's prefill() mechanism will intelligently preserve system prompts -+ // and only re-evaluate what's necessary when the next request comes in -+ slot->mark_for_refresh(); -+ SLOG("marked slot #%d for refresh after LoRA update", slot->id_); - } - } - -@@ -307,12 +291,13 @@ Client::handle_upstream_lora_apply(Json& json) - Json response; - response.setArray(); - std::vector& response_array = response.getArray(); -- for (int i = 0; i < g_lora_adapters_count; i++) { -+ -+ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { - Json adapter; - adapter.setObject(); -- adapter["id"] = i; -- adapter["path"] = g_lora_adapters[i].name; -- adapter["scale"] = g_lora_adapters[i].scale; -+ adapter["id"] = (int)i; -+ adapter["path"] = ::g_lora_adapters[i].path; -+ adapter["scale"] = ::g_lora_adapters[i].scale; - response_array.push_back(adapter); - } - -diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp -index a21c80961..89ae3f12d 100644 ---- a/llamafile/server/prog.cpp -+++ b/llamafile/server/prog.cpp -@@ -26,31 +26,21 @@ - #include "llamafile/server/tokenbucket.h" - #include "llamafile/server/utils.h" - #include "llamafile/version.h" -+#include "llama.cpp/common.h" - #include - #include - --// Global LoRA adapter storage for multiple adapters --#define MAX_LORA_ADAPTERS 8 --#include --struct lora_adapter_container { -- struct llama_lora_adapter* adapter; -- float scale; -- std::string name; // Model/adapter name for identification -- bool applied; // Whether this adapter is currently applied to slots --}; -- --// Make these externally accessible for HTTP endpoint --struct 
lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; --int g_lora_adapters_count = 0; -+// Global LoRA adapter storage using llama.cpp structures -+std::vector g_lora_adapters; - - // Function to get the first global LoRA adapter for backward compatibility - extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { -- return g_lora_adapters_count > 0 ? g_lora_adapters[0].adapter : nullptr; -+ return g_lora_adapters.empty() ? nullptr : g_lora_adapters[0].adapter; - } - - // Function to get all LoRA adapters and their count - extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { -- int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; -+ int count = std::min((int)g_lora_adapters.size(), max_adapters); - for (int i = 0; i < count; i++) { - adapters[i] = g_lora_adapters[i].adapter; - scales[i] = g_lora_adapters[i].scale; -@@ -129,38 +119,31 @@ main(int argc, char* argv[]) - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); - -- // Generate model name from filename -+ // Generate model name from filename for identification - const char* path = FLAG_lora_adapters[i].path; - const char* filename = strrchr(path, '/'); - filename = filename ? filename + 1 : path; - -- // Remove file extension for cleaner name -- std::string model_name(filename); -- size_t dot_pos = model_name.find_last_of('.'); -- if (dot_pos != std::string::npos) { -- model_name = model_name.substr(0, dot_pos); -- } -- - SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, -- model_name.c_str(), path, scale_buf); -- -- g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); -- g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; -- g_lora_adapters[i].name = model_name; -- g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set -+ filename, path, scale_buf); -+ -+ llama_lora_adapter_container adapter_container; -+ adapter_container.path = std::string(path); -+ adapter_container.scale = FLAG_lora_adapters[i].scale; -+ adapter_container.adapter = llama_lora_adapter_init(model, path); - -- if (!g_lora_adapters[i].adapter) { -+ if (!adapter_container.adapter) { - fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); - // Cleanup previously loaded adapters -- for (int j = 0; j < i; j++) { -- if (g_lora_adapters[j].adapter) { -- llama_lora_adapter_free(g_lora_adapters[j].adapter); -+ for (auto& la : g_lora_adapters) { -+ if (la.adapter) { -+ llama_lora_adapter_free(la.adapter); - } - } - llama_free_model(model); - exit(1); - } -- g_lora_adapters_count++; -+ g_lora_adapters.push_back(adapter_container); - } - - if (FLAG_lora_init_without_apply) { -@@ -203,9 +186,9 @@ main(int argc, char* argv[]) - delete slots; - - // Cleanup LoRA adapters -- for (int i = 0; i < g_lora_adapters_count; i++) { -- if (g_lora_adapters[i].adapter) { -- llama_lora_adapter_free(g_lora_adapters[i].adapter); -+ for (auto& la : g_lora_adapters) { -+ if (la.adapter) { -+ llama_lora_adapter_free(la.adapter); - } - } - -diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp -index 55138417b..a081d69df 100644 ---- a/llamafile/server/slot.cpp -+++ b/llamafile/server/slot.cpp -@@ -18,6 +18,7 @@ - #include "slot.h" - #include "llama.cpp/llava/clip.h" - #include "llama.cpp/llava/llava.h" -+#include "llama.cpp/common.h" - #include "llamafile/image.h" - #include "llamafile/llama.h" - #include 
"llamafile/llamafile.h" -@@ -32,6 +33,9 @@ - #include - #include - -+// External declaration for global LoRA adapter storage -+extern std::vector g_lora_adapters; -+ - namespace lf { - namespace server { - -@@ -79,7 +83,7 @@ Slot::describe_error(int err) - } - } - --Slot::Slot(int id, llama_model* model) : id_(id), model_(model) -+Slot::Slot(int id, llama_model* model) : id_(id), model_(model), needs_refresh_(false) - { - dll_init(&elem_); - last_used_ = time(0); -@@ -126,24 +130,16 @@ Slot::start() - if (!(ctx_ = llama_new_context_with_model(model_, cparams))) - return false; - -- // Apply LoRA adapters if available -- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; -- float scales[MAX_LORA_ADAPTERS]; -- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); -- -- if (adapter_count > 0) { -- SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); -- for (int i = 0; i < adapter_count; i++) { -- if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { -- SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); -- llama_free(ctx_); -- ctx_ = nullptr; -- return false; -- } -- char scale_buf[32]; -- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); -- SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); -- } -+ // Apply LoRA adapters if available using llama.cpp's unified function -+ if (!::g_lora_adapters.empty() && !FLAG_lora_init_without_apply) { -+ SLOG("applying %d LoRA adapter(s) to slot #%d using llama.cpp unified function", -+ (int)::g_lora_adapters.size(), id_); -+ llama_lora_adapters_apply(ctx_, ::g_lora_adapters); -+ } else if (!::g_lora_adapters.empty() && FLAG_lora_init_without_apply) { -+ // When --lora-init-without-apply is set, explicitly clear any LoRA state -+ // to ensure no residual LoRA effects from model initialization -+ SLOG("clearing LoRA state for slot #%d (--lora-init-without-apply mode)", id_); -+ llama_lora_adapter_clear(ctx_); - } - - if (FLAG_mmproj) -@@ -314,6 +310,15 @@ Slot::prefill(const std::vector& atoms, const ProgressCallback& progress) - if (!ctx_) - return uninitialized; - -+ // Check if we need to refresh due to LoRA adapter changes -+ if (needs_refresh_) { -+ SLOG("Refreshing slot due to LoRA adapter changes"); -+ llama_kv_cache_clear(ctx_); -+ history_.clear(); -+ needs_refresh_ = false; -+ // Fall through to normal prefill logic with cleared state -+ } -+ - // handle special case of empty prefill - if (atoms.empty()) { - llama_kv_cache_clear(ctx_); -@@ -458,5 +463,11 @@ Slot::dump(std::string* result) - } - } - -+void -+Slot::mark_for_refresh() -+{ -+ needs_refresh_ = true; -+} -+ - } // namespace server - } // namespace lf -diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h -index e8816c900..104aa7623 100644 ---- a/llamafile/server/slot.h -+++ b/llamafile/server/slot.h -@@ -23,7 +23,6 @@ - #include - - #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) --#define MAX_LORA_ADAPTERS 8 - - struct llama_context; - struct llama_model; -@@ -66,6 +65,7 @@ struct Slot - llama_context* ctx_ = nullptr; - std::vector history_; - std::string system_fingerprint_; -+ bool needs_refresh_ = false; - - ~Slot(); - Slot(int, llama_model*); -@@ -79,6 +79,7 @@ struct Slot - int prefill(const std::vector&, const ProgressCallback& = nullptr); - void tokenize(std::vector*, std::string_view, bool); - void dump(std::string*); -+ void mark_for_refresh(); - }; - - } // namespace server