From e3edebee0616481dc46c892b9acade851c46ac90 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 10:23:00 -0400 Subject: [PATCH 1/9] Add comprehensive LoRA adapter support to llamafile server Implements full LoRA (Low-Rank Adaptation) adapter support compatible with llama.cpp, enabling fine-tuning capabilities in llamafile server mode. Features: - Multiple LoRA adapter support with individual scaling factors - New command-line flags: --lora, --lora-scaled, --lora-base - Automatic memory mapping disabling for LoRA compatibility - Per-slot adapter application during initialization - Clean resource management and cleanup on shutdown Changes: - flags.cpp: Add LoRA flag parsing and global adapter management - prog.cpp: Implement adapter loading, validation, and cleanup - slot.cpp/slot.h: Add slot-level adapter application logic - llamafile.h: Define LoRA adapter data structures and constants - README.md: Add comprehensive LoRA usage documentation - RELEASE.md: Document new LoRA features for release notes The implementation follows llama.cpp patterns for maximum compatibility and provides a solid foundation for advanced fine-tuning workflows. Tested with Llama 3 8B + LoRA adapters, supporting both single and multiple adapter configurations with custom scaling factors. Resolves #697 --- .gitignore | 1 + README.md | 32 ++++++++++++++++++++ RELEASE.md | 26 ++++++++++++++++ llamafile/flags.cpp | 53 ++++++++++++++++++++++++++++++++ llamafile/llamafile.h | 12 ++++++++ llamafile/server/prog.cpp | 63 +++++++++++++++++++++++++++++++++++++-- llamafile/server/slot.cpp | 19 ++++++++++++ llamafile/server/slot.h | 8 +++++ 8 files changed, 212 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index d928dde4e9..16feca060d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ /trace.json /*.log +/.models \ No newline at end of file diff --git a/README.md b/README.md index e19d0a6014..25f1962314 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,38 @@ llamafile --server --v2 --help llamafile --server --v2 ``` +## LoRA Adapter Support + +Llamafile supports LoRA (Low-Rank Adaptation) adapters, allowing you to fine-tune models with adapter layers applied on top of the base model. This is compatible with adapters created for llama.cpp. 
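+
+Once the server is running with one or more adapters (see the usage examples below), inference goes through the same OpenAI-compatible endpoints as usual, with the adapters applied transparently on top of the base model. As a rough sanity check — the port is llamafile's default `8080`, and the prompt and `model` value are only examples:
+
+```bash
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "LLaMA_CPP",
+    "messages": [{"role": "user", "content": "Say something in the style this adapter was trained for."}]
+  }'
+```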
+ +### Using LoRA Adapters + +To use LoRA adapters with llamafile server, use the `--lora` and `--lora-scaled` flags: + +```bash +# Single adapter with default scale (1.0) +llamafile -m base_model.gguf --lora adapter.gguf --server + +# Single adapter with custom scale +llamafile -m base_model.gguf --lora-scaled adapter.gguf 0.8 --server + +# Multiple adapters with different scales +llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server +``` + +### LoRA Adapter Flags + +- `--lora [FNAME]`: Apply a LoRA adapter from the specified file (default scale: 1.0) +- `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor +- `--lora-base [FNAME]`: Optional base model for LoRA adapter (usually not needed) + +### Important Notes + +- LoRA adapters are automatically applied to each inference slot when they start +- Memory mapping (`mmap`) is automatically disabled when using LoRA adapters for compatibility +- Multiple adapters can be combined by using multiple `--lora` and `--lora-scaled` flags +- The base model specified by `-m` serves as the foundation for the LoRA adapter(s) + ## Other example llamafiles We also provide example llamafiles for other models, so you can easily diff --git a/RELEASE.md b/RELEASE.md index 76a1c5a1ae..b7fb2634e4 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,6 +2,32 @@ There are a few steps in making a Llamafile release which will be detailed in this document. +## What's New in This Release + +### LoRA Adapter Support + +This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, enabling fine-tuning capabilities compatible with llama.cpp. Key features include: + +- **Multiple LoRA Adapter Support**: Load and apply multiple LoRA adapters simultaneously with individual scaling factors +- **Server Integration**: Full integration with the llamafile server (`--server` mode) for LoRA-enhanced inference +- **Compatible Flags**: + - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0) + - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with custom scaling factor + - `--lora-base [FNAME]`: Optional base model for LoRA adapter (advanced use cases) +- **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility +- **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across server lifecycle + +Example usage: +```bash +# Single adapter with default scale +llamafile -m base_model.gguf --lora adapter.gguf --server + +# Multiple adapters with different scales +llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server +``` + +This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows. + The two primary artifacts of the release are the `llamafile-.zip` and the binaries for the GitHub release. 
## Release Process diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp index c0e3bb3b74..37e25cb4d0 100644 --- a/llamafile/flags.cpp +++ b/llamafile/flags.cpp @@ -65,6 +65,12 @@ const char *FLAG_model = nullptr; const char *FLAG_prompt = nullptr; const char *FLAG_url_prefix = ""; const char *FLAG_www_root = "/zip/www"; +const char *FLAG_lora = nullptr; +const char *FLAG_lora_base = nullptr; + +// Multiple LoRA adapters support +struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS] = {0}; +int FLAG_lora_adapters_count = 0; double FLAG_token_rate = 1; float FLAG_decay_growth = .01; float FLAG_frequency_penalty = 0; @@ -385,6 +391,53 @@ void llamafile_get_flags(int argc, char **argv) { continue; } + ////////////////////////////////////////////////////////////////////// + // LoRA flags + + if (!strcmp(flag, "--lora")) { + if (i == argc) + missing("--lora"); + if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) { + error("too many LoRA adapters (max 8)"); + } + FLAG_lora_adapters[FLAG_lora_adapters_count].path = argv[i++]; + FLAG_lora_adapters[FLAG_lora_adapters_count].scale = 1.0f; + FLAG_lora_adapters_count++; + + // Keep FLAG_lora for backward compatibility + if (!FLAG_lora) { + FLAG_lora = FLAG_lora_adapters[0].path; + } + continue; + } + + if (!strcmp(flag, "--lora-scaled")) { + if (i == argc) + missing("--lora-scaled"); + const char* lora_adapter = argv[i++]; + if (i == argc) + missing("--lora-scaled scale value"); + if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) { + error("too many LoRA adapters (max 8)"); + } + FLAG_lora_adapters[FLAG_lora_adapters_count].path = lora_adapter; + FLAG_lora_adapters[FLAG_lora_adapters_count].scale = atof(argv[i++]); + FLAG_lora_adapters_count++; + + // Keep FLAG_lora for backward compatibility + if (!FLAG_lora) { + FLAG_lora = FLAG_lora_adapters[0].path; + } + continue; + } + + if (!strcmp(flag, "--lora-base")) { + if (i == argc) + missing("--lora-base"); + FLAG_lora_base = argv[i++]; + continue; + } + ////////////////////////////////////////////////////////////////////// // model flags diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h index b74dda60dd..c5fd428439 100644 --- a/llamafile/llamafile.h +++ b/llamafile/llamafile.h @@ -36,6 +36,18 @@ extern const char *FLAG_prompt; extern const char *FLAG_url_prefix; extern const char *FLAG_www_root; extern double FLAG_token_rate; +extern const char *FLAG_lora; +extern const char *FLAG_lora_base; + +// LoRA adapter info structure to match llama.cpp +struct llamafile_lora_adapter_info { + const char* path; + float scale; +}; + +#define MAX_LORA_ADAPTERS 8 +extern struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS]; +extern int FLAG_lora_adapters_count; extern float FLAG_decay_growth; extern float FLAG_frequency_penalty; extern float FLAG_presence_penalty; diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp index bd6e6b6a24..237be622de 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -29,6 +29,31 @@ #include #include +// Global LoRA adapter storage for multiple adapters +#define MAX_LORA_ADAPTERS 8 +struct lora_adapter_container { + struct llama_lora_adapter* adapter; + float scale; +}; + +static struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {0}; +static int g_lora_adapters_count = 0; + +// Function to get the first global LoRA adapter for backward compatibility +extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { + return g_lora_adapters_count > 0 ? 
g_lora_adapters[0].adapter : nullptr; +} + +// Function to get all LoRA adapters and their count +extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { + int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; + for (int i = 0; i < count; i++) { + adapters[i] = g_lora_adapters[i].adapter; + scales[i] = g_lora_adapters[i].scale; + } + return count; +} + namespace lf { namespace server { @@ -69,6 +94,8 @@ main(int argc, char* argv[]) FLAG_log_disable = true; // load model + // --lora implies --no-mmap (as per llama.cpp server) + bool use_mmap = FLAG_mmap && (FLAG_lora_adapters_count == 0); llama_model_params mparams = { .n_gpu_layers = FLAG_n_gpu_layers, .split_mode = (enum llama_split_mode)FLAG_split_mode, @@ -79,8 +106,8 @@ main(int argc, char* argv[]) .progress_callback_user_data = nullptr, .kv_overrides = nullptr, .vocab_only = false, - .use_mmap = true, - .use_mlock = false, + .use_mmap = use_mmap, + .use_mlock = FLAG_mlock, .check_tensors = false, }; llama_model* model = llama_load_model_from_file(FLAG_model, mparams); @@ -89,6 +116,30 @@ main(int argc, char* argv[]) exit(1); } + // load LoRA adapters if specified + if (FLAG_lora_adapters_count > 0) { + SLOG("loading %d LoRA adapter(s)", FLAG_lora_adapters_count); + for (int i = 0; i < FLAG_lora_adapters_count; i++) { + SLOG("loading LoRA adapter %d from %s with scale %.2f", i + 1, + FLAG_lora_adapters[i].path, FLAG_lora_adapters[i].scale); + g_lora_adapters[i].adapter = llama_lora_adapter_init(model, FLAG_lora_adapters[i].path); + g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; + if (!g_lora_adapters[i].adapter) { + fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, FLAG_lora_adapters[i].path); + // Cleanup previously loaded adapters + for (int j = 0; j < i; j++) { + if (g_lora_adapters[j].adapter) { + llama_lora_adapter_free(g_lora_adapters[j].adapter); + } + } + llama_free_model(model); + exit(1); + } + g_lora_adapters_count++; + } + SLOG("all LoRA adapters loaded successfully"); + } + // create slots Slots* slots = new Slots(model); if (!slots->start(FLAG_slots)) { @@ -120,6 +171,14 @@ main(int argc, char* argv[]) g_server->close(); delete g_server; delete slots; + + // Cleanup LoRA adapters + for (int i = 0; i < g_lora_adapters_count; i++) { + if (g_lora_adapters[i].adapter) { + llama_lora_adapter_free(g_lora_adapters[i].adapter); + } + } + llama_free_model(model); tokenbucket_destroy(); time_destroy(); diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index c57ca2541c..d11d9bdf4e 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -125,6 +125,25 @@ Slot::start() system_fingerprint_ = generate_system_fingerprint(&cparams); if (!(ctx_ = llama_new_context_with_model(model_, cparams))) return false; + + // Apply LoRA adapters if available + struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; + float scales[MAX_LORA_ADAPTERS]; + int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); + + if (adapter_count > 0) { + SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); + for (int i = 0; i < adapter_count; i++) { + if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { + SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); + llama_free(ctx_); + ctx_ = nullptr; + return false; + } + SLOG("applied LoRA adapter %d to slot #%d with scale %.2f", i + 1, id_, scales[i]); + } + } + if (FLAG_mmproj) if 
(!(clip_ctx_ = clip_model_load(FLAG_mmproj, FLAG_verbose))) return false; diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h index 7fdd7bf881..e8816c9009 100644 --- a/llamafile/server/slot.h +++ b/llamafile/server/slot.h @@ -23,11 +23,19 @@ #include #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) +#define MAX_LORA_ADAPTERS 8 struct llama_context; struct llama_model; +struct llama_lora_adapter; struct clip_ctx; +// Function to get the global LoRA adapter +extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter(); + +// Function to get multiple LoRA adapters with their scales +extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters); + namespace lf { namespace server { From dc8a203dceac8b570e25af5d87cb37b0595026a0 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 10:56:35 -0400 Subject: [PATCH 2/9] fixes scale printing in server log --- llamafile/server/prog.cpp | 6 ++++-- llamafile/server/slot.cpp | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp index 237be622de..6377cabc78 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -120,8 +120,10 @@ main(int argc, char* argv[]) if (FLAG_lora_adapters_count > 0) { SLOG("loading %d LoRA adapter(s)", FLAG_lora_adapters_count); for (int i = 0; i < FLAG_lora_adapters_count; i++) { - SLOG("loading LoRA adapter %d from %s with scale %.2f", i + 1, - FLAG_lora_adapters[i].path, FLAG_lora_adapters[i].scale); + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); + SLOG("loading LoRA adapter %d from %s with scale %s", i + 1, + FLAG_lora_adapters[i].path, scale_buf); g_lora_adapters[i].adapter = llama_lora_adapter_init(model, FLAG_lora_adapters[i].path); g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; if (!g_lora_adapters[i].adapter) { diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index d11d9bdf4e..55138417b1 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -140,7 +140,9 @@ Slot::start() ctx_ = nullptr; return false; } - SLOG("applied LoRA adapter %d to slot #%d with scale %.2f", i + 1, id_, scales[i]); + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); + SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); } } From e3288bcba9d294cbe83601a251fca8a7e85c5ffc Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:29:47 -0400 Subject: [PATCH 3/9] adds multi-lora hot-swapping functionality with --lora-init-without-apply (mirroring llama.cpp functionality) --- README.md | 193 ++++++---- RELEASE.md | 29 +- llamafile/flags.cpp | 8 +- llamafile/llamafile.h | 3 +- llamafile/server/client.cpp | 2 + llamafile/server/client.h | 22 ++ llamafile/server/lora_adapters.cpp | 325 +++++++++++++++++ llamafile/server/main.1 | 25 ++ llamafile/server/main.1.asc | 561 +++++++++++++++-------------- llamafile/server/prog.cpp | 44 ++- 10 files changed, 842 insertions(+), 370 deletions(-) create mode 100644 llamafile/server/lora_adapters.cpp diff --git a/README.md b/README.md index 25f1962314..3c6cfb127f 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ ever leaves your computer. 2. Open your computer's terminal. 3. If you're using macOS, Linux, or BSD, you'll need to grant permission -for your computer to execute this new file. (You only need to do this -once.) + for your computer to execute this new file. 
(You only need to do this + once.) ```sh chmod +x llava-v1.5-7b-q4.llamafile @@ -48,10 +48,10 @@ chmod +x llava-v1.5-7b-q4.llamafile ``` 6. Your browser should open automatically and display a chat interface. -(If it doesn't, just open your browser and point it at http://localhost:8080) + (If it doesn't, just open your browser and point it at http://localhost:8080) 7. When you're done chatting, return to your terminal and hit -`Control-C` to shut down llamafile. + `Control-C` to shut down llamafile. **Having trouble? See the "Gotchas" section below.** @@ -103,25 +103,25 @@ The response that's printed should look like the following: ```json { - "choices" : [ - { - "finish_reason" : "stop", - "index" : 0, - "message" : { - "content" : "There once was a programmer named Mike\nWho wrote code that would often choke\nHe used try and except\nTo handle each step\nAnd his program ran without any hike.", - "role" : "assistant" - } + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "There once was a programmer named Mike\nWho wrote code that would often choke\nHe used try and except\nTo handle each step\nAnd his program ran without any hike.", + "role": "assistant" } - ], - "created" : 1704199256, - "id" : "chatcmpl-Dt16ugf3vF8btUZj9psG7To5tc4murBU", - "model" : "LLaMA_CPP", - "object" : "chat.completion", - "usage" : { - "completion_tokens" : 38, - "prompt_tokens" : 78, - "total_tokens" : 116 - } + } + ], + "created": 1704199256, + "id": "chatcmpl-Dt16ugf3vF8btUZj9psG7To5tc4murBU", + "model": "LLaMA_CPP", + "object": "chat.completion", + "usage": { + "completion_tokens": 38, + "prompt_tokens": 78, + "total_tokens": 116 + } } ``` @@ -201,39 +201,77 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor - `--lora-base [FNAME]`: Optional base model for LoRA adapter (usually not needed) +### Dynamic LoRA Adapter Management (Hot-Swapping) + +When running llamafile in server mode, you can dynamically adjust LoRA adapter scales during inference without restarting the server. This enables hot-swapping of adapter configurations on the fly. 
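+
+Because the server re-applies adapters to its inference slots with the scales you post, setting an adapter's scale to `0` effectively switches it off without unloading it, and a later positive scale turns it back on. For example — this assumes two adapters were loaded at startup with IDs `0` and `1`, using the endpoint described below:
+
+```bash
+# Keep the first adapter at full strength, disable the second one
+curl -X POST http://localhost:8080/lora-adapters \
+  -H "Content-Type: application/json" \
+  -d '[{"id": 0, "scale": 1.0}, {"id": 1, "scale": 0.0}]'
+```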
+ +#### API Endpoints + +- **GET** `/lora-adapters`: Returns current LoRA adapter configuration +- **POST** `/lora-adapters`: Updates LoRA adapter scales with JSON payload + +#### Examples + +**View current LoRA adapter configuration:** + +```bash +curl http://localhost:8080/lora-adapters +``` + +**Update LoRA adapter scales:** + +```bash +curl -X POST http://localhost:8080/lora-adapters \ + -H "Content-Type: application/json" \ + -d '[ + {"id": 0, "scale": 0.8}, + {"id": 1, "scale": 1.2} + ]' +``` + +The API returns JSON responses with the current adapter configuration: + +```json +[ + { "id": 0, "path": "adapter1.gguf", "scale": 0.8 }, + { "id": 1, "path": "adapter2.gguf", "scale": 1.2 } +] +``` + ### Important Notes - LoRA adapters are automatically applied to each inference slot when they start - Memory mapping (`mmap`) is automatically disabled when using LoRA adapters for compatibility - Multiple adapters can be combined by using multiple `--lora` and `--lora-scaled` flags - The base model specified by `-m` serves as the foundation for the LoRA adapter(s) +- Scale changes via the API take effect immediately for new inference requests +- Hot-swapping allows real-time fine-tuning of model behavior without server restart ## Other example llamafiles We also provide example llamafiles for other models, so you can easily try out llamafile with different kinds of LLMs. -| Model | Size | License | llamafile | other quants | -| --- | --- | --- | --- | --- | -| LLaMA 3.2 1B Instruct | 1.11 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-1B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile) | -| LLaMA 3.2 3B Instruct | 2.62 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-3B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/Llama-3.2-3B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile) | -| LLaMA 3.1 8B Instruct | 5.23 GB | [LLaMA 3.1](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.1-8B-Instruct.Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile) | -| Gemma 3 1B Instruct | 1.32 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-1b-it.Q6\_K.llamafile](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile/resolve/main/google_gemma-3-1b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile) | -| Gemma 3 4B Instruct | 3.50 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-4b-it.Q6\_K.llamafile](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile/resolve/main/google_gemma-3-4b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile) | -| Gemma 3 12B Instruct | 7.61 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-12b-it.Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile/resolve/main/google_gemma-3-12b-it-Q4_K_M.llamafile?download=true) | [See HF 
repo](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile) | -| QwQ 32B | 7.61 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen\_QwQ-32B-Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/QwQ-32B-llamafile/resolve/main/Qwen_QwQ-32B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/QwQ-32B-llamafile) | -| R1 Distill Qwen 14B | 9.30 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Qwen-14B-Q4\_K\_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile)| -| R1 Distill Llama 8B | 5.23 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Llama-8B-Q4\_K\_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile)| -| LLaVA 1.5 | 3.97 GB | [LLaMA 2](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) | [llava-v1.5-7b-q4.llamafile](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/llava-v1.5-7b-q4.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile) | -| Mistral-7B-Instruct v0.3| 4.42 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mistral-7b-instruct-v0.3.Q4\_0.llamafile](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile/resolve/main/Mistral-7B-Instruct-v0.3.Q4_0.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile) | -| Granite 3.2 8B Instruct | 5.25 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [granite-3.2-8b-instruct-Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile/resolve/main/granite-3.2-8b-instruct-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile) | -| Phi-3-mini-4k-instruct | 7.67 GB | [Apache 2.0](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/blob/main/LICENSE) | [Phi-3-mini-4k-instruct.F16.llamafile](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/resolve/main/Phi-3-mini-4k-instruct.F16.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile) | -| Mixtral-8x7B-Instruct | 30.03 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mixtral-8x7b-instruct-v0.1.Q5\_K\_M.llamafile](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile) | -| OLMo-7B | 5.68 GB | [Apache 2.0](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/blob/main/LICENSE) | [OLMo-7B-0424.Q6\_K.llamafile](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/resolve/main/OLMo-7B-0424.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile) | -| *Text Embedding Models* | | | | | -| E5-Mistral-7B-Instruct | 5.16 GB | [MIT](https://choosealicense.com/licenses/mit/) | [e5-mistral-7b-instruct-Q5_K_M.llamafile](https://huggingface.co/Mozilla/e5-mistral-7b-instruct/resolve/main/e5-mistral-7b-instruct-Q5_K_M.llamafile?download=true) | [See HF 
repo](https://huggingface.co/Mozilla/e5-mistral-7b-instruct) | -| mxbai-embed-large-v1 | 0.7 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mxbai-embed-large-v1-f16.llamafile](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile/resolve/main/mxbai-embed-large-v1-f16.llamafile?download=true) | [See HF Repo](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile) | - +| Model | Size | License | llamafile | other quants | +| ------------------------ | -------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | +| LLaMA 3.2 1B Instruct | 1.11 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-1B-Instruct.Q6_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile) | +| LLaMA 3.2 3B Instruct | 2.62 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-3B-Instruct.Q6_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/Llama-3.2-3B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile) | +| LLaMA 3.1 8B Instruct | 5.23 GB | [LLaMA 3.1](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.1-8B-Instruct.Q4_K_M.llamafile](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile) | +| Gemma 3 1B Instruct | 1.32 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-1b-it.Q6_K.llamafile](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile/resolve/main/google_gemma-3-1b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile) | +| Gemma 3 4B Instruct | 3.50 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-4b-it.Q6_K.llamafile](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile/resolve/main/google_gemma-3-4b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile) | +| Gemma 3 12B Instruct | 7.61 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-12b-it.Q4_K_M.llamafile](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile/resolve/main/google_gemma-3-12b-it-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile) | +| QwQ 32B | 7.61 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen_QwQ-32B-Q4_K_M.llamafile](https://huggingface.co/Mozilla/QwQ-32B-llamafile/resolve/main/Qwen_QwQ-32B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/QwQ-32B-llamafile) | +| R1 Distill Qwen 14B | 9.30 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Qwen-14B-Q4_K_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.llamafile?download=true) | [See HF 
repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile) | +| R1 Distill Llama 8B | 5.23 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Llama-8B-Q4_K_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile) | +| LLaVA 1.5 | 3.97 GB | [LLaMA 2](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) | [llava-v1.5-7b-q4.llamafile](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/llava-v1.5-7b-q4.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile) | +| Mistral-7B-Instruct v0.3 | 4.42 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mistral-7b-instruct-v0.3.Q4_0.llamafile](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile/resolve/main/Mistral-7B-Instruct-v0.3.Q4_0.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile) | +| Granite 3.2 8B Instruct | 5.25 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [granite-3.2-8b-instruct-Q4_K_M.llamafile](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile/resolve/main/granite-3.2-8b-instruct-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile) | +| Phi-3-mini-4k-instruct | 7.67 GB | [Apache 2.0](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/blob/main/LICENSE) | [Phi-3-mini-4k-instruct.F16.llamafile](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/resolve/main/Phi-3-mini-4k-instruct.F16.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile) | +| Mixtral-8x7B-Instruct | 30.03 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile) | +| OLMo-7B | 5.68 GB | [Apache 2.0](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/blob/main/LICENSE) | [OLMo-7B-0424.Q6_K.llamafile](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/resolve/main/OLMo-7B-0424.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile) | +| _Text Embedding Models_ | | | | | +| E5-Mistral-7B-Instruct | 5.16 GB | [MIT](https://choosealicense.com/licenses/mit/) | [e5-mistral-7b-instruct-Q5_K_M.llamafile](https://huggingface.co/Mozilla/e5-mistral-7b-instruct/resolve/main/e5-mistral-7b-instruct-Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/e5-mistral-7b-instruct) | +| mxbai-embed-large-v1 | 0.7 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mxbai-embed-large-v1-f16.llamafile](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile/resolve/main/mxbai-embed-large-v1-f16.llamafile?download=true) | [See HF Repo](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile) | Here is an example for the Mistral command-line llamafile: @@ -243,9 +281,9 @@ Here is an example for the Mistral command-line llamafile: And here is an example for WizardCoder-Python command-line llamafile: -```sh +````sh ./wizardcoder-python-13b.llamafile --temp 0 -e -r '```\n' 
-p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n' -``` +```` And here's an example for the LLaVA command-line llamafile: @@ -278,38 +316,38 @@ This is all accomplished by combining llama.cpp with Cosmopolitan Libc, which provides some useful capabilities: 1. llamafiles can run on multiple CPU microarchitectures. We -added runtime dispatching to llama.cpp that lets new Intel systems use -modern CPU features without trading away support for older computers. + added runtime dispatching to llama.cpp that lets new Intel systems use + modern CPU features without trading away support for older computers. 2. llamafiles can run on multiple CPU architectures. We do -that by concatenating AMD64 and ARM64 builds with a shell script that -launches the appropriate one. Our file format is compatible with WIN32 -and most UNIX shells. It's also able to be easily converted (by either -you or your users) to the platform-native format, whenever required. + that by concatenating AMD64 and ARM64 builds with a shell script that + launches the appropriate one. Our file format is compatible with WIN32 + and most UNIX shells. It's also able to be easily converted (by either + you or your users) to the platform-native format, whenever required. 3. llamafiles can run on six OSes (macOS, Windows, Linux, -FreeBSD, OpenBSD, and NetBSD). If you make your own llama files, you'll -only need to build your code once, using a Linux-style toolchain. The -GCC-based compiler we provide is itself an Actually Portable Executable, -so you can build your software for all six OSes from the comfort of -whichever one you prefer most for development. + FreeBSD, OpenBSD, and NetBSD). If you make your own llama files, you'll + only need to build your code once, using a Linux-style toolchain. The + GCC-based compiler we provide is itself an Actually Portable Executable, + so you can build your software for all six OSes from the comfort of + whichever one you prefer most for development. 4. The weights for an LLM can be embedded within the llamafile. -We added support for PKZIP to the GGML library. This lets uncompressed -weights be mapped directly into memory, similar to a self-extracting -archive. It enables quantized weights distributed online to be prefixed -with a compatible version of the llama.cpp software, thereby ensuring -its originally observed behaviors can be reproduced indefinitely. + We added support for PKZIP to the GGML library. This lets uncompressed + weights be mapped directly into memory, similar to a self-extracting + archive. It enables quantized weights distributed online to be prefixed + with a compatible version of the llama.cpp software, thereby ensuring + its originally observed behaviors can be reproduced indefinitely. 5. Finally, with the tools included in this project you can create your -*own* llamafiles, using any compatible model weights you want. You can -then distribute these llamafiles to other people, who can easily make -use of them regardless of what kind of computer they have. + _own_ llamafiles, using any compatible model weights you want. You can + then distribute these llamafiles to other people, who can easily make + use of them regardless of what kind of computer they have. ## Using llamafile with external weights Even though our example llamafiles have the weights built-in, you don't -*have* to use llamafile that way. Instead, you can download *just* the +_have_ to use llamafile that way. 
Instead, you can download _just_ the llamafile software (without any weights included) from our releases page. You can then use it alongside any external weights you may have on hand. External weights are particularly useful for Windows users because they @@ -326,7 +364,6 @@ curl -L -o mistral.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1 Windows users may need to change `./llamafile.exe` to `.\llamafile.exe` when running the above command. - ## Gotchas and troubleshooting On any platform, if your llamafile process is immediately killed, check @@ -341,13 +378,12 @@ If you use zsh and have trouble running llamafile, try saying `sh -c ./llamafile`. This is due to a bug that was fixed in zsh 5.9+. The same is the case for Python `subprocess`, old versions of Fish, etc. - #### Mac error "... cannot be opened because the developer cannot be verified" 1. Immediately launch System Settings, then go to Privacy & Security. llamafile should be listed at the bottom, with a button to Allow. -2. If not, then change your command in the Terminal to be `sudo spctl --master-disable; [llama launch command]; sudo spctl --master-enable`. This is because `--master-disable` disables _all_ checking, so you need to turn it back on after quitting llama. +2. If not, then change your command in the Terminal to be `sudo spctl --master-disable; [llama launch command]; sudo spctl --master-enable`. This is because `--master-disable` disables _all_ checking, so you need to turn it back on after quitting llama. -### Linux +### Linux On some Linux systems, you might get errors relating to `run-detectors` or WINE. This is due to `binfmt_misc` registrations. You can fix that by @@ -362,6 +398,7 @@ sudo sh -c "echo ':APE-jart:M::jartsr::/usr/bin/ape:' >/proc/sys/fs/binfmt_misc/ ``` ### Windows + As mentioned above, on Windows you may need to rename your llamafile by adding `.exe` to the filename. @@ -438,8 +475,8 @@ systems, llamafile extracts a small loader program named `ape` to `$TMPDIR/.llamafile` or `~/.ape-1.9` which is used to map your model into memory. -[1] Darwin kernel versions 15.6+ *should* be supported, but we currently - have no way of testing that. +[1] Darwin kernel versions 15.6+ _should_ be supported, but we currently +have no way of testing that. ## Supported CPUs @@ -496,7 +533,7 @@ On Linux, NVIDIA users will need to install the CUDA SDK (ideally using the shell script installer) and ROCm users need to install the HIP SDK. They're detected by looking to see if `nvcc` or `hipcc` are on the PATH. -If you have both an AMD GPU *and* an NVIDIA GPU in your machine, then +If you have both an AMD GPU _and_ an NVIDIA GPU in your machine, then you may need to qualify which one you want used, by passing either `--gpu amd` or `--gpu nvidia`. @@ -521,12 +558,12 @@ Here's an example of how to generate code for a libc function using the llama.cpp command line interface, utilizing WizardCoder-Python-13B weights: -```sh +````sh llamafile \ -m wizardcoder-python-13b-v1.0.Q8_0.gguf \ --temp 0 -r '}\n' -r '```\n' \ -e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n' -``` +```` Here's a similar example that instead utilizes Mistral-7B-Instruct weights for prose composition: @@ -680,12 +717,13 @@ commands will display that information when passing the `--help` flag. ## Running llamafile with models downloaded by third-party applications -This section answers the question *"I already have a model downloaded locally by application X, can I use it with llamafile?"*. 
The general answer is "yes, as long as those models are locally stored in GGUF format" but its implementation can be more or less hacky depending on the application. A few examples (tested on a Mac) follow. +This section answers the question _"I already have a model downloaded locally by application X, can I use it with llamafile?"_. The general answer is "yes, as long as those models are locally stored in GGUF format" but its implementation can be more or less hacky depending on the application. A few examples (tested on a Mac) follow. ### LM Studio + [LM Studio](https://lmstudio.ai/) stores downloaded models in `~/.cache/lm-studio/models`, in subdirectories with the same name of the models (following HuggingFace's `account_name/model_name` format), with the same filename you saw when you chose to download the file. - So if you have downloaded e.g. the `llama-2-7b.Q2_K.gguf` file for `TheBloke/Llama-2-7B-GGUF`, you can run llamafile as follows: +So if you have downloaded e.g. the `llama-2-7b.Q2_K.gguf` file for `TheBloke/Llama-2-7B-GGUF`, you can run llamafile as follows: ``` cd ~/.cache/lm-studio/models/TheBloke/Llama-2-7B-GGUF @@ -698,7 +736,7 @@ When you download a new model with [ollama](https://ollama.com), all its metadat The manifest maps each file related to the model (e.g. GGUF weights, license, prompt template, etc) to a sha256 digest. The digest corresponding to the element whose `mediaType` is `application/vnd.ollama.image.model` is the one referring to the model's GGUF file. -Each sha256 digest is also used as a filename in the `~/.ollama/models/blobs` directory (if you look into that directory you'll see *only* those sha256-* filenames). This means you can directly run llamafile by passing the sha256 digest as the model filename. So if e.g. the `llama3:latest` GGUF file digest is `sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29`, you can run llamafile as follows: +Each sha256 digest is also used as a filename in the `~/.ollama/models/blobs` directory (if you look into that directory you'll see _only_ those sha256-\* filenames). This means you can directly run llamafile by passing the sha256 digest as the model filename. So if e.g. the `llama3:latest` GGUF file digest is `sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29`, you can run llamafile as follows: ``` cd ~/.ollama/models/blobs @@ -847,5 +885,4 @@ should that be desired. The llamafile logo on this page was generated with the assistance of DALLĀ·E 3. - [![Star History Chart](https://api.star-history.com/svg?repos=Mozilla-Ocho/llamafile&type=Date)](https://star-history.com/#Mozilla-Ocho/llamafile&Date) diff --git a/RELEASE.md b/RELEASE.md index b7fb2634e4..c3b727fa4f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -9,24 +9,35 @@ There are a few steps in making a Llamafile release which will be detailed in th This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, enabling fine-tuning capabilities compatible with llama.cpp. 
Key features include: - **Multiple LoRA Adapter Support**: Load and apply multiple LoRA adapters simultaneously with individual scaling factors +- **Dynamic Hot-Swapping API**: Adjust LoRA adapter scales in real-time during inference without restarting the server - **Server Integration**: Full integration with the llamafile server (`--server` mode) for LoRA-enhanced inference -- **Compatible Flags**: +- **REST API Endpoints**: + - `GET /lora-adapters`: View current adapter configuration + - `POST /lora-adapters`: Update adapter scales dynamically +- **Compatible Flags**: - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with custom scaling factor - `--lora-base [FNAME]`: Optional base model for LoRA adapter (advanced use cases) - **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility +- **Thread-Safe Operations**: Hot-swapping includes proper mutex locking for concurrent access safety - **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across server lifecycle Example usage: + ```bash # Single adapter with default scale llamafile -m base_model.gguf --lora adapter.gguf --server # Multiple adapters with different scales llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server + +# Dynamic scale adjustment via API +curl -X POST http://localhost:8080/lora-adapters \ + -H "Content-Type: application/json" \ + -d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1.2}]' ``` -This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows. +This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows with real-time adaptation capabilities. The two primary artifacts of the release are the `llamafile-.zip` and the binaries for the GitHub release. @@ -36,13 +47,13 @@ Note: Step 2 and 3 are only needed if you are making a new release of the ggml-c 1. Update the version number in `version.h` 2. Build the ggml-cuda.so and ggml-rocm.so shared libraries on Linux. You need to do this for Llamafile and LocalScore. Llamafile uses TINYBLAS as a default and LocalScore uses CUBLAS as a default for CUDA. - - For Llamafile you can do this by running the script `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively. - - For LocalScore you can do this by running the script `./localscore/cuda.sh`. - - The files will be built and placed your home directory. + - For Llamafile you can do this by running the script `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively. + - For LocalScore you can do this by running the script `./localscore/cuda.sh`. + - The files will be built and placed your home directory. 3. Build the ggml-cuda.dll and ggml-rocm.dll shared libraries on Windows. You need to do this for Llamafile and LocalScore. - - You can do this by running the script `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively. - - For LocalScore you can do this by running the script `./localscore/cuda.bat`. - - The files will be built and placed in the `build/release` directory. + - You can do this by running the script `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively. + - For LocalScore you can do this by running the script `./localscore/cuda.bat`. + - The files will be built and placed in the `build/release` directory. 4. 
Build the project with `make -j8` 5. Install the built project to your /usr/local/bin directory with `sudo make install PREFIX=/usr/local` @@ -152,4 +163,4 @@ You can use the script to create the appropriately named binaries: `./llamafile/release.sh -v -s -d ` -Make sure to move the llamafile-.zip file to the as well, and you are good to release after you've tested. \ No newline at end of file +Make sure to move the llamafile-.zip file to the as well, and you are good to release after you've tested. diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp index 37e25cb4d0..17fa46e229 100644 --- a/llamafile/flags.cpp +++ b/llamafile/flags.cpp @@ -53,6 +53,7 @@ bool FLAG_tinyblas = false; bool FLAG_trace = false; bool FLAG_unsecure = false; bool FLAG_v2 = false; +bool FLAG_lora_init_without_apply = false; const char *FLAG_chat_template = ""; const char *FLAG_db = nullptr; const char *FLAG_db_startup_sql = "PRAGMA journal_mode=WAL;" @@ -66,7 +67,6 @@ const char *FLAG_prompt = nullptr; const char *FLAG_url_prefix = ""; const char *FLAG_www_root = "/zip/www"; const char *FLAG_lora = nullptr; -const char *FLAG_lora_base = nullptr; // Multiple LoRA adapters support struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS] = {0}; @@ -431,10 +431,8 @@ void llamafile_get_flags(int argc, char **argv) { continue; } - if (!strcmp(flag, "--lora-base")) { - if (i == argc) - missing("--lora-base"); - FLAG_lora_base = argv[i++]; + if (!strcmp(flag, "--lora-init-without-apply")) { + FLAG_lora_init_without_apply = true; continue; } diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h index c5fd428439..7226ec051d 100644 --- a/llamafile/llamafile.h +++ b/llamafile/llamafile.h @@ -24,6 +24,7 @@ extern bool FLAG_trace; extern bool FLAG_trap; extern bool FLAG_unsecure; extern bool FLAG_v2; +extern bool FLAG_lora_init_without_apply; extern const char *FLAG_chat_template; extern const char *FLAG_db; extern const char *FLAG_db_startup_sql; @@ -37,11 +38,11 @@ extern const char *FLAG_url_prefix; extern const char *FLAG_www_root; extern double FLAG_token_rate; extern const char *FLAG_lora; -extern const char *FLAG_lora_base; // LoRA adapter info structure to match llama.cpp struct llamafile_lora_adapter_info { const char* path; + const char* name; // Model/adapter name for identification float scale; }; diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp index e142a5a219..364348a54b 100644 --- a/llamafile/server/client.cpp +++ b/llamafile/server/client.cpp @@ -705,6 +705,8 @@ Client::dispatcher() return slotz(); if (p1 == "flagz") return flagz(); + if (p1 == "lora-adapters") + return lora_adapters(); #if 0 // TODO: implement frontend for database diff --git a/llamafile/server/client.h b/llamafile/server/client.h index b9e00da41b..74d1314e62 100644 --- a/llamafile/server/client.h +++ b/llamafile/server/client.h @@ -35,6 +35,11 @@ SlicesEqualCase(S, strlen(S), HeaderData(H), HeaderLength(H)) struct llama_model; +struct llama_lora_adapter; + +namespace jt { +struct Json; +} namespace lf { namespace server { @@ -121,6 +126,11 @@ struct Client bool slotz() __wur; bool flagz() __wur; + bool lora_adapters() __wur; + bool handle_apply_adapters(jt::Json&) __wur; + bool handle_load_adapter(jt::Json&) __wur; + bool handle_clear_adapters() __wur; + bool handle_upstream_lora_apply(jt::Json&) __wur; bool db_chat(int64_t) __wur; bool db_chats() __wur; bool db_message(int64_t) __wur; @@ -129,3 +139,15 @@ struct Client } // namespace server } // namespace lf + +// Global LoRA adapter storage - 
extern declarations (outside namespace to match definitions in prog.cpp) +#define MAX_LORA_ADAPTERS 8 +struct lora_adapter_container { + struct llama_lora_adapter* adapter; + float scale; + std::string name; // Model/adapter name for identification + bool applied; // Whether this adapter is currently applied to slots +}; + +extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +extern int g_lora_adapters_count; diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp new file mode 100644 index 0000000000..35e55198df --- /dev/null +++ b/llamafile/server/lora_adapters.cpp @@ -0,0 +1,325 @@ +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// Copyright 2024 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "client.h" +#include "llama.cpp/llama.h" +#include "llamafile/json.h" +#include "llamafile/llamafile.h" +#include "llamafile/server/log.h" +#include "llamafile/server/server.h" +#include "llamafile/server/worker.h" +#include "llamafile/server/slots.h" +#include "llamafile/server/slot.h" +#include + +using jt::Json; + +// External declarations for global LoRA adapter storage from prog.cpp (outside namespace) +// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h +extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +extern int g_lora_adapters_count; + +namespace lf { +namespace server { + +bool +Client::lora_adapters() +{ + // Support both GET and POST methods + if (msg_.method == kHttpGet) { + // GET: Return current adapter configuration (upstream llama.cpp format) + Json json; + json.setArray(); + std::vector& json_array = json.getArray(); + + for (int i = 0; i < g_lora_adapters_count; i++) { + Json adapter; + adapter.setObject(); + adapter["id"] = i; + adapter["path"] = g_lora_adapters[i].name; // Use name as path for now + adapter["scale"] = g_lora_adapters[i].scale; + json_array.push_back(adapter); + } + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, json.toString()); + + } else if (msg_.method == kHttpPost) { + // POST: Apply LoRA adapters by ID and scale (upstream llama.cpp format) + + // Validate content type + if (!HasHeader(kHttpContentType) || + !IsMimeType(HeaderData(kHttpContentType), + HeaderLength(kHttpContentType), + "application/json")) { + return send_error(400, "Content-Type must be application/json"); + } + + // Read the payload + if (!read_payload()) + return false; + + // Parse JSON payload - expecting an array of {id, scale} objects + auto [status, json] = Json::parse(std::string(payload_)); + if (status != Json::success) + return send_error(400, Json::StatusToString(status)); + if (!json.isArray()) + return send_error(400, "Request body must be an array"); + + // Apply the LoRA configuration + return handle_upstream_lora_apply(json); + + } else { + 
return send_error(405, "Method Not Allowed"); + } +} + +bool +Client::handle_apply_adapters(Json& json) +{ + // Get active slots and apply current adapters to them + if (g_lora_adapters_count == 0) { + Json response; + response["success"] = false; + response["message"] = "No adapters loaded to apply"; + + char* p = append_http_response_message(obuf_.p, 400); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Apply adapters to all slots via the server + // Note: This would require coordination with the slot management system + SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); + + Json response; + response["success"] = true; + response["message"] = "Adapters applied to active slots"; + response["adapters_applied"] = g_lora_adapters_count; + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_load_adapter(Json& json) +{ + // Load a new adapter from file + if (!json.contains("path")) { + return send_error(400, "Missing 'path' field for load operation"); + } + + std::string adapter_path = json["path"].getString(); + float scale = json.contains("scale") ? json["scale"].getNumber() : 1.0f; + + // Check if we have room for more adapters + if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { + Json response; + response["success"] = false; + response["message"] = "Maximum number of adapters already loaded"; + response["max_adapters"] = MAX_LORA_ADAPTERS; + + char* p = append_http_response_message(obuf_.p, 400); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Check if file exists + if (!std::filesystem::exists(adapter_path)) { + Json response; + response["success"] = false; + response["message"] = "Adapter file not found: " + adapter_path; + + char* p = append_http_response_message(obuf_.p, 404); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Load the adapter + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("loading LoRA adapter from %s with scale %s", adapter_path.c_str(), scale_buf); + + struct llama_lora_adapter* adapter = llama_lora_adapter_init(model_, adapter_path.c_str()); + if (!adapter) { + Json response; + response["success"] = false; + response["message"] = "Failed to load adapter from " + adapter_path; + + char* p = append_http_response_message(obuf_.p, 500); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Store the adapter + int index = g_lora_adapters_count; + g_lora_adapters[index].adapter = adapter; + g_lora_adapters[index].scale = scale; + g_lora_adapters_count++; + + SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); + + Json response; + response["success"] = true; + response["message"] = "Adapter loaded successfully"; + response["index"] = index; + response["path"] = adapter_path; + response["scale"] = scale; + response["total_adapters"] = g_lora_adapters_count; + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_clear_adapters() +{ + // Clear all loaded adapters + SLOG("clearing all %d LoRA adapter(s)", 
g_lora_adapters_count); + + for (int i = 0; i < g_lora_adapters_count; i++) { + if (g_lora_adapters[i].adapter) { + llama_lora_adapter_free(g_lora_adapters[i].adapter); + g_lora_adapters[i].adapter = nullptr; + g_lora_adapters[i].scale = 0.0f; + } + } + + int cleared_count = g_lora_adapters_count; + g_lora_adapters_count = 0; + + SLOG("cleared %d LoRA adapter(s)", cleared_count); + + Json response; + response["success"] = true; + response["message"] = "All adapters cleared"; + response["cleared_count"] = cleared_count; + response["remaining_count"] = 0; + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_upstream_lora_apply(Json& json) +{ + // Handle upstream llama.cpp LoRA API format: array of {id, scale} objects + std::vector& json_array = json.getArray(); + SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); + + // First, reset all adapter scales to 0.0 (disabled) + for (int i = 0; i < g_lora_adapters_count; i++) { + g_lora_adapters[i].applied = false; + } + + // Process each entry in the array + for (size_t i = 0; i < json_array.size(); i++) { + Json& entry = json_array[i]; + + if (!entry.isObject()) { + return send_error(400, "Each entry must be an object with 'id' and 'scale' fields"); + } + + if (!entry.contains("id") || !entry.contains("scale")) { + return send_error(400, "Each entry must have 'id' and 'scale' fields"); + } + + int id = entry["id"].getNumber(); + float scale = entry["scale"].getNumber(); + + // Validate ID range + if (id < 0 || id >= g_lora_adapters_count) { + return send_error(400, "Invalid adapter ID"); + } + + // Update the adapter configuration + g_lora_adapters[id].scale = scale; + g_lora_adapters[id].applied = (scale > 0.0f); + + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("set LoRA adapter %d ('%s') scale to %s", + id, g_lora_adapters[id].name.c_str(), scale_buf); + } + + // Re-apply LoRA adapters to all active slots with updated scales + SLOG("re-applying LoRA adapters to all active slots"); + Slots* slots = worker_->server_->slots_; + + // Lock the slots to prevent concurrent access during LoRA re-application + pthread_mutex_lock(&slots->lock_); + + for (size_t i = 0; i < slots->slots_.size(); ++i) { + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("re-applying LoRA adapters to slot #%d", slot->id_); + + // Clear existing LoRA adapters from this context + llama_lora_adapter_clear(slot->ctx_); + + // Use the same approach as slot initialization: get all adapters via the function + struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; + float scales[MAX_LORA_ADAPTERS]; + int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); + + SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); + + // Re-apply all adapters with their current scales + for (int j = 0; j < adapter_count; ++j) { + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); + SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); + if (scales[j] > 0.0f) { + if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { + SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); + } else { + SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); + } + } else { + SLOG("skipping LoRA adapter %d (scale %s <= 
0)", j, scale_buf); + } + } + } + } + + pthread_mutex_unlock(&slots->lock_); + SLOG("finished re-applying LoRA adapters to all slots"); + + // Return updated adapter configuration + Json response; + response.setArray(); + std::vector& response_array = response.getArray(); + for (int i = 0; i < g_lora_adapters_count; i++) { + Json adapter; + adapter.setObject(); + adapter["id"] = i; + adapter["path"] = g_lora_adapters[i].name; + adapter["scale"] = g_lora_adapters[i].scale; + response_array.push_back(adapter); + } + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +} // namespace server +} // namespace lf diff --git a/llamafile/server/main.1 b/llamafile/server/main.1 index e5d01adc2a..60c6d3adb7 100644 --- a/llamafile/server/main.1 +++ b/llamafile/server/main.1 @@ -29,6 +29,23 @@ recommended that you run multiple instances of llamafiler behind a reverse proxy such as NGINX or Redbean. .It Fl mm Ar FNAME , Fl Fl mmproj Ar FNAME Path of vision model weights. +.It Fl Fl lora Ar FNAME +Path to LoRA adapter weights. This flag may be repeated to load multiple +LoRA adapters. Each adapter will be applied with a default scale of 1.0. +The base model specified by +.Fl m +will be used as the foundation for all LoRA adaptations. +.It Fl Fl lora-scaled Ar FNAME Ar SCALE +Path to LoRA adapter weights with custom scaling factor. The +.Ar SCALE +parameter is a floating point number that controls the strength of the +LoRA adaptation (e.g., 0.5 for half strength, 1.5 for enhanced strength). +This flag may be repeated to load multiple scaled LoRA adapters. +.It Fl Fl lora-init-without-apply +Load LoRA adapters at startup without automatically applying them. When +this flag is used, adapters are initialized but not active until +explicitly applied via the API. This is useful for dynamic LoRA adapter +management through the HTTP endpoints. .It Fl Fl db Ar FILE Specifies path of sqlite3 database. .Pp @@ -215,6 +232,14 @@ Here's an example of how you might start this server: .Pp .Dl "llamafiler -m all-MiniLM-L6-v2.F32.gguf" .Pp +Here's how to start with a LoRA adapter: +.Pp +.Dl "llamafiler -m base_model.gguf --lora adapter.gguf" +.Pp +Here's how to use multiple LoRA adapters with custom scaling: +.Pp +.Dl "llamafiler -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.8" +.Pp Here's how to send a tokenization request: .Pp .Dl "curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world" diff --git a/llamafile/server/main.1.asc b/llamafile/server/main.1.asc index ab99e21913..b12ee187eb 100644 --- a/llamafile/server/main.1.asc +++ b/llamafile/server/main.1.asc @@ -1,269 +1,292 @@ -LLAMAFILER(1) General Commands Manual LLAMAFILER(1) - -NAME - llamafiler — fast reliable large language model server - -SYNOPSIS - llamafiler -m model.gguf [flags...] - -DESCRIPTION - llamafiler llamafiler is an HTTP server for Large Language Models - (LLMs). It includes a web GUI for both chatbot and text completion. It - can be your OpenAI API compatible embeddings / completions / chat com‐ - pletions server. It's able to more intelligently recycle context win‐ - dows across multiple slots serving multiple clients. - -OPTIONS - The following options are available: - - --version - Print version and exit. - - -h, --help - Show help message and exit. - - -m FNAME, --model FNAME - Path of GGUF model weights. Each server process is currently - limited to serving only one model. 
If you need to host multiple - models, then it's recommended that you run multiple instances - of llamafiler behind a reverse proxy such as NGINX or Redbean. - - -mm FNAME, --mmproj FNAME - Path of vision model weights. - - --db FILE - Specifies path of sqlite3 database. - - The default is ~/.llamafile/llamafile.sqlite3 - - -ngl N, --gpu-layers N, --n-gpu-layers N - Specifies number of layers to offload to GPU. - - This flag must be passed in order to use GPU on systems with - NVIDIA or AMD GPUs. If you're confident that you have enough - VRAM, then you can pass -ngl 999 to enable full offloading, - since this number is automatically downtuned to however many - number of layers the model has. If VRAM is limited, then the - --verbose flag may be passed to learn how many layers the model - has, e.g. 35, which can then be down-tuned until the out of - memory error goes away. - - On Apple Silicon systems with Metal, GPU offloading is enabled - by default. Since these GPUs use unified memory, they're - treated as having a single layer; therefore, using values - higher than 1 will be treated as 1. You can pass -ngl 0 to dis‐ - able GPU offloading and run in CPU mode on Apple Metal systems. - - -l HOSTPORT, --listen HOSTPORT - Specifies the local [HOST:]PORT on which the HTTP server should - listen. By default this is 0.0.0.0:8080 which means llamafiler - will bind to port 8080 on every locally available IPv4 network - interface. This option may currently only be specified once. - - -c TOKENS, --ctx-size TOKENS - Specifies context size. This specifies how long a completion - can get before it runs out of space. It defaults to 8k which - means 8192 tokens. Many models support a larger context size, - like 128k, but that'll need much more RAM or VRAM per slot. If - this value is larger than the trained context size of the - model, it'll be tuned down to the maximum. If this value is 0 - or negative, the maximum number of tokens will be used. - - -s COUNT, --slots COUNT - Specifies how many slots to maintain. This defaults to 1. Slots - are used by chat completions requests. When such a request - comes in, the client needs to take control of a slot. When the - completion is finished, the slot is relinquished back to the - server. HTTP clients will wait for a slot to be relinquished if - none are available. Tuning this parameter to nicely fit avail‐ - able RAM or VRAM can help you manage your server resources, and - control how much completion parallelism can happen. Please - note that --ctx-size has a strong influence on how many slots - can be created. - - --decay-delay INT - Number of seconds a context window slot needs to be inactive - before the system starts to strongly consider giving it to - other clients. The default is 300 which is five minutes. - - --decay-growth FLOAT - Sets slot decay growth factor. Context window slots are as‐ - signed in a least recently used fashion, based on the formula - age + e sup {growth * (age - delay)} - - -p TEXT, --prompt TEXT, --system-prompt TEXT - Specifies system prompt. This value is passed along to the web - frontend. - - --no-display-prompt - Hide system prompt from web user interface. - - --nologo - Hide llamafile logo icon from web ui. - - --url-prefix URLPREFIX - Specifies a URL prefix (subdirectory) under which the HTTP - server will make the API accessible, e.g. /lamafiler. Useful - when running llamafiler behind a reverse proxy such as NGINX or - Redbean. By default, this is set to / (root). 
- - --verbose - Enable logging of diagnostic information. This flag is useful - for learning more about the model and hardware. It can also be - helpful for troubleshooting errors. We currently recommend that - this flag be avoided in production since the llama.cpp logger - may disrupt thread cancelation. - - -w N, --workers N - Number of HTTP client handling threads. - - --trust CIDR - Adds a network to the trusted network list. This argument is - specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By - default, all clients are untrusted, which means they're subject - to token bucket throttling, and additional security precautions - that may cause request handling to go slightly slower. There‐ - fore this flag is important to use if you want to accurately - benchmark llamafiler, since the server will otherwise see the - benchmark as a DDOS and deprioritize its traffic accordingly. - - --ip-header STR - If this flag is passed a value, e.g. X-Forwarded-For, then any - trusted may send this header to your llamafile server to let it - know what the true effective client IPv4 address actually is. - After this happens the default security restrictions, e.g. to‐ - ken bucket, will be measured and applied against that IPv4 ad‐ - dress and its adjacent networks. - - --token-rate N - Specifies how many times per second a token is dropped in each - bucket. This setting is used to define a limitation on how - many TCP connects and HTTP messages each chunk of the IPv4 ad‐ - dress space is permitted to send to llamafiler over a sustained - period of time. The default token rate is 1, which means that, - on a long enough timeline, a class-C network will be depriori‐ - tized if it sends more than one request per second. No real - penalty actually applies though until the server runs out of - resources, e.g. HTTP request workers. - - --token-burst N - Specifies how many HTTP requests and TCP connects a given slice - of the IPv4 address space is permitted to send within a short - period of time, before token bucket restrictions kick in, and - cause the client to be deprioritized. By default, this value is - set to 100. It may be tuned to any value between 1 and 127 in‐ - clusive. - - --token-cidr N - Specifies IPv4 address space granularity of token bucket algo‐ - rithm, in network bits. By default, this value is set to 24 - which means individual IPv4 addresses are viewed as being rep‐ - resentative members of a class-C network, or in other words, - each group of 256 IPv4 addresses is lumped together. If one IP - in the group does something bad, then bad things happen to all - the other IPv4 addresses in that granule. This number may be - set to any integer between 3 and 32 inclusive. Specifying a - higher number will trade away system memory to increase network - specificity. For example, using 32 means that 4 billion indi‐ - vidual token buckets will be created. By default, a background - thread drops one token in each bucket every second, so that - could potentially be a lot of busy work. A value of three means - that everyone on the Internet who talks to your server will - have to fight over only eight token buckets in total. - - --unsecure - Disables sandboxing. By default, llamafiler puts itself in a - SECCOMP BPF sandbox, so that even if your server gets hacked in - the worst possible way (some kind of C++ memory bug) then - there's very little damage an attacker will be able to do. 
This - works by restricting system calls using Cosmopolitan Libc's im‐ - plementation of pledge() which is currently only supported on - Linux (other OSes will simply be unsecured by default). The - pledge security policy that's used by default is "stdio anet" - which means that only relatively harmless system calls like - read(), write(), and accept() are allowed once the server has - finished initializing. It's not possible for remotely executed - code to do things like launch subprocesses, read or write to - the filesystem, or initiate a new connection to a server. - - -k N, --keepalive N - Specifies the TCP keepalive interval in seconds. This value is - passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're - supported by the host operating system. If this value is - greater than 0, then the the SO_KEEPALIVE and TCP_NODELAY op‐ - tions are enabled on network sockets, if supported by the host - operating system. The default keepalive is 5. - - --http-obuf-size N - Size of HTTP output buffer size, in bytes. Default is 1048576. - - --http-ibuf-size N - Size of HTTP input buffer size, in bytes. Default is 1048576. - - --chat-template NAME - Specifies or overrides chat template for model. - - Normally the GGUF metadata tokenizer.chat_template will specify - this value for instruct models. This flag may be used to either - override the chat template, or specify one when the GGUF meta‐ - data field is absent, which effectively forces the web ui to - enable chatbot mode. - - Supported chat template names are: chatml, llama2, llama3, mis‐ - tral (alias for llama2), phi3, zephyr, monarch, gemma, gemma2 - (alias for gemma), orion, openchat, vicuna, vicuna-orca, - deepseek, command-r, chatglm3, chatglm4, minicpm, deepseek2, or - exaone3. - - It is also possible to pass the jinja2 template itself to this - argument. Since llamafiler doesn't currently support jinja2, a - heuristic will be used to guess which of the above templates - the template represents. - - --completion-mode - Forces web ui to operate in completion mode, rather than chat - mode. Normally the web ui chooses its mode based on the GGUF - metadata. Base models normally don't define tokenizer.chat_tem‐ - plate whereas instruct models do. If it's a base model, then - the web ui will automatically use completion mode only, without - needing to specify this flag. This flag is useful in cases - where a prompt template is defined by the gguf, but it is de‐ - sirable for the chat interface to be disabled. - - --db-startup-sql CODE - Specifies SQL code that should be executed whenever connecting - to the SQLite database. The default is the following code, - which enables the write-ahead log. - - PRAGMA journal_mode=WAL; - PRAGMA synchronous=NORMAL; - - --reserve-tokens N - Percent of context window to reserve for predicted tokens. When - the server runs out of context window, old chat messages will - be forgotten until this percent of the context is empty. The - default is 15%. If this is specified as a floating point num‐ - ber, e.g. 0.15, then it'll be multiplied by 100 to get the per‐ - cent. - -EXAMPLES - Here's an example of how you might start this server: - - llamafiler -m all-MiniLM-L6-v2.F32.gguf - - Here's how to send a tokenization request: - - curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world - - Here's how to send a embedding request: - - curl -v http://127.0.0.1:8080/embedding?content=hello+world - -DOCUMENTATION - Read our Markdown documentation for additional help and tutorials. 
See - llamafile/server/doc/index.md in the source repository on GitHub. - -SEE ALSO - llamafile(1), whisperfile(1) - -Mozilla Ocho November 30, 2024 LLAMAFILER(1) +LLAMAFILER(1) General Commands Manual LLAMAFILER(1) + +NNAAMMEE + llllaammaaffiilleerr – fast reliable large language model server + +SSYYNNOOPPSSIISS + llllaammaaffiilleerr --mm _m_o_d_e_l_._g_g_u_f [flags...] + +DDEESSCCRRIIPPTTIIOONN + llllaammaaffiilleerr llamafiler is an HTTP server for Large Language Models (LLMs). + It includes a web GUI for both chatbot and text completion. It can be your + OpenAI API compatible embeddings / completions / chat completions server. + It's able to more intelligently recycle context windows across multiple + slots serving multiple clients. + +OOPPTTIIOONNSS + The following options are available: + + ----vveerrssiioonn + Print version and exit. + + --hh, ----hheellpp + Show help message and exit. + + --mm _F_N_A_M_E, ----mmooddeell _F_N_A_M_E + Path of GGUF model weights. Each server process is currently + limited to serving only one model. If you need to host multiple + models, then it's recommended that you run multiple instances of + llamafiler behind a reverse proxy such as NGINX or Redbean. + + --mmmm _F_N_A_M_E, ----mmmmpprroojj _F_N_A_M_E + Path of vision model weights. + + ----lloorraa _F_N_A_M_E + Path to LoRA adapter weights. This flag may be repeated to load + multiple LoRA adapters. Each adapter will be applied with a default + scale of 1.0. The base model specified by --mm will be used as the + foundation for all LoRA adaptations. + + ----lloorraa--ssccaalleedd _F_N_A_M_E _S_C_A_L_E + Path to LoRA adapter weights with custom scaling factor. The _S_C_A_L_E + parameter is a floating point number that controls the strength of + the LoRA adaptation (e.g., 0.5 for half strength, 1.5 for enhanced + strength). This flag may be repeated to load multiple scaled LoRA + adapters. + + ----lloorraa--iinniitt--wwiitthhoouutt--aappppllyy + Load LoRA adapters at startup without automatically applying them. + When this flag is used, adapters are initialized but not active + until explicitly applied via the API. This is useful for dynamic + LoRA adapter management through the HTTP endpoints. + + ----ddbb _F_I_L_E + Specifies path of sqlite3 database. + + The default is _~_/_._l_l_a_m_a_f_i_l_e_/_l_l_a_m_a_f_i_l_e_._s_q_l_i_t_e_3 + + --nnggll _N, ----ggppuu--llaayyeerrss _N, ----nn--ggppuu--llaayyeerrss _N + Specifies number of layers to offload to GPU. + + This flag must be passed in order to use GPU on systems with NVIDIA + or AMD GPUs. If you're confident that you have enough VRAM, then + you can pass --nnggll _9_9_9 to enable full offloading, since this number + is automatically downtuned to however many number of layers the + model has. If VRAM is limited, then the ----vveerrbboossee flag may be + passed to learn how many layers the model has, e.g. 35, which can + then be down-tuned until the out of memory error goes away. + + On Apple Silicon systems with Metal, GPU offloading is enabled by + default. Since these GPUs use unified memory, they're treated as + having a single layer; therefore, using values higher than 1 will + be treated as 1. You can pass --nnggll _0 to disable GPU offloading and + run in CPU mode on Apple Metal systems. + + --ll _H_O_S_T_P_O_R_T, ----lliisstteenn _H_O_S_T_P_O_R_T + Specifies the local [HOST:]PORT on which the HTTP server should + listen. 
By default this is 0.0.0.0:8080 which means llamafiler + will bind to port 8080 on every locally available IPv4 network + interface. This option may currently only be specified once. + + --cc _T_O_K_E_N_S, ----ccttxx--ssiizzee _T_O_K_E_N_S + Specifies context size. This specifies how long a completion can + get before it runs out of space. It defaults to 8k which means 8192 + tokens. Many models support a larger context size, like 128k, but + that'll need much more RAM or VRAM per slot. If this value is + larger than the trained context size of the model, it'll be tuned + down to the maximum. If this value is 0 or negative, the maximum + number of tokens will be used. + + --ss _C_O_U_N_T, ----sslloottss _C_O_U_N_T + Specifies how many slots to maintain. This defaults to 1. Slots are + used by chat completions requests. When such a request comes in, + the client needs to take control of a slot. When the completion is + finished, the slot is relinquished back to the server. HTTP clients + will wait for a slot to be relinquished if none are available. + Tuning this parameter to nicely fit available RAM or VRAM can help + you manage your server resources, and control how much completion + parallelism can happen. Please note that ----ccttxx--ssiizzee has a strong + influence on how many slots can be created. + + ----ddeeccaayy--ddeellaayy _I_N_T + Number of seconds a context window slot needs to be inactive before + the system starts to strongly consider giving it to other clients. + The default is 300 which is five minutes. + + ----ddeeccaayy--ggrroowwtthh _F_L_O_A_T + Sets slot decay growth factor. Context window slots are assigned in + a least recently used fashion, based on the formula _a_g_e + _e^(_g_r_o_w_t_h + * (_a_g_e āˆ’ _d_e_l_a_y)) + + --pp _T_E_X_T, ----pprroommpptt _T_E_X_T, ----ssyysstteemm--pprroommpptt _T_E_X_T + Specifies system prompt. This value is passed along to the web + frontend. + + ----nnoo--ddiissppllaayy--pprroommpptt + Hide system prompt from web user interface. + + ----nnoollooggoo + Hide llamafile logo icon from web ui. + + ----uurrll--pprreeffiixx _U_R_L_P_R_E_F_I_X + Specifies a URL prefix (subdirectory) under which the HTTP server + will make the API accessible, e.g. /lamafiler. Useful when running + llamafiler behind a reverse proxy such as NGINX or Redbean. By + default, this is set to / (root). + + ----vveerrbboossee + Enable logging of diagnostic information. This flag is useful for + learning more about the model and hardware. It can also be helpful + for troubleshooting errors. We currently recommend that this flag + be avoided in production since the llama.cpp logger may disrupt + thread cancelation. + + --ww _N, ----wwoorrkkeerrss _N + Number of HTTP client handling threads. + + ----ttrruusstt _C_I_D_R + Adds a network to the trusted network list. This argument is + specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By + default, all clients are untrusted, which means they're subject to + token bucket throttling, and additional security precautions that + may cause request handling to go slightly slower. Therefore this + flag is important to use if you want to accurately benchmark + llamafiler, since the server will otherwise see the benchmark as a + DDOS and deprioritize its traffic accordingly. + + ----iipp--hheeaaddeerr _S_T_R + If this flag is passed a value, e.g. X-Forwarded-For, then any + trusted may send this header to your llamafile server to let it + know what the true effective client IPv4 address actually is. 
After + this happens the default security restrictions, e.g. token bucket, + will be measured and applied against that IPv4 address and its + adjacent networks. + + ----ttookkeenn--rraattee _N + Specifies how many times per second a token is dropped in each + bucket. This setting is used to define a limitation on how many + TCP connects and HTTP messages each chunk of the IPv4 address space + is permitted to send to llamafiler over a sustained period of time. + The default token rate is 1, which means that, on a long enough + timeline, a class-C network will be deprioritized if it sends more + than one request per second. No real penalty actually applies + though until the server runs out of resources, e.g. HTTP request + workers. + + ----ttookkeenn--bbuurrsstt _N + Specifies how many HTTP requests and TCP connects a given slice of + the IPv4 address space is permitted to send within a short period + of time, before token bucket restrictions kick in, and cause the + client to be deprioritized. By default, this value is set to 100. + It may be tuned to any value between 1 and 127 inclusive. + + ----ttookkeenn--cciiddrr _N + Specifies IPv4 address space granularity of token bucket algorithm, + in network bits. By default, this value is set to 24 which means + individual IPv4 addresses are viewed as being representative + members of a class-C network, or in other words, each group of 256 + IPv4 addresses is lumped together. If one IP in the group does + something bad, then bad things happen to all the other IPv4 + addresses in that granule. This number may be set to any integer + between 3 and 32 inclusive. Specifying a higher number will trade + away system memory to increase network specificity. For example, + using 32 means that 4 billion individual token buckets will be + created. By default, a background thread drops one token in each + bucket every second, so that could potentially be a lot of busy + work. A value of three means that everyone on the Internet who + talks to your server will have to fight over only eight token + buckets in total. + + ----uunnsseeccuurree + Disables sandboxing. By default, llamafiler puts itself in a + SECCOMP BPF sandbox, so that even if your server gets hacked in the + worst possible way (some kind of C++ memory bug) then there's very + little damage an attacker will be able to do. This works by + restricting system calls using Cosmopolitan Libc's implementation + of pledge() which is currently only supported on Linux (other OSes + will simply be unsecured by default). The pledge security policy + that's used by default is "stdio anet" which means that only + relatively harmless system calls like read(), write(), and accept() + are allowed once the server has finished initializing. It's not + possible for remotely executed code to do things like launch + subprocesses, read or write to the filesystem, or initiate a new + connection to a server. + + --kk _N, ----kkeeeeppaalliivvee _N + Specifies the TCP keepalive interval in seconds. This value is + passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're + supported by the host operating system. If this value is greater + than 0, then the the SO_KEEPALIVE and TCP_NODELAY options are + enabled on network sockets, if supported by the host operating + system. The default keepalive is 5. + + ----hhttttpp--oobbuuff--ssiizzee _N + Size of HTTP output buffer size, in bytes. Default is 1048576. + + ----hhttttpp--iibbuuff--ssiizzee _N + Size of HTTP input buffer size, in bytes. Default is 1048576. 
+ + ----cchhaatt--tteemmppllaattee _N_A_M_E + Specifies or overrides chat template for model. + + Normally the GGUF metadata tokenizer.chat_template will specify + this value for instruct models. This flag may be used to either + override the chat template, or specify one when the GGUF metadata + field is absent, which effectively forces the web ui to enable + chatbot mode. + + Supported chat template names are: chatml, llama2, llama3, mistral + (alias for llama2), phi3, zephyr, monarch, gemma, gemma2 (alias for + gemma), orion, openchat, vicuna, vicuna-orca, deepseek, command-r, + chatglm3, chatglm4, minicpm, deepseek2, or exaone3. + + It is also possible to pass the jinja2 template itself to this + argument. Since llamafiler doesn't currently support jinja2, a + heuristic will be used to guess which of the above templates the + template represents. + + ----ccoommpplleettiioonn--mmooddee + Forces web ui to operate in completion mode, rather than chat mode. + Normally the web ui chooses its mode based on the GGUF metadata. + Base models normally don't define tokenizer.chat_template whereas + instruct models do. If it's a base model, then the web ui will + automatically use completion mode only, without needing to specify + this flag. This flag is useful in cases where a prompt template is + defined by the gguf, but it is desirable for the chat interface to + be disabled. + + ----ddbb--ssttaarrttuupp--ssqqll _C_O_D_E + Specifies SQL code that should be executed whenever connecting to + the SQLite database. The default is the following code, which + enables the write-ahead log. + + PRAGMA journal_mode=WAL; + PRAGMA synchronous=NORMAL; + + ----rreesseerrvvee--ttookkeennss _N + Percent of context window to reserve for predicted tokens. When the + server runs out of context window, old chat messages will be + forgotten until this percent of the context is empty. The default + is 15%. If this is specified as a floating point number, e.g. 0.15, + then it'll be multiplied by 100 to get the percent. + +EEXXAAMMPPLLEESS + Here's an example of how you might start this server: + + llamafiler -m all-MiniLM-L6-v2.F32.gguf + + Here's how to start with a LoRA adapter: + + llamafiler -m base_model.gguf --lora adapter.gguf + + Here's how to use multiple LoRA adapters with custom scaling: + + llamafiler -m base_model.gguf --lora adapter1.gguf --lora-scaled + adapter2.gguf 0.8 + + Here's how to send a tokenization request: + + curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world + + Here's how to send a embedding request: + + curl -v http://127.0.0.1:8080/embedding?content=hello+world + +DDOOCCUUMMEENNTTAATTIIOONN + Read our Markdown documentation for additional help and tutorials. See + llamafile/server/doc/index.md in the source repository on GitHub. 
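Building on the EXAMPLES above, the new handlers in llamafile/server/lora_adapters.cpp also expose runtime (hot-swap) control over adapter scales in the upstream llama.cpp format, an array of {id, scale} objects. The following is a rough usage sketch, assuming the handlers are routed at /lora-adapters (as the "use /lora-adapters API to apply" startup log message suggests) and that the server listens on the default port 8080; the exact route registration is not shown in this excerpt.

```bash
# List adapters loaded at startup; the server replies with an array
# of {id, path, scale} objects built from the global adapter table.
curl -s http://127.0.0.1:8080/lora-adapters

# Hot-swap: set adapter 0 to half strength and zero out adapter 1.
# The handler validates each id against the loaded adapters, updates
# the scales, re-applies adapters to every active slot, and returns
# the updated configuration as JSON.
curl -s -X POST http://127.0.0.1:8080/lora-adapters \
  -H "Content-Type: application/json" \
  -d '[{"id": 0, "scale": 0.5}, {"id": 1, "scale": 0.0}]'
```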
+ +SSEEEE AALLSSOO + llamafile(1), whisperfile(1) + +Mozilla Ocho November 30, 2024 Mozilla Ocho diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp index 6377cabc78..a21c809614 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -31,13 +31,17 @@ // Global LoRA adapter storage for multiple adapters #define MAX_LORA_ADAPTERS 8 +#include struct lora_adapter_container { struct llama_lora_adapter* adapter; float scale; + std::string name; // Model/adapter name for identification + bool applied; // Whether this adapter is currently applied to slots }; -static struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {0}; -static int g_lora_adapters_count = 0; +// Make these externally accessible for HTTP endpoint +struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; +int g_lora_adapters_count = 0; // Function to get the first global LoRA adapter for backward compatibility extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { @@ -118,16 +122,35 @@ main(int argc, char* argv[]) // load LoRA adapters if specified if (FLAG_lora_adapters_count > 0) { - SLOG("loading %d LoRA adapter(s)", FLAG_lora_adapters_count); + const char* apply_mode = FLAG_lora_init_without_apply ? "without applying" : "and applying"; + SLOG("loading %d LoRA adapter(s) %s", FLAG_lora_adapters_count, apply_mode); + for (int i = 0; i < FLAG_lora_adapters_count; i++) { char scale_buf[32]; snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); - SLOG("loading LoRA adapter %d from %s with scale %s", i + 1, - FLAG_lora_adapters[i].path, scale_buf); - g_lora_adapters[i].adapter = llama_lora_adapter_init(model, FLAG_lora_adapters[i].path); + + // Generate model name from filename + const char* path = FLAG_lora_adapters[i].path; + const char* filename = strrchr(path, '/'); + filename = filename ? 
filename + 1 : path; + + // Remove file extension for cleaner name + std::string model_name(filename); + size_t dot_pos = model_name.find_last_of('.'); + if (dot_pos != std::string::npos) { + model_name = model_name.substr(0, dot_pos); + } + + SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, + model_name.c_str(), path, scale_buf); + + g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; + g_lora_adapters[i].name = model_name; + g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set + if (!g_lora_adapters[i].adapter) { - fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, FLAG_lora_adapters[i].path); + fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); // Cleanup previously loaded adapters for (int j = 0; j < i; j++) { if (g_lora_adapters[j].adapter) { @@ -139,7 +162,12 @@ main(int argc, char* argv[]) } g_lora_adapters_count++; } - SLOG("all LoRA adapters loaded successfully"); + + if (FLAG_lora_init_without_apply) { + SLOG("all LoRA adapters loaded successfully but not applied (use /lora-adapters API to apply)"); + } else { + SLOG("all LoRA adapters loaded and applied successfully"); + } } // create slots From 78a3b7632201995565340a91580039e9d4b86e6b Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:31:42 -0400 Subject: [PATCH 4/9] hk --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index c3b727fa4f..9a1519948c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -17,7 +17,7 @@ This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, - **Compatible Flags**: - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with custom scaling factor - - `--lora-base [FNAME]`: Optional base model for LoRA adapter (advanced use cases) + - `--lora-init-without-apply`: Load LoRA adapters without applying (lora hot-swapping) - **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility - **Thread-Safe Operations**: Hot-swapping includes proper mutex locking for concurrent access safety - **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across server lifecycle From 95f6887e3f62d329bb9b8254f23e014b76205cfd Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:40:23 -0400 Subject: [PATCH 5/9] moar hk --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c6cfb127f..2b56dcf7a0 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. - `--lora [FNAME]`: Apply a LoRA adapter from the specified file (default scale: 1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor -- `--lora-base [FNAME]`: Optional base model for LoRA adapter (usually not needed) +- `--lora-init-without-apply [FNAME]`: Load LoRA adapters without applying (lora hot-swapping) ### Dynamic LoRA Adapter Management (Hot-Swapping) From 069024d862e46cd01f9df8c2b3b76523a5f19fc1 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Fri, 8 Aug 2025 19:41:08 -0400 Subject: [PATCH 6/9] hk... 
sorry --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b56dcf7a0..259b462ecb 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. - `--lora [FNAME]`: Apply a LoRA adapter from the specified file (default scale: 1.0) - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor -- `--lora-init-without-apply [FNAME]`: Load LoRA adapters without applying (lora hot-swapping) +- `--lora-init-without-apply`: Load LoRA adapters without applying (lora hot-swapping) ### Dynamic LoRA Adapter Management (Hot-Swapping) From 09632b7be9456993982dc75457917fab1efe97b9 Mon Sep 17 00:00:00 2001 From: Logan Powell Date: Wed, 13 Aug 2025 14:51:18 -0400 Subject: [PATCH 7/9] fix: add intelligent slot refresh for LoRA adapter updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Removes redundant code by deferring to llama.cpp for lora structures - Add Slot::mark_for_refresh() to flag slots for context refresh after LoRA changes - Integrate needs_refresh_ flag and logic into Slot class and prefill() method - Update LoRA adapter API handlers to call mark_for_refresh() after applying or updating adapters - Ensure system prompts and context are preserved using slot’s intelligent prefill mechanism - Remove naive KV cache clearing logic in favor of slot-managed refresh - Improves runtime LoRA scale update reliability --- .vscode/c_cpp_properties.json | 60 ++++ .vscode/launch.json | 43 +++ .vscode/settings.json | 38 +++ .vscode/tasks.json | 175 ++++++++++ diff.txt | 512 +++++++++++++++++++++++++++++ llamafile/server/client.h | 12 +- llamafile/server/lora_adapters.cpp | 139 ++++---- llamafile/server/prog.cpp | 57 ++-- llamafile/server/slot.cpp | 49 +-- llamafile/server/slot.h | 3 +- 10 files changed, 944 insertions(+), 144 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 diff.txt diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000000..d9cbe110f5 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,60 @@ +{ + // Simplified IntelliSense config without compile_commands.json + // Uses broad include/define coverage; may be less exact per-file. + "env": { + "cosmoccPath": "${workspaceFolder}/.cosmocc" + }, + "configurations": [ + { + "name": "Cosmopolitan", + "compilerPath": "${cosmoccPath}/3.9.7/bin/cosmocc", + "intelliSenseMode": "clang-x64", + "cppStandard": "gnu++23", + "cStandard": "gnu11", + "defines": [ + "GGML_MULTIPLATFORM", + "LLAMAFILE_DEBUG", + "_LIBCPP_HAS_NO_XLOCALE", + "_LIBCPP_HAS_MUSL_LIBC" + ], + "compilerArgs": [ + "-j8" + // "-std=gnu++23", + // "-Wall", + // "-Wextra", + // Force C++ mode and reassert libc++ include even if driver fallback fails + // "-nostdinc++", + // "-I${cosmoccPath}/3.9.7/include/c++/v1" + ], + "includePath": [ + "${workspaceFolder}", + "${workspaceFolder}/llamafile", + "${workspaceFolder}/llama.cpp", + "${workspaceFolder}/whisper.cpp", + "${workspaceFolder}/stable-diffusion.cpp", + "${workspaceFolder}/localscore", + "${workspaceFolder}/third_party", + "${cosmoccPath}/include", + "${cosmoccPath}/3.9.7/include", + "${cosmoccPath}/3.9.7/include/c++/v1" + ], + "forcedInclude": [ + // Normalizes some Cosmopolitan integral typedefs early. 
+ "${cosmoccPath}/include/libc/integral/normalize.inc" + ], + "browse": { + "path": [ + "${workspaceFolder}", + "${workspaceFolder}/llamafile", + "${workspaceFolder}/llama.cpp", + "${workspaceFolder}/whisper.cpp", + "${workspaceFolder}/stable-diffusion.cpp", + "${workspaceFolder}/localscore", + "${workspaceFolder}/third_party" + ], + "limitSymbolsToIncludedHeaders": false + } + } + ], + "version": 4 +} diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000..c50ac4842c --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,43 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Server (llamafiler)", + "type": "cppdbg", + "request": "launch", + "MIMode": "lldb", + "program": "${workspaceFolder}/o/opt/llamafile/server/main", + "args": [], + "preLaunchTask": "Build (fast)", + "stopAtEntry": false + }, + { + "name": "Debug Llama CLI (llamafile)", + "type": "cppdbg", + "request": "launch", + "MIMode": "lldb", + "program": "${workspaceFolder}/o/opt/llama.cpp/main/main", + "args": [], + "preLaunchTask": "Build (fast)", + "stopAtEntry": false + }, + { + "name": "Debug Quantize Tool", + "type": "cppdbg", + "request": "launch", + "MIMode": "lldb", + "program": "${workspaceFolder}/o/opt/llama.cpp/quantize/quantize", + "args": [], + "preLaunchTask": "Build (fast)", + "stopAtEntry": false + }, + { + "name": "Attach to PID", + "type": "cppdbg", + "request": "attach", + "MIMode": "lldb", + "processId": "${command:pickProcess}", + "program": "${workspaceFolder}/o/opt/llamafile/server/main" + } + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..0948ad7be5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,38 @@ +{ + // Core C/C++ extension behavior + "C_Cpp.default.configurationProvider": "ms-vscode.cpptools", // use c_cpp_properties.json + "C_Cpp.intelliSenseEngine": "default", + "C_Cpp.errorSquiggles": "disabled", + "C_Cpp.autoAddFileAssociations": false, + "C_Cpp.default.browse.limitSymbolsToIncludedHeaders": false, + "C_Cpp.workspaceParsingPriority": "highest", + "C_Cpp.loggingLevel": "Warning", + // Speed: avoid re-indexing node_modules or build/artifacts if present + "files.watcherExclude": { + "**/o/**": true, + "**/.git/**": true, + "**/.cosmocc/**": true + }, + "search.exclude": { + "o": true, + "**/o/**": true + }, + // Formatting / style (adjust to project preference) + "editor.formatOnSave": false, + "C_Cpp.formatting": "disabled", + // Diagnostics tuning: treat missing headers as warnings (since we simplified config) + "C_Cpp.codeAnalysis.clangTidy.enabled": false, + "C_Cpp.intelliSenseCacheSize": 512, + // Optional UI niceties + "C_Cpp.enhancedColorization": "enabled", + "C_Cpp.dimInactiveRegions": true, + // File associations + "files.associations": { + "*.cpp": "cpp", + "*.c": "c", + "*.h": "c", + "*.hpp": "cpp", + "*.mk": "makefile", + "BUILD.mk": "makefile" + } +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000000..894f7a924f --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,175 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Build (fast)", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "problemMatcher": [ + "$gcc" + ], + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Rebuild (clean + all)", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + 
"clean", + "&&", + "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "-j8" + ], + "problemMatcher": [ + "$gcc" + ], + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Build vmathf_test", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8", + "o/opt/llamafile/vmathf_test" + ], + "problemMatcher": [ + "$gcc" + ], + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Run vmathf_test", + "type": "shell", + "command": "o/opt/llamafile/vmathf_test", + "dependsOn": "Build vmathf_test", + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Clean", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "clean" + ], + "group": "build", + "presentation": { + "reveal": "always", + "panel": "shared" + } + }, + { + "label": "Watch (incremental)", + "type": "shell", + "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "isBackground": true, + "problemMatcher": [ + { + "owner": "cpp", + "pattern": { + "regexp": "^(.*):(\\d+):(\\d+): (warning|error): (.*)$", + "file": 1, + "line": 2, + "column": 3, + "severity": 4, + "message": 5 + }, + "background": { + "activeOnStart": true, + "beginsPattern": "^.*Building.*$", + "endsPattern": "^.*(error|warning|linking).*$" + } + } + ], + "presentation": { + "reveal": "never", + "panel": "dedicated" + } + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + }, + { + "label": "Build (fast)", + "type": "shell", + "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", + "args": [ + "-j8" + ], + "group": "build" + } + ] +} \ No newline at end of file diff --git a/diff.txt b/diff.txt new file mode 100644 index 0000000000..c6be66ce7e --- /dev/null +++ b/diff.txt @@ -0,0 +1,512 @@ +diff --git a/llamafile/server/client.h b/llamafile/server/client.h +index 74d1314e6..f82eed422 100644 +--- a/llamafile/server/client.h ++++ b/llamafile/server/client.h +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include "llama.cpp/common.h" + + #define HasHeader(H) (!!msg_.headers[H].a) + #define HeaderData(H) (ibuf_.p + msg_.headers[H].a) +@@ -141,13 +142,4 @@ struct Client + } // namespace lf + + // Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp) +-#define MAX_LORA_ADAPTERS 8 +-struct lora_adapter_container { +- struct llama_lora_adapter* adapter; +- float scale; +- std::string name; 
// Model/adapter name for identification +- bool applied; // Whether this adapter is currently applied to slots +-}; +- +-extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +-extern int g_lora_adapters_count; ++// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead +diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp +index 35e55198d..c048a2a4e 100644 +--- a/llamafile/server/lora_adapters.cpp ++++ b/llamafile/server/lora_adapters.cpp +@@ -17,6 +17,7 @@ + + #include "client.h" + #include "llama.cpp/llama.h" ++#include "llama.cpp/common.h" + #include "llamafile/json.h" + #include "llamafile/llamafile.h" + #include "llamafile/server/log.h" +@@ -29,9 +30,7 @@ + using jt::Json; + + // External declarations for global LoRA adapter storage from prog.cpp (outside namespace) +-// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h +-extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; +-extern int g_lora_adapters_count; ++extern std::vector g_lora_adapters; + + namespace lf { + namespace server { +@@ -46,12 +45,12 @@ Client::lora_adapters() + json.setArray(); + std::vector& json_array = json.getArray(); + +- for (int i = 0; i < g_lora_adapters_count; i++) { ++ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { + Json adapter; + adapter.setObject(); +- adapter["id"] = i; +- adapter["path"] = g_lora_adapters[i].name; // Use name as path for now +- adapter["scale"] = g_lora_adapters[i].scale; ++ adapter["id"] = (int)i; ++ adapter["path"] = ::g_lora_adapters[i].path; ++ adapter["scale"] = ::g_lora_adapters[i].scale; + json_array.push_back(adapter); + } + +@@ -93,7 +92,7 @@ bool + Client::handle_apply_adapters(Json& json) + { + // Get active slots and apply current adapters to them +- if (g_lora_adapters_count == 0) { ++ if (::g_lora_adapters.empty()) { + Json response; + response["success"] = false; + response["message"] = "No adapters loaded to apply"; +@@ -103,14 +102,34 @@ Client::handle_apply_adapters(Json& json) + return send_response(obuf_.p, p, response.toString()); + } + +- // Apply adapters to all slots via the server +- // Note: This would require coordination with the slot management system +- SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); ++ // Apply adapters to all slots via the server using llama.cpp unified function ++ SLOG("applying %d LoRA adapter(s) to all active slots using llama.cpp unified function", ++ (int)::g_lora_adapters.size()); ++ ++ // Apply to all active slots ++ Slots* slots = worker_->server_->slots_; ++ pthread_mutex_lock(&slots->lock_); ++ ++ for (size_t i = 0; i < slots->slots_.size(); ++i) { ++ Slot* slot = slots->slots_[i].get(); ++ if (slot->ctx_) { ++ SLOG("applying LoRA adapters to slot #%d", slot->id_); ++ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); ++ ++ // CRITICAL: Mark slot for refresh to handle LoRA changes properly ++ // The slot's prefill() mechanism will intelligently preserve system prompts ++ // and only re-evaluate what's necessary when the next request comes in ++ slot->mark_for_refresh(); ++ SLOG("marked slot #%d for refresh after LoRA application", slot->id_); ++ } ++ } ++ ++ pthread_mutex_unlock(&slots->lock_); + + Json response; + response["success"] = true; + response["message"] = "Adapters applied to active slots"; +- response["adapters_applied"] = g_lora_adapters_count; ++ response["adapters_applied"] = (int)::g_lora_adapters.size(); + + char* p = 
append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); +@@ -128,18 +147,6 @@ Client::handle_load_adapter(Json& json) + std::string adapter_path = json["path"].getString(); + float scale = json.contains("scale") ? json["scale"].getNumber() : 1.0f; + +- // Check if we have room for more adapters +- if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { +- Json response; +- response["success"] = false; +- response["message"] = "Maximum number of adapters already loaded"; +- response["max_adapters"] = MAX_LORA_ADAPTERS; +- +- char* p = append_http_response_message(obuf_.p, 400); +- p = stpcpy(p, "Content-Type: application/json\r\n"); +- return send_response(obuf_.p, p, response.toString()); +- } +- + // Check if file exists + if (!std::filesystem::exists(adapter_path)) { + Json response; +@@ -167,11 +174,15 @@ Client::handle_load_adapter(Json& json) + return send_response(obuf_.p, p, response.toString()); + } + ++ // Create the adapter container ++ llama_lora_adapter_container adapter_container; ++ adapter_container.path = adapter_path; ++ adapter_container.scale = scale; ++ adapter_container.adapter = adapter; ++ + // Store the adapter +- int index = g_lora_adapters_count; +- g_lora_adapters[index].adapter = adapter; +- g_lora_adapters[index].scale = scale; +- g_lora_adapters_count++; ++ int index = (int)::g_lora_adapters.size(); ++ ::g_lora_adapters.push_back(adapter_container); + + SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); + +@@ -181,7 +192,7 @@ Client::handle_load_adapter(Json& json) + response["index"] = index; + response["path"] = adapter_path; + response["scale"] = scale; +- response["total_adapters"] = g_lora_adapters_count; ++ response["total_adapters"] = (int)::g_lora_adapters.size(); + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); +@@ -192,18 +203,16 @@ bool + Client::handle_clear_adapters() + { + // Clear all loaded adapters +- SLOG("clearing all %d LoRA adapter(s)", g_lora_adapters_count); ++ SLOG("clearing all %d LoRA adapter(s)", (int)::g_lora_adapters.size()); + +- for (int i = 0; i < g_lora_adapters_count; i++) { +- if (g_lora_adapters[i].adapter) { +- llama_lora_adapter_free(g_lora_adapters[i].adapter); +- g_lora_adapters[i].adapter = nullptr; +- g_lora_adapters[i].scale = 0.0f; ++ int cleared_count = (int)::g_lora_adapters.size(); ++ for (auto& la : ::g_lora_adapters) { ++ if (la.adapter) { ++ llama_lora_adapter_free(la.adapter); + } + } + +- int cleared_count = g_lora_adapters_count; +- g_lora_adapters_count = 0; ++ ::g_lora_adapters.clear(); + + SLOG("cleared %d LoRA adapter(s)", cleared_count); + +@@ -225,11 +234,6 @@ Client::handle_upstream_lora_apply(Json& json) + std::vector& json_array = json.getArray(); + SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); + +- // First, reset all adapter scales to 0.0 (disabled) +- for (int i = 0; i < g_lora_adapters_count; i++) { +- g_lora_adapters[i].applied = false; +- } +- + // Process each entry in the array + for (size_t i = 0; i < json_array.size(); i++) { + Json& entry = json_array[i]; +@@ -246,22 +250,21 @@ Client::handle_upstream_lora_apply(Json& json) + float scale = entry["scale"].getNumber(); + + // Validate ID range +- if (id < 0 || id >= g_lora_adapters_count) { ++ if (id < 0 || id >= (int)::g_lora_adapters.size()) { + return send_error(400, "Invalid adapter ID"); + } + + // Update the adapter configuration +- g_lora_adapters[id].scale = scale; +- 
g_lora_adapters[id].applied = (scale > 0.0f); ++ ::g_lora_adapters[id].scale = scale; + + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("set LoRA adapter %d ('%s') scale to %s", +- id, g_lora_adapters[id].name.c_str(), scale_buf); ++ id, ::g_lora_adapters[id].path.c_str(), scale_buf); + } + +- // Re-apply LoRA adapters to all active slots with updated scales +- SLOG("re-applying LoRA adapters to all active slots"); ++ // Re-apply LoRA adapters to all active slots with updated scales using llama.cpp unified function ++ SLOG("re-applying LoRA adapters to all active slots using llama.cpp unified function"); + Slots* slots = worker_->server_->slots_; + + // Lock the slots to prevent concurrent access during LoRA re-application +@@ -271,32 +274,13 @@ Client::handle_upstream_lora_apply(Json& json) + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("re-applying LoRA adapters to slot #%d", slot->id_); ++ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); + +- // Clear existing LoRA adapters from this context +- llama_lora_adapter_clear(slot->ctx_); +- +- // Use the same approach as slot initialization: get all adapters via the function +- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; +- float scales[MAX_LORA_ADAPTERS]; +- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); +- +- SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); +- +- // Re-apply all adapters with their current scales +- for (int j = 0; j < adapter_count; ++j) { +- char scale_buf[32]; +- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); +- SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); +- if (scales[j] > 0.0f) { +- if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { +- SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); +- } else { +- SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); +- } +- } else { +- SLOG("skipping LoRA adapter %d (scale %s <= 0)", j, scale_buf); +- } +- } ++ // CRITICAL: Mark slot for refresh to handle LoRA changes properly ++ // The slot's prefill() mechanism will intelligently preserve system prompts ++ // and only re-evaluate what's necessary when the next request comes in ++ slot->mark_for_refresh(); ++ SLOG("marked slot #%d for refresh after LoRA update", slot->id_); + } + } + +@@ -307,12 +291,13 @@ Client::handle_upstream_lora_apply(Json& json) + Json response; + response.setArray(); + std::vector& response_array = response.getArray(); +- for (int i = 0; i < g_lora_adapters_count; i++) { ++ ++ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { + Json adapter; + adapter.setObject(); +- adapter["id"] = i; +- adapter["path"] = g_lora_adapters[i].name; +- adapter["scale"] = g_lora_adapters[i].scale; ++ adapter["id"] = (int)i; ++ adapter["path"] = ::g_lora_adapters[i].path; ++ adapter["scale"] = ::g_lora_adapters[i].scale; + response_array.push_back(adapter); + } + +diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp +index a21c80961..89ae3f12d 100644 +--- a/llamafile/server/prog.cpp ++++ b/llamafile/server/prog.cpp +@@ -26,31 +26,21 @@ + #include "llamafile/server/tokenbucket.h" + #include "llamafile/server/utils.h" + #include "llamafile/version.h" ++#include "llama.cpp/common.h" + #include + #include + +-// Global LoRA adapter storage for multiple adapters +-#define MAX_LORA_ADAPTERS 8 +-#include +-struct lora_adapter_container { +- 
struct llama_lora_adapter* adapter; +- float scale; +- std::string name; // Model/adapter name for identification +- bool applied; // Whether this adapter is currently applied to slots +-}; +- +-// Make these externally accessible for HTTP endpoint +-struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; +-int g_lora_adapters_count = 0; ++// Global LoRA adapter storage using llama.cpp structures ++std::vector g_lora_adapters; + + // Function to get the first global LoRA adapter for backward compatibility + extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { +- return g_lora_adapters_count > 0 ? g_lora_adapters[0].adapter : nullptr; ++ return g_lora_adapters.empty() ? nullptr : g_lora_adapters[0].adapter; + } + + // Function to get all LoRA adapters and their count + extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { +- int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; ++ int count = std::min((int)g_lora_adapters.size(), max_adapters); + for (int i = 0; i < count; i++) { + adapters[i] = g_lora_adapters[i].adapter; + scales[i] = g_lora_adapters[i].scale; +@@ -129,38 +119,31 @@ main(int argc, char* argv[]) + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); + +- // Generate model name from filename ++ // Generate model name from filename for identification + const char* path = FLAG_lora_adapters[i].path; + const char* filename = strrchr(path, '/'); + filename = filename ? filename + 1 : path; + +- // Remove file extension for cleaner name +- std::string model_name(filename); +- size_t dot_pos = model_name.find_last_of('.'); +- if (dot_pos != std::string::npos) { +- model_name = model_name.substr(0, dot_pos); +- } +- + SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, +- model_name.c_str(), path, scale_buf); +- +- g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); +- g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; +- g_lora_adapters[i].name = model_name; +- g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set ++ filename, path, scale_buf); ++ ++ llama_lora_adapter_container adapter_container; ++ adapter_container.path = std::string(path); ++ adapter_container.scale = FLAG_lora_adapters[i].scale; ++ adapter_container.adapter = llama_lora_adapter_init(model, path); + +- if (!g_lora_adapters[i].adapter) { ++ if (!adapter_container.adapter) { + fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); + // Cleanup previously loaded adapters +- for (int j = 0; j < i; j++) { +- if (g_lora_adapters[j].adapter) { +- llama_lora_adapter_free(g_lora_adapters[j].adapter); ++ for (auto& la : g_lora_adapters) { ++ if (la.adapter) { ++ llama_lora_adapter_free(la.adapter); + } + } + llama_free_model(model); + exit(1); + } +- g_lora_adapters_count++; ++ g_lora_adapters.push_back(adapter_container); + } + + if (FLAG_lora_init_without_apply) { +@@ -203,9 +186,9 @@ main(int argc, char* argv[]) + delete slots; + + // Cleanup LoRA adapters +- for (int i = 0; i < g_lora_adapters_count; i++) { +- if (g_lora_adapters[i].adapter) { +- llama_lora_adapter_free(g_lora_adapters[i].adapter); ++ for (auto& la : g_lora_adapters) { ++ if (la.adapter) { ++ llama_lora_adapter_free(la.adapter); + } + } + +diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp +index 55138417b..a081d69df 100644 +--- a/llamafile/server/slot.cpp ++++ 
b/llamafile/server/slot.cpp +@@ -18,6 +18,7 @@ + #include "slot.h" + #include "llama.cpp/llava/clip.h" + #include "llama.cpp/llava/llava.h" ++#include "llama.cpp/common.h" + #include "llamafile/image.h" + #include "llamafile/llama.h" + #include "llamafile/llamafile.h" +@@ -32,6 +33,9 @@ + #include + #include + ++// External declaration for global LoRA adapter storage ++extern std::vector g_lora_adapters; ++ + namespace lf { + namespace server { + +@@ -79,7 +83,7 @@ Slot::describe_error(int err) + } + } + +-Slot::Slot(int id, llama_model* model) : id_(id), model_(model) ++Slot::Slot(int id, llama_model* model) : id_(id), model_(model), needs_refresh_(false) + { + dll_init(&elem_); + last_used_ = time(0); +@@ -126,24 +130,16 @@ Slot::start() + if (!(ctx_ = llama_new_context_with_model(model_, cparams))) + return false; + +- // Apply LoRA adapters if available +- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; +- float scales[MAX_LORA_ADAPTERS]; +- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); +- +- if (adapter_count > 0) { +- SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); +- for (int i = 0; i < adapter_count; i++) { +- if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { +- SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); +- llama_free(ctx_); +- ctx_ = nullptr; +- return false; +- } +- char scale_buf[32]; +- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); +- SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); +- } ++ // Apply LoRA adapters if available using llama.cpp's unified function ++ if (!::g_lora_adapters.empty() && !FLAG_lora_init_without_apply) { ++ SLOG("applying %d LoRA adapter(s) to slot #%d using llama.cpp unified function", ++ (int)::g_lora_adapters.size(), id_); ++ llama_lora_adapters_apply(ctx_, ::g_lora_adapters); ++ } else if (!::g_lora_adapters.empty() && FLAG_lora_init_without_apply) { ++ // When --lora-init-without-apply is set, explicitly clear any LoRA state ++ // to ensure no residual LoRA effects from model initialization ++ SLOG("clearing LoRA state for slot #%d (--lora-init-without-apply mode)", id_); ++ llama_lora_adapter_clear(ctx_); + } + + if (FLAG_mmproj) +@@ -314,6 +310,15 @@ Slot::prefill(const std::vector& atoms, const ProgressCallback& progress) + if (!ctx_) + return uninitialized; + ++ // Check if we need to refresh due to LoRA adapter changes ++ if (needs_refresh_) { ++ SLOG("Refreshing slot due to LoRA adapter changes"); ++ llama_kv_cache_clear(ctx_); ++ history_.clear(); ++ needs_refresh_ = false; ++ // Fall through to normal prefill logic with cleared state ++ } ++ + // handle special case of empty prefill + if (atoms.empty()) { + llama_kv_cache_clear(ctx_); +@@ -458,5 +463,11 @@ Slot::dump(std::string* result) + } + } + ++void ++Slot::mark_for_refresh() ++{ ++ needs_refresh_ = true; ++} ++ + } // namespace server + } // namespace lf +diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h +index e8816c900..104aa7623 100644 +--- a/llamafile/server/slot.h ++++ b/llamafile/server/slot.h +@@ -23,7 +23,6 @@ + #include + + #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) +-#define MAX_LORA_ADAPTERS 8 + + struct llama_context; + struct llama_model; +@@ -66,6 +65,7 @@ struct Slot + llama_context* ctx_ = nullptr; + std::vector history_; + std::string system_fingerprint_; ++ bool needs_refresh_ = false; + + ~Slot(); + Slot(int, llama_model*); +@@ -79,6 +79,7 @@ struct Slot + int prefill(const std::vector&, const 
ProgressCallback& = nullptr); + void tokenize(std::vector*, std::string_view, bool); + void dump(std::string*); ++ void mark_for_refresh(); + }; + + } // namespace server diff --git a/llamafile/server/client.h b/llamafile/server/client.h index 74d1314e62..f82eed4225 100644 --- a/llamafile/server/client.h +++ b/llamafile/server/client.h @@ -25,6 +25,7 @@ #include #include #include +#include "llama.cpp/common.h" #define HasHeader(H) (!!msg_.headers[H].a) #define HeaderData(H) (ibuf_.p + msg_.headers[H].a) @@ -141,13 +142,4 @@ struct Client } // namespace lf // Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp) -#define MAX_LORA_ADAPTERS 8 -struct lora_adapter_container { - struct llama_lora_adapter* adapter; - float scale; - std::string name; // Model/adapter name for identification - bool applied; // Whether this adapter is currently applied to slots -}; - -extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; -extern int g_lora_adapters_count; +// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp index 35e55198df..c048a2a4ef 100644 --- a/llamafile/server/lora_adapters.cpp +++ b/llamafile/server/lora_adapters.cpp @@ -17,6 +17,7 @@ #include "client.h" #include "llama.cpp/llama.h" +#include "llama.cpp/common.h" #include "llamafile/json.h" #include "llamafile/llamafile.h" #include "llamafile/server/log.h" @@ -29,9 +30,7 @@ using jt::Json; // External declarations for global LoRA adapter storage from prog.cpp (outside namespace) -// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h -extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; -extern int g_lora_adapters_count; +extern std::vector g_lora_adapters; namespace lf { namespace server { @@ -46,12 +45,12 @@ Client::lora_adapters() json.setArray(); std::vector& json_array = json.getArray(); - for (int i = 0; i < g_lora_adapters_count; i++) { + for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { Json adapter; adapter.setObject(); - adapter["id"] = i; - adapter["path"] = g_lora_adapters[i].name; // Use name as path for now - adapter["scale"] = g_lora_adapters[i].scale; + adapter["id"] = (int)i; + adapter["path"] = ::g_lora_adapters[i].path; + adapter["scale"] = ::g_lora_adapters[i].scale; json_array.push_back(adapter); } @@ -93,7 +92,7 @@ bool Client::handle_apply_adapters(Json& json) { // Get active slots and apply current adapters to them - if (g_lora_adapters_count == 0) { + if (::g_lora_adapters.empty()) { Json response; response["success"] = false; response["message"] = "No adapters loaded to apply"; @@ -103,14 +102,34 @@ Client::handle_apply_adapters(Json& json) return send_response(obuf_.p, p, response.toString()); } - // Apply adapters to all slots via the server - // Note: This would require coordination with the slot management system - SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); + // Apply adapters to all slots via the server using llama.cpp unified function + SLOG("applying %d LoRA adapter(s) to all active slots using llama.cpp unified function", + (int)::g_lora_adapters.size()); + + // Apply to all active slots + Slots* slots = worker_->server_->slots_; + pthread_mutex_lock(&slots->lock_); + + for (size_t i = 0; i < slots->slots_.size(); ++i) { + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("applying LoRA adapters to slot #%d", 
slot->id_); + llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); + + // CRITICAL: Mark slot for refresh to handle LoRA changes properly + // The slot's prefill() mechanism will intelligently preserve system prompts + // and only re-evaluate what's necessary when the next request comes in + slot->mark_for_refresh(); + SLOG("marked slot #%d for refresh after LoRA application", slot->id_); + } + } + + pthread_mutex_unlock(&slots->lock_); Json response; response["success"] = true; response["message"] = "Adapters applied to active slots"; - response["adapters_applied"] = g_lora_adapters_count; + response["adapters_applied"] = (int)::g_lora_adapters.size(); char* p = append_http_response_message(obuf_.p, 200); p = stpcpy(p, "Content-Type: application/json\r\n"); @@ -128,18 +147,6 @@ Client::handle_load_adapter(Json& json) std::string adapter_path = json["path"].getString(); float scale = json.contains("scale") ? json["scale"].getNumber() : 1.0f; - // Check if we have room for more adapters - if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { - Json response; - response["success"] = false; - response["message"] = "Maximum number of adapters already loaded"; - response["max_adapters"] = MAX_LORA_ADAPTERS; - - char* p = append_http_response_message(obuf_.p, 400); - p = stpcpy(p, "Content-Type: application/json\r\n"); - return send_response(obuf_.p, p, response.toString()); - } - // Check if file exists if (!std::filesystem::exists(adapter_path)) { Json response; @@ -167,11 +174,15 @@ Client::handle_load_adapter(Json& json) return send_response(obuf_.p, p, response.toString()); } + // Create the adapter container + llama_lora_adapter_container adapter_container; + adapter_container.path = adapter_path; + adapter_container.scale = scale; + adapter_container.adapter = adapter; + // Store the adapter - int index = g_lora_adapters_count; - g_lora_adapters[index].adapter = adapter; - g_lora_adapters[index].scale = scale; - g_lora_adapters_count++; + int index = (int)::g_lora_adapters.size(); + ::g_lora_adapters.push_back(adapter_container); SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); @@ -181,7 +192,7 @@ Client::handle_load_adapter(Json& json) response["index"] = index; response["path"] = adapter_path; response["scale"] = scale; - response["total_adapters"] = g_lora_adapters_count; + response["total_adapters"] = (int)::g_lora_adapters.size(); char* p = append_http_response_message(obuf_.p, 200); p = stpcpy(p, "Content-Type: application/json\r\n"); @@ -192,18 +203,16 @@ bool Client::handle_clear_adapters() { // Clear all loaded adapters - SLOG("clearing all %d LoRA adapter(s)", g_lora_adapters_count); + SLOG("clearing all %d LoRA adapter(s)", (int)::g_lora_adapters.size()); - for (int i = 0; i < g_lora_adapters_count; i++) { - if (g_lora_adapters[i].adapter) { - llama_lora_adapter_free(g_lora_adapters[i].adapter); - g_lora_adapters[i].adapter = nullptr; - g_lora_adapters[i].scale = 0.0f; + int cleared_count = (int)::g_lora_adapters.size(); + for (auto& la : ::g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); } } - int cleared_count = g_lora_adapters_count; - g_lora_adapters_count = 0; + ::g_lora_adapters.clear(); SLOG("cleared %d LoRA adapter(s)", cleared_count); @@ -225,11 +234,6 @@ Client::handle_upstream_lora_apply(Json& json) std::vector& json_array = json.getArray(); SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); - // First, reset all adapter scales to 0.0 (disabled) - for (int i = 0; i < 
g_lora_adapters_count; i++) { - g_lora_adapters[i].applied = false; - } - // Process each entry in the array for (size_t i = 0; i < json_array.size(); i++) { Json& entry = json_array[i]; @@ -246,22 +250,21 @@ Client::handle_upstream_lora_apply(Json& json) float scale = entry["scale"].getNumber(); // Validate ID range - if (id < 0 || id >= g_lora_adapters_count) { + if (id < 0 || id >= (int)::g_lora_adapters.size()) { return send_error(400, "Invalid adapter ID"); } // Update the adapter configuration - g_lora_adapters[id].scale = scale; - g_lora_adapters[id].applied = (scale > 0.0f); + ::g_lora_adapters[id].scale = scale; char scale_buf[32]; snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); SLOG("set LoRA adapter %d ('%s') scale to %s", - id, g_lora_adapters[id].name.c_str(), scale_buf); + id, ::g_lora_adapters[id].path.c_str(), scale_buf); } - // Re-apply LoRA adapters to all active slots with updated scales - SLOG("re-applying LoRA adapters to all active slots"); + // Re-apply LoRA adapters to all active slots with updated scales using llama.cpp unified function + SLOG("re-applying LoRA adapters to all active slots using llama.cpp unified function"); Slots* slots = worker_->server_->slots_; // Lock the slots to prevent concurrent access during LoRA re-application @@ -271,32 +274,13 @@ Client::handle_upstream_lora_apply(Json& json) Slot* slot = slots->slots_[i].get(); if (slot->ctx_) { SLOG("re-applying LoRA adapters to slot #%d", slot->id_); + llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); - // Clear existing LoRA adapters from this context - llama_lora_adapter_clear(slot->ctx_); - - // Use the same approach as slot initialization: get all adapters via the function - struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; - float scales[MAX_LORA_ADAPTERS]; - int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); - - SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); - - // Re-apply all adapters with their current scales - for (int j = 0; j < adapter_count; ++j) { - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); - SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); - if (scales[j] > 0.0f) { - if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { - SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); - } else { - SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); - } - } else { - SLOG("skipping LoRA adapter %d (scale %s <= 0)", j, scale_buf); - } - } + // CRITICAL: Mark slot for refresh to handle LoRA changes properly + // The slot's prefill() mechanism will intelligently preserve system prompts + // and only re-evaluate what's necessary when the next request comes in + slot->mark_for_refresh(); + SLOG("marked slot #%d for refresh after LoRA update", slot->id_); } } @@ -307,12 +291,13 @@ Client::handle_upstream_lora_apply(Json& json) Json response; response.setArray(); std::vector& response_array = response.getArray(); - for (int i = 0; i < g_lora_adapters_count; i++) { + + for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { Json adapter; adapter.setObject(); - adapter["id"] = i; - adapter["path"] = g_lora_adapters[i].name; - adapter["scale"] = g_lora_adapters[i].scale; + adapter["id"] = (int)i; + adapter["path"] = ::g_lora_adapters[i].path; + adapter["scale"] = ::g_lora_adapters[i].scale; response_array.push_back(adapter); } diff --git a/llamafile/server/prog.cpp 
b/llamafile/server/prog.cpp index a21c809614..89ae3f12d7 100644 --- a/llamafile/server/prog.cpp +++ b/llamafile/server/prog.cpp @@ -26,31 +26,21 @@ #include "llamafile/server/tokenbucket.h" #include "llamafile/server/utils.h" #include "llamafile/version.h" +#include "llama.cpp/common.h" #include #include -// Global LoRA adapter storage for multiple adapters -#define MAX_LORA_ADAPTERS 8 -#include -struct lora_adapter_container { - struct llama_lora_adapter* adapter; - float scale; - std::string name; // Model/adapter name for identification - bool applied; // Whether this adapter is currently applied to slots -}; - -// Make these externally accessible for HTTP endpoint -struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; -int g_lora_adapters_count = 0; +// Global LoRA adapter storage using llama.cpp structures +std::vector g_lora_adapters; // Function to get the first global LoRA adapter for backward compatibility extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { - return g_lora_adapters_count > 0 ? g_lora_adapters[0].adapter : nullptr; + return g_lora_adapters.empty() ? nullptr : g_lora_adapters[0].adapter; } // Function to get all LoRA adapters and their count extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { - int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; + int count = std::min((int)g_lora_adapters.size(), max_adapters); for (int i = 0; i < count; i++) { adapters[i] = g_lora_adapters[i].adapter; scales[i] = g_lora_adapters[i].scale; @@ -129,38 +119,31 @@ main(int argc, char* argv[]) char scale_buf[32]; snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); - // Generate model name from filename + // Generate model name from filename for identification const char* path = FLAG_lora_adapters[i].path; const char* filename = strrchr(path, '/'); filename = filename ? 
filename + 1 : path; - // Remove file extension for cleaner name - std::string model_name(filename); - size_t dot_pos = model_name.find_last_of('.'); - if (dot_pos != std::string::npos) { - model_name = model_name.substr(0, dot_pos); - } - SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, - model_name.c_str(), path, scale_buf); - - g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); - g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; - g_lora_adapters[i].name = model_name; - g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set + filename, path, scale_buf); + + llama_lora_adapter_container adapter_container; + adapter_container.path = std::string(path); + adapter_container.scale = FLAG_lora_adapters[i].scale; + adapter_container.adapter = llama_lora_adapter_init(model, path); - if (!g_lora_adapters[i].adapter) { + if (!adapter_container.adapter) { fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); // Cleanup previously loaded adapters - for (int j = 0; j < i; j++) { - if (g_lora_adapters[j].adapter) { - llama_lora_adapter_free(g_lora_adapters[j].adapter); + for (auto& la : g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); } } llama_free_model(model); exit(1); } - g_lora_adapters_count++; + g_lora_adapters.push_back(adapter_container); } if (FLAG_lora_init_without_apply) { @@ -203,9 +186,9 @@ main(int argc, char* argv[]) delete slots; // Cleanup LoRA adapters - for (int i = 0; i < g_lora_adapters_count; i++) { - if (g_lora_adapters[i].adapter) { - llama_lora_adapter_free(g_lora_adapters[i].adapter); + for (auto& la : g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); } } diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index 55138417b1..a081d69dfb 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -18,6 +18,7 @@ #include "slot.h" #include "llama.cpp/llava/clip.h" #include "llama.cpp/llava/llava.h" +#include "llama.cpp/common.h" #include "llamafile/image.h" #include "llamafile/llama.h" #include "llamafile/llamafile.h" @@ -32,6 +33,9 @@ #include #include +// External declaration for global LoRA adapter storage +extern std::vector g_lora_adapters; + namespace lf { namespace server { @@ -79,7 +83,7 @@ Slot::describe_error(int err) } } -Slot::Slot(int id, llama_model* model) : id_(id), model_(model) +Slot::Slot(int id, llama_model* model) : id_(id), model_(model), needs_refresh_(false) { dll_init(&elem_); last_used_ = time(0); @@ -126,24 +130,16 @@ Slot::start() if (!(ctx_ = llama_new_context_with_model(model_, cparams))) return false; - // Apply LoRA adapters if available - struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; - float scales[MAX_LORA_ADAPTERS]; - int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); - - if (adapter_count > 0) { - SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); - for (int i = 0; i < adapter_count; i++) { - if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { - SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); - llama_free(ctx_); - ctx_ = nullptr; - return false; - } - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); - SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); - } + // Apply LoRA adapters if available using llama.cpp's unified function + if (!::g_lora_adapters.empty() && !FLAG_lora_init_without_apply) { + 
SLOG("applying %d LoRA adapter(s) to slot #%d using llama.cpp unified function", + (int)::g_lora_adapters.size(), id_); + llama_lora_adapters_apply(ctx_, ::g_lora_adapters); + } else if (!::g_lora_adapters.empty() && FLAG_lora_init_without_apply) { + // When --lora-init-without-apply is set, explicitly clear any LoRA state + // to ensure no residual LoRA effects from model initialization + SLOG("clearing LoRA state for slot #%d (--lora-init-without-apply mode)", id_); + llama_lora_adapter_clear(ctx_); } if (FLAG_mmproj) @@ -314,6 +310,15 @@ Slot::prefill(const std::vector& atoms, const ProgressCallback& progress) if (!ctx_) return uninitialized; + // Check if we need to refresh due to LoRA adapter changes + if (needs_refresh_) { + SLOG("Refreshing slot due to LoRA adapter changes"); + llama_kv_cache_clear(ctx_); + history_.clear(); + needs_refresh_ = false; + // Fall through to normal prefill logic with cleared state + } + // handle special case of empty prefill if (atoms.empty()) { llama_kv_cache_clear(ctx_); @@ -458,5 +463,11 @@ Slot::dump(std::string* result) } } +void +Slot::mark_for_refresh() +{ + needs_refresh_ = true; +} + } // namespace server } // namespace lf diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h index e8816c9009..104aa7623c 100644 --- a/llamafile/server/slot.h +++ b/llamafile/server/slot.h @@ -23,7 +23,6 @@ #include #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) -#define MAX_LORA_ADAPTERS 8 struct llama_context; struct llama_model; @@ -66,6 +65,7 @@ struct Slot llama_context* ctx_ = nullptr; std::vector history_; std::string system_fingerprint_; + bool needs_refresh_ = false; ~Slot(); Slot(int, llama_model*); @@ -79,6 +79,7 @@ struct Slot int prefill(const std::vector&, const ProgressCallback& = nullptr); void tokenize(std::vector*, std::string_view, bool); void dump(std::string*); + void mark_for_refresh(); }; } // namespace server From f9204e755eee091f667ce7e365e6d2c0a2630849 Mon Sep 17 00:00:00 2001 From: loganpowell Date: Wed, 20 Aug 2025 14:00:15 -0400 Subject: [PATCH 8/9] removes .vscode setup --- .gitignore | 3 +- .vscode/c_cpp_properties.json | 60 ------------ .vscode/launch.json | 43 --------- .vscode/settings.json | 38 -------- .vscode/tasks.json | 175 ---------------------------------- RELEASE.md | 2 +- 6 files changed, 3 insertions(+), 318 deletions(-) delete mode 100644 .vscode/c_cpp_properties.json delete mode 100644 .vscode/launch.json delete mode 100644 .vscode/settings.json delete mode 100644 .vscode/tasks.json diff --git a/.gitignore b/.gitignore index 16feca060d..dd97ddc5b8 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ /trace.json /*.log -/.models \ No newline at end of file +/.models +/.vscode \ No newline at end of file diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json deleted file mode 100644 index d9cbe110f5..0000000000 --- a/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - // Simplified IntelliSense config without compile_commands.json - // Uses broad include/define coverage; may be less exact per-file. 
- "env": { - "cosmoccPath": "${workspaceFolder}/.cosmocc" - }, - "configurations": [ - { - "name": "Cosmopolitan", - "compilerPath": "${cosmoccPath}/3.9.7/bin/cosmocc", - "intelliSenseMode": "clang-x64", - "cppStandard": "gnu++23", - "cStandard": "gnu11", - "defines": [ - "GGML_MULTIPLATFORM", - "LLAMAFILE_DEBUG", - "_LIBCPP_HAS_NO_XLOCALE", - "_LIBCPP_HAS_MUSL_LIBC" - ], - "compilerArgs": [ - "-j8" - // "-std=gnu++23", - // "-Wall", - // "-Wextra", - // Force C++ mode and reassert libc++ include even if driver fallback fails - // "-nostdinc++", - // "-I${cosmoccPath}/3.9.7/include/c++/v1" - ], - "includePath": [ - "${workspaceFolder}", - "${workspaceFolder}/llamafile", - "${workspaceFolder}/llama.cpp", - "${workspaceFolder}/whisper.cpp", - "${workspaceFolder}/stable-diffusion.cpp", - "${workspaceFolder}/localscore", - "${workspaceFolder}/third_party", - "${cosmoccPath}/include", - "${cosmoccPath}/3.9.7/include", - "${cosmoccPath}/3.9.7/include/c++/v1" - ], - "forcedInclude": [ - // Normalizes some Cosmopolitan integral typedefs early. - "${cosmoccPath}/include/libc/integral/normalize.inc" - ], - "browse": { - "path": [ - "${workspaceFolder}", - "${workspaceFolder}/llamafile", - "${workspaceFolder}/llama.cpp", - "${workspaceFolder}/whisper.cpp", - "${workspaceFolder}/stable-diffusion.cpp", - "${workspaceFolder}/localscore", - "${workspaceFolder}/third_party" - ], - "limitSymbolsToIncludedHeaders": false - } - } - ], - "version": 4 -} diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index c50ac4842c..0000000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "version": "0.2.0", - "configurations": [ - { - "name": "Debug Server (llamafiler)", - "type": "cppdbg", - "request": "launch", - "MIMode": "lldb", - "program": "${workspaceFolder}/o/opt/llamafile/server/main", - "args": [], - "preLaunchTask": "Build (fast)", - "stopAtEntry": false - }, - { - "name": "Debug Llama CLI (llamafile)", - "type": "cppdbg", - "request": "launch", - "MIMode": "lldb", - "program": "${workspaceFolder}/o/opt/llama.cpp/main/main", - "args": [], - "preLaunchTask": "Build (fast)", - "stopAtEntry": false - }, - { - "name": "Debug Quantize Tool", - "type": "cppdbg", - "request": "launch", - "MIMode": "lldb", - "program": "${workspaceFolder}/o/opt/llama.cpp/quantize/quantize", - "args": [], - "preLaunchTask": "Build (fast)", - "stopAtEntry": false - }, - { - "name": "Attach to PID", - "type": "cppdbg", - "request": "attach", - "MIMode": "lldb", - "processId": "${command:pickProcess}", - "program": "${workspaceFolder}/o/opt/llamafile/server/main" - } - ] -} diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 0948ad7be5..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - // Core C/C++ extension behavior - "C_Cpp.default.configurationProvider": "ms-vscode.cpptools", // use c_cpp_properties.json - "C_Cpp.intelliSenseEngine": "default", - "C_Cpp.errorSquiggles": "disabled", - "C_Cpp.autoAddFileAssociations": false, - "C_Cpp.default.browse.limitSymbolsToIncludedHeaders": false, - "C_Cpp.workspaceParsingPriority": "highest", - "C_Cpp.loggingLevel": "Warning", - // Speed: avoid re-indexing node_modules or build/artifacts if present - "files.watcherExclude": { - "**/o/**": true, - "**/.git/**": true, - "**/.cosmocc/**": true - }, - "search.exclude": { - "o": true, - "**/o/**": true - }, - // Formatting / style (adjust to project preference) - "editor.formatOnSave": false, - "C_Cpp.formatting": "disabled", - // Diagnostics 
tuning: treat missing headers as warnings (since we simplified config) - "C_Cpp.codeAnalysis.clangTidy.enabled": false, - "C_Cpp.intelliSenseCacheSize": 512, - // Optional UI niceties - "C_Cpp.enhancedColorization": "enabled", - "C_Cpp.dimInactiveRegions": true, - // File associations - "files.associations": { - "*.cpp": "cpp", - "*.c": "c", - "*.h": "c", - "*.hpp": "cpp", - "*.mk": "makefile", - "BUILD.mk": "makefile" - } -} diff --git a/.vscode/tasks.json b/.vscode/tasks.json deleted file mode 100644 index 894f7a924f..0000000000 --- a/.vscode/tasks.json +++ /dev/null @@ -1,175 +0,0 @@ -{ - "version": "2.0.0", - "tasks": [ - { - "label": "Build (fast)", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": { - "kind": "build", - "isDefault": true - }, - "problemMatcher": [ - "$gcc" - ], - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Rebuild (clean + all)", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "clean", - "&&", - "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "-j8" - ], - "problemMatcher": [ - "$gcc" - ], - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Build vmathf_test", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8", - "o/opt/llamafile/vmathf_test" - ], - "problemMatcher": [ - "$gcc" - ], - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Run vmathf_test", - "type": "shell", - "command": "o/opt/llamafile/vmathf_test", - "dependsOn": "Build vmathf_test", - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Clean", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "clean" - ], - "group": "build", - "presentation": { - "reveal": "always", - "panel": "shared" - } - }, - { - "label": "Watch (incremental)", - "type": "shell", - "command": "${workspaceFolder}/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "isBackground": true, - "problemMatcher": [ - { - "owner": "cpp", - "pattern": { - "regexp": "^(.*):(\\d+):(\\d+): (warning|error): (.*)$", - "file": 1, - "line": 2, - "column": 3, - "severity": 4, - "message": 5 - }, - "background": { - "activeOnStart": true, - "beginsPattern": "^.*Building.*$", - "endsPattern": "^.*(error|warning|linking).*$" - } - } - ], - "presentation": { - "reveal": "never", - "panel": "dedicated" - } - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": 
"/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - }, - { - "label": "Build (fast)", - "type": "shell", - "command": "/Users/logan/Documents/projects/llamafile/.cosmocc/3.9.7/bin/make", - "args": [ - "-j8" - ], - "group": "build" - } - ] -} \ No newline at end of file diff --git a/RELEASE.md b/RELEASE.md index 9a1519948c..8af0a68a40 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -34,7 +34,7 @@ llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0. # Dynamic scale adjustment via API curl -X POST http://localhost:8080/lora-adapters \ -H "Content-Type: application/json" \ - -d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1.2}]' + -d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1}]' ``` This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows with real-time adaptation capabilities. From 86cae3d8f6e087920642262957ebd34c4dde8b02 Mon Sep 17 00:00:00 2001 From: loganpowell Date: Wed, 20 Aug 2025 14:43:58 -0400 Subject: [PATCH 9/9] hk --- .gitignore | 4 +- diff.txt | 512 ----------------------------------------------------- 2 files changed, 2 insertions(+), 514 deletions(-) delete mode 100644 diff.txt diff --git a/.gitignore b/.gitignore index dd97ddc5b8..95709537cd 100644 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,5 @@ /trace.json /*.log -/.models -/.vscode \ No newline at end of file +.models +.vscode \ No newline at end of file diff --git a/diff.txt b/diff.txt deleted file mode 100644 index c6be66ce7e..0000000000 --- a/diff.txt +++ /dev/null @@ -1,512 +0,0 @@ -diff --git a/llamafile/server/client.h b/llamafile/server/client.h -index 74d1314e6..f82eed422 100644 ---- a/llamafile/server/client.h -+++ b/llamafile/server/client.h -@@ -25,6 +25,7 @@ - #include - #include - #include -+#include "llama.cpp/common.h" - - #define HasHeader(H) (!!msg_.headers[H].a) - #define HeaderData(H) (ibuf_.p + msg_.headers[H].a) -@@ -141,13 +142,4 @@ struct Client - } // namespace lf - - // Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp) --#define MAX_LORA_ADAPTERS 8 --struct lora_adapter_container { -- struct llama_lora_adapter* adapter; -- float scale; -- std::string name; // Model/adapter name for identification -- bool applied; // Whether this adapter is currently applied to slots --}; -- --extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; --extern int g_lora_adapters_count; -+// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead -diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp -index 35e55198d..c048a2a4e 100644 ---- a/llamafile/server/lora_adapters.cpp -+++ b/llamafile/server/lora_adapters.cpp -@@ -17,6 +17,7 @@ - - #include "client.h" - #include "llama.cpp/llama.h" -+#include "llama.cpp/common.h" - #include "llamafile/json.h" - #include "llamafile/llamafile.h" - #include "llamafile/server/log.h" -@@ -29,9 +30,7 @@ - using jt::Json; - - // External declarations for global LoRA adapter storage from prog.cpp (outside namespace) --// Note: struct lora_adapter_container and MAX_LORA_ADAPTERS are already defined in client.h --extern struct lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS]; --extern int g_lora_adapters_count; -+extern std::vector g_lora_adapters; - - namespace lf { - namespace server { -@@ -46,12 +45,12 @@ Client::lora_adapters() - json.setArray(); - std::vector& json_array = 
json.getArray(); - -- for (int i = 0; i < g_lora_adapters_count; i++) { -+ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { - Json adapter; - adapter.setObject(); -- adapter["id"] = i; -- adapter["path"] = g_lora_adapters[i].name; // Use name as path for now -- adapter["scale"] = g_lora_adapters[i].scale; -+ adapter["id"] = (int)i; -+ adapter["path"] = ::g_lora_adapters[i].path; -+ adapter["scale"] = ::g_lora_adapters[i].scale; - json_array.push_back(adapter); - } - -@@ -93,7 +92,7 @@ bool - Client::handle_apply_adapters(Json& json) - { - // Get active slots and apply current adapters to them -- if (g_lora_adapters_count == 0) { -+ if (::g_lora_adapters.empty()) { - Json response; - response["success"] = false; - response["message"] = "No adapters loaded to apply"; -@@ -103,14 +102,34 @@ Client::handle_apply_adapters(Json& json) - return send_response(obuf_.p, p, response.toString()); - } - -- // Apply adapters to all slots via the server -- // Note: This would require coordination with the slot management system -- SLOG("applying %d LoRA adapter(s) to all active slots", g_lora_adapters_count); -+ // Apply adapters to all slots via the server using llama.cpp unified function -+ SLOG("applying %d LoRA adapter(s) to all active slots using llama.cpp unified function", -+ (int)::g_lora_adapters.size()); -+ -+ // Apply to all active slots -+ Slots* slots = worker_->server_->slots_; -+ pthread_mutex_lock(&slots->lock_); -+ -+ for (size_t i = 0; i < slots->slots_.size(); ++i) { -+ Slot* slot = slots->slots_[i].get(); -+ if (slot->ctx_) { -+ SLOG("applying LoRA adapters to slot #%d", slot->id_); -+ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); -+ -+ // CRITICAL: Mark slot for refresh to handle LoRA changes properly -+ // The slot's prefill() mechanism will intelligently preserve system prompts -+ // and only re-evaluate what's necessary when the next request comes in -+ slot->mark_for_refresh(); -+ SLOG("marked slot #%d for refresh after LoRA application", slot->id_); -+ } -+ } -+ -+ pthread_mutex_unlock(&slots->lock_); - - Json response; - response["success"] = true; - response["message"] = "Adapters applied to active slots"; -- response["adapters_applied"] = g_lora_adapters_count; -+ response["adapters_applied"] = (int)::g_lora_adapters.size(); - - char* p = append_http_response_message(obuf_.p, 200); - p = stpcpy(p, "Content-Type: application/json\r\n"); -@@ -128,18 +147,6 @@ Client::handle_load_adapter(Json& json) - std::string adapter_path = json["path"].getString(); - float scale = json.contains("scale") ? 
json["scale"].getNumber() : 1.0f; - -- // Check if we have room for more adapters -- if (g_lora_adapters_count >= MAX_LORA_ADAPTERS) { -- Json response; -- response["success"] = false; -- response["message"] = "Maximum number of adapters already loaded"; -- response["max_adapters"] = MAX_LORA_ADAPTERS; -- -- char* p = append_http_response_message(obuf_.p, 400); -- p = stpcpy(p, "Content-Type: application/json\r\n"); -- return send_response(obuf_.p, p, response.toString()); -- } -- - // Check if file exists - if (!std::filesystem::exists(adapter_path)) { - Json response; -@@ -167,11 +174,15 @@ Client::handle_load_adapter(Json& json) - return send_response(obuf_.p, p, response.toString()); - } - -+ // Create the adapter container -+ llama_lora_adapter_container adapter_container; -+ adapter_container.path = adapter_path; -+ adapter_container.scale = scale; -+ adapter_container.adapter = adapter; -+ - // Store the adapter -- int index = g_lora_adapters_count; -- g_lora_adapters[index].adapter = adapter; -- g_lora_adapters[index].scale = scale; -- g_lora_adapters_count++; -+ int index = (int)::g_lora_adapters.size(); -+ ::g_lora_adapters.push_back(adapter_container); - - SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); - -@@ -181,7 +192,7 @@ Client::handle_load_adapter(Json& json) - response["index"] = index; - response["path"] = adapter_path; - response["scale"] = scale; -- response["total_adapters"] = g_lora_adapters_count; -+ response["total_adapters"] = (int)::g_lora_adapters.size(); - - char* p = append_http_response_message(obuf_.p, 200); - p = stpcpy(p, "Content-Type: application/json\r\n"); -@@ -192,18 +203,16 @@ bool - Client::handle_clear_adapters() - { - // Clear all loaded adapters -- SLOG("clearing all %d LoRA adapter(s)", g_lora_adapters_count); -+ SLOG("clearing all %d LoRA adapter(s)", (int)::g_lora_adapters.size()); - -- for (int i = 0; i < g_lora_adapters_count; i++) { -- if (g_lora_adapters[i].adapter) { -- llama_lora_adapter_free(g_lora_adapters[i].adapter); -- g_lora_adapters[i].adapter = nullptr; -- g_lora_adapters[i].scale = 0.0f; -+ int cleared_count = (int)::g_lora_adapters.size(); -+ for (auto& la : ::g_lora_adapters) { -+ if (la.adapter) { -+ llama_lora_adapter_free(la.adapter); - } - } - -- int cleared_count = g_lora_adapters_count; -- g_lora_adapters_count = 0; -+ ::g_lora_adapters.clear(); - - SLOG("cleared %d LoRA adapter(s)", cleared_count); - -@@ -225,11 +234,6 @@ Client::handle_upstream_lora_apply(Json& json) - std::vector& json_array = json.getArray(); - SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); - -- // First, reset all adapter scales to 0.0 (disabled) -- for (int i = 0; i < g_lora_adapters_count; i++) { -- g_lora_adapters[i].applied = false; -- } -- - // Process each entry in the array - for (size_t i = 0; i < json_array.size(); i++) { - Json& entry = json_array[i]; -@@ -246,22 +250,21 @@ Client::handle_upstream_lora_apply(Json& json) - float scale = entry["scale"].getNumber(); - - // Validate ID range -- if (id < 0 || id >= g_lora_adapters_count) { -+ if (id < 0 || id >= (int)::g_lora_adapters.size()) { - return send_error(400, "Invalid adapter ID"); - } - - // Update the adapter configuration -- g_lora_adapters[id].scale = scale; -- g_lora_adapters[id].applied = (scale > 0.0f); -+ ::g_lora_adapters[id].scale = scale; - - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); - SLOG("set LoRA adapter %d ('%s') scale to %s", -- id, g_lora_adapters[id].name.c_str(), 
scale_buf); -+ id, ::g_lora_adapters[id].path.c_str(), scale_buf); - } - -- // Re-apply LoRA adapters to all active slots with updated scales -- SLOG("re-applying LoRA adapters to all active slots"); -+ // Re-apply LoRA adapters to all active slots with updated scales using llama.cpp unified function -+ SLOG("re-applying LoRA adapters to all active slots using llama.cpp unified function"); - Slots* slots = worker_->server_->slots_; - - // Lock the slots to prevent concurrent access during LoRA re-application -@@ -271,32 +274,13 @@ Client::handle_upstream_lora_apply(Json& json) - Slot* slot = slots->slots_[i].get(); - if (slot->ctx_) { - SLOG("re-applying LoRA adapters to slot #%d", slot->id_); -+ llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); - -- // Clear existing LoRA adapters from this context -- llama_lora_adapter_clear(slot->ctx_); -- -- // Use the same approach as slot initialization: get all adapters via the function -- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; -- float scales[MAX_LORA_ADAPTERS]; -- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); -- -- SLOG("got %d LoRA adapters from llamafiler_get_lora_adapters for slot #%d", adapter_count, slot->id_); -- -- // Re-apply all adapters with their current scales -- for (int j = 0; j < adapter_count; ++j) { -- char scale_buf[32]; -- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[j]); -- SLOG("processing LoRA adapter %d with scale %s", j, scale_buf); -- if (scales[j] > 0.0f) { -- if (llama_lora_adapter_set(slot->ctx_, adapters[j], scales[j]) != 0) { -- SLOG("failed to re-apply LoRA adapter %d to slot #%d", j, slot->id_); -- } else { -- SLOG("re-applied LoRA adapter %d to slot #%d with scale %s", j, slot->id_, scale_buf); -- } -- } else { -- SLOG("skipping LoRA adapter %d (scale %s <= 0)", j, scale_buf); -- } -- } -+ // CRITICAL: Mark slot for refresh to handle LoRA changes properly -+ // The slot's prefill() mechanism will intelligently preserve system prompts -+ // and only re-evaluate what's necessary when the next request comes in -+ slot->mark_for_refresh(); -+ SLOG("marked slot #%d for refresh after LoRA update", slot->id_); - } - } - -@@ -307,12 +291,13 @@ Client::handle_upstream_lora_apply(Json& json) - Json response; - response.setArray(); - std::vector& response_array = response.getArray(); -- for (int i = 0; i < g_lora_adapters_count; i++) { -+ -+ for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { - Json adapter; - adapter.setObject(); -- adapter["id"] = i; -- adapter["path"] = g_lora_adapters[i].name; -- adapter["scale"] = g_lora_adapters[i].scale; -+ adapter["id"] = (int)i; -+ adapter["path"] = ::g_lora_adapters[i].path; -+ adapter["scale"] = ::g_lora_adapters[i].scale; - response_array.push_back(adapter); - } - -diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp -index a21c80961..89ae3f12d 100644 ---- a/llamafile/server/prog.cpp -+++ b/llamafile/server/prog.cpp -@@ -26,31 +26,21 @@ - #include "llamafile/server/tokenbucket.h" - #include "llamafile/server/utils.h" - #include "llamafile/version.h" -+#include "llama.cpp/common.h" - #include - #include - --// Global LoRA adapter storage for multiple adapters --#define MAX_LORA_ADAPTERS 8 --#include --struct lora_adapter_container { -- struct llama_lora_adapter* adapter; -- float scale; -- std::string name; // Model/adapter name for identification -- bool applied; // Whether this adapter is currently applied to slots --}; -- --// Make these externally accessible for HTTP endpoint --struct 
lora_adapter_container g_lora_adapters[MAX_LORA_ADAPTERS] = {}; --int g_lora_adapters_count = 0; -+// Global LoRA adapter storage using llama.cpp structures -+std::vector g_lora_adapters; - - // Function to get the first global LoRA adapter for backward compatibility - extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() { -- return g_lora_adapters_count > 0 ? g_lora_adapters[0].adapter : nullptr; -+ return g_lora_adapters.empty() ? nullptr : g_lora_adapters[0].adapter; - } - - // Function to get all LoRA adapters and their count - extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) { -- int count = g_lora_adapters_count < max_adapters ? g_lora_adapters_count : max_adapters; -+ int count = std::min((int)g_lora_adapters.size(), max_adapters); - for (int i = 0; i < count; i++) { - adapters[i] = g_lora_adapters[i].adapter; - scales[i] = g_lora_adapters[i].scale; -@@ -129,38 +119,31 @@ main(int argc, char* argv[]) - char scale_buf[32]; - snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale); - -- // Generate model name from filename -+ // Generate model name from filename for identification - const char* path = FLAG_lora_adapters[i].path; - const char* filename = strrchr(path, '/'); - filename = filename ? filename + 1 : path; - -- // Remove file extension for cleaner name -- std::string model_name(filename); -- size_t dot_pos = model_name.find_last_of('.'); -- if (dot_pos != std::string::npos) { -- model_name = model_name.substr(0, dot_pos); -- } -- - SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, -- model_name.c_str(), path, scale_buf); -- -- g_lora_adapters[i].adapter = llama_lora_adapter_init(model, path); -- g_lora_adapters[i].scale = FLAG_lora_adapters[i].scale; -- g_lora_adapters[i].name = model_name; -- g_lora_adapters[i].applied = !FLAG_lora_init_without_apply; // Apply unless flag is set -+ filename, path, scale_buf); -+ -+ llama_lora_adapter_container adapter_container; -+ adapter_container.path = std::string(path); -+ adapter_container.scale = FLAG_lora_adapters[i].scale; -+ adapter_container.adapter = llama_lora_adapter_init(model, path); - -- if (!g_lora_adapters[i].adapter) { -+ if (!adapter_container.adapter) { - fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); - // Cleanup previously loaded adapters -- for (int j = 0; j < i; j++) { -- if (g_lora_adapters[j].adapter) { -- llama_lora_adapter_free(g_lora_adapters[j].adapter); -+ for (auto& la : g_lora_adapters) { -+ if (la.adapter) { -+ llama_lora_adapter_free(la.adapter); - } - } - llama_free_model(model); - exit(1); - } -- g_lora_adapters_count++; -+ g_lora_adapters.push_back(adapter_container); - } - - if (FLAG_lora_init_without_apply) { -@@ -203,9 +186,9 @@ main(int argc, char* argv[]) - delete slots; - - // Cleanup LoRA adapters -- for (int i = 0; i < g_lora_adapters_count; i++) { -- if (g_lora_adapters[i].adapter) { -- llama_lora_adapter_free(g_lora_adapters[i].adapter); -+ for (auto& la : g_lora_adapters) { -+ if (la.adapter) { -+ llama_lora_adapter_free(la.adapter); - } - } - -diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp -index 55138417b..a081d69df 100644 ---- a/llamafile/server/slot.cpp -+++ b/llamafile/server/slot.cpp -@@ -18,6 +18,7 @@ - #include "slot.h" - #include "llama.cpp/llava/clip.h" - #include "llama.cpp/llava/llava.h" -+#include "llama.cpp/common.h" - #include "llamafile/image.h" - #include "llamafile/llama.h" - #include 
"llamafile/llamafile.h" -@@ -32,6 +33,9 @@ - #include - #include - -+// External declaration for global LoRA adapter storage -+extern std::vector g_lora_adapters; -+ - namespace lf { - namespace server { - -@@ -79,7 +83,7 @@ Slot::describe_error(int err) - } - } - --Slot::Slot(int id, llama_model* model) : id_(id), model_(model) -+Slot::Slot(int id, llama_model* model) : id_(id), model_(model), needs_refresh_(false) - { - dll_init(&elem_); - last_used_ = time(0); -@@ -126,24 +130,16 @@ Slot::start() - if (!(ctx_ = llama_new_context_with_model(model_, cparams))) - return false; - -- // Apply LoRA adapters if available -- struct llama_lora_adapter* adapters[MAX_LORA_ADAPTERS]; -- float scales[MAX_LORA_ADAPTERS]; -- int adapter_count = llamafiler_get_lora_adapters(adapters, scales, MAX_LORA_ADAPTERS); -- -- if (adapter_count > 0) { -- SLOG("applying %d LoRA adapter(s) to slot #%d", adapter_count, id_); -- for (int i = 0; i < adapter_count; i++) { -- if (llama_lora_adapter_set(ctx_, adapters[i], scales[i]) != 0) { -- SLOG("failed to apply LoRA adapter %d to slot #%d", i + 1, id_); -- llama_free(ctx_); -- ctx_ = nullptr; -- return false; -- } -- char scale_buf[32]; -- snprintf(scale_buf, sizeof(scale_buf), "%.2f", scales[i]); -- SLOG("applied LoRA adapter %d to slot #%d with scale %s", i + 1, id_, scale_buf); -- } -+ // Apply LoRA adapters if available using llama.cpp's unified function -+ if (!::g_lora_adapters.empty() && !FLAG_lora_init_without_apply) { -+ SLOG("applying %d LoRA adapter(s) to slot #%d using llama.cpp unified function", -+ (int)::g_lora_adapters.size(), id_); -+ llama_lora_adapters_apply(ctx_, ::g_lora_adapters); -+ } else if (!::g_lora_adapters.empty() && FLAG_lora_init_without_apply) { -+ // When --lora-init-without-apply is set, explicitly clear any LoRA state -+ // to ensure no residual LoRA effects from model initialization -+ SLOG("clearing LoRA state for slot #%d (--lora-init-without-apply mode)", id_); -+ llama_lora_adapter_clear(ctx_); - } - - if (FLAG_mmproj) -@@ -314,6 +310,15 @@ Slot::prefill(const std::vector& atoms, const ProgressCallback& progress) - if (!ctx_) - return uninitialized; - -+ // Check if we need to refresh due to LoRA adapter changes -+ if (needs_refresh_) { -+ SLOG("Refreshing slot due to LoRA adapter changes"); -+ llama_kv_cache_clear(ctx_); -+ history_.clear(); -+ needs_refresh_ = false; -+ // Fall through to normal prefill logic with cleared state -+ } -+ - // handle special case of empty prefill - if (atoms.empty()) { - llama_kv_cache_clear(ctx_); -@@ -458,5 +463,11 @@ Slot::dump(std::string* result) - } - } - -+void -+Slot::mark_for_refresh() -+{ -+ needs_refresh_ = true; -+} -+ - } // namespace server - } // namespace lf -diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h -index e8816c900..104aa7623 100644 ---- a/llamafile/server/slot.h -+++ b/llamafile/server/slot.h -@@ -23,7 +23,6 @@ - #include - - #define SLOT(e) DLL_CONTAINER(Slot, elem_, e) --#define MAX_LORA_ADAPTERS 8 - - struct llama_context; - struct llama_model; -@@ -66,6 +65,7 @@ struct Slot - llama_context* ctx_ = nullptr; - std::vector history_; - std::string system_fingerprint_; -+ bool needs_refresh_ = false; - - ~Slot(); - Slot(int, llama_model*); -@@ -79,6 +79,7 @@ struct Slot - int prefill(const std::vector&, const ProgressCallback& = nullptr); - void tokenize(std::vector*, std::string_view, bool); - void dump(std::string*); -+ void mark_for_refresh(); - }; - - } // namespace server