2 changes: 2 additions & 0 deletions .gitignore
@@ -10,3 +10,5 @@
/trace.json

/*.log
.models
.vscode
225 changes: 147 additions & 78 deletions README.md

Large diffs are not rendered by default.

51 changes: 44 additions & 7 deletions RELEASE.md
@@ -2,6 +2,43 @@

Making a Llamafile release involves a few steps, which are detailed in this document.

## What's New in This Release

### LoRA Adapter Support

This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, enabling llama.cpp-compatible fine-tuned adapters to be loaded and applied at inference time. Key features include:

- **Multiple LoRA Adapter Support**: Load and apply multiple LoRA adapters simultaneously with individual scaling factors
- **Dynamic Hot-Swapping API**: Adjust LoRA adapter scales in real-time during inference without restarting the server
- **Server Integration**: Full integration with the llamafile server (`--server` mode) for LoRA-enhanced inference
- **REST API Endpoints**:
  - `GET /lora-adapters`: View current adapter configuration
  - `POST /lora-adapters`: Update adapter scales dynamically
- **Compatible Flags**:
  - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0)
  - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor
  - `--lora-init-without-apply`: Load LoRA adapters without applying them, so they can be activated later via hot-swapping
- **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility
- **Thread-Safe Operations**: Hot-swapping includes proper mutex locking for concurrent access safety
- **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across the server lifecycle

Example usage:

```bash
# Single adapter with default scale
llamafile -m base_model.gguf --lora adapter.gguf --server

# Multiple adapters with different scales
llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server

# Dynamic scale adjustment via API
curl -X POST http://localhost:8080/lora-adapters \
-H "Content-Type: application/json" \
-d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1}]'
```

This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows with real-time adaptation capabilities.

The two primary artifacts of the release are the `llamafile-<version>.zip` and the binaries for the GitHub release.

## Release Process
Expand All @@ -10,13 +47,13 @@ Note: Step 2 and 3 are only needed if you are making a new release of the ggml-c

1. Update the version number in `version.h`
2. Build the ggml-cuda.so and ggml-rocm.so shared libraries on Linux. You need to do this for Llamafile and LocalScore. Llamafile defaults to TINYBLAS for CUDA, while LocalScore defaults to CUBLAS.
   - For Llamafile you can do this by running the scripts `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively.
   - For LocalScore you can do this by running the script `./localscore/cuda.sh`.
   - The files will be built and placed in your home directory.
3. Build the ggml-cuda.dll and ggml-rocm.dll shared libraries on Windows. You need to do this for Llamafile and LocalScore.
   - You can do this by running the scripts `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively.
   - For LocalScore you can do this by running the script `./localscore/cuda.bat`.
   - The files will be built and placed in the `build/release` directory.
4. Build the project with `make -j8`
5. Install the built project to your /usr/local/bin directory with `sudo make install PREFIX=/usr/local`

@@ -126,4 +163,4 @@ You can use the script to create the appropriately named binaries:

`./llamafile/release.sh -v <version> -s <source_dir> -d <dest_dir>`

Make sure to move the llamafile-<version>.zip file to the <dest_dir> as well, and you are good to release after you've tested.
51 changes: 51 additions & 0 deletions llamafile/flags.cpp
@@ -53,6 +53,7 @@ bool FLAG_tinyblas = false;
bool FLAG_trace = false;
bool FLAG_unsecure = false;
bool FLAG_v2 = false;
bool FLAG_lora_init_without_apply = false;
const char *FLAG_chat_template = "";
const char *FLAG_db = nullptr;
const char *FLAG_db_startup_sql = "PRAGMA journal_mode=WAL;"
@@ -65,6 +66,11 @@ const char *FLAG_model = nullptr;
const char *FLAG_prompt = nullptr;
const char *FLAG_url_prefix = "";
const char *FLAG_www_root = "/zip/www";
const char *FLAG_lora = nullptr;

// Multiple LoRA adapters support
struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS] = {0};
int FLAG_lora_adapters_count = 0;
double FLAG_token_rate = 1;
float FLAG_decay_growth = .01;
float FLAG_frequency_penalty = 0;
@@ -385,6 +391,51 @@ void llamafile_get_flags(int argc, char **argv) {
            continue;
        }

        //////////////////////////////////////////////////////////////////////
        // LoRA flags

        if (!strcmp(flag, "--lora")) {
            if (i == argc)
                missing("--lora");
            if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) {
                error("too many LoRA adapters (max 8)");
            }
            FLAG_lora_adapters[FLAG_lora_adapters_count].path = argv[i++];
            FLAG_lora_adapters[FLAG_lora_adapters_count].scale = 1.0f;
            FLAG_lora_adapters_count++;

            // Keep FLAG_lora for backward compatibility
            if (!FLAG_lora) {
                FLAG_lora = FLAG_lora_adapters[0].path;
            }
            continue;
        }

        if (!strcmp(flag, "--lora-scaled")) {
            if (i == argc)
                missing("--lora-scaled");
            const char *lora_adapter = argv[i++];
            if (i == argc)
                missing("--lora-scaled scale value");
            if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) {
                error("too many LoRA adapters (max 8)");
            }
            FLAG_lora_adapters[FLAG_lora_adapters_count].path = lora_adapter;
            FLAG_lora_adapters[FLAG_lora_adapters_count].scale = atof(argv[i++]);
            FLAG_lora_adapters_count++;

            // Keep FLAG_lora for backward compatibility
            if (!FLAG_lora) {
                FLAG_lora = FLAG_lora_adapters[0].path;
            }
            continue;
        }

        if (!strcmp(flag, "--lora-init-without-apply")) {
            FLAG_lora_init_without_apply = true;
            continue;
        }

        //////////////////////////////////////////////////////////////////////
        // model flags

13 changes: 13 additions & 0 deletions llamafile/llamafile.h
@@ -24,6 +24,7 @@ extern bool FLAG_trace;
extern bool FLAG_trap;
extern bool FLAG_unsecure;
extern bool FLAG_v2;
extern bool FLAG_lora_init_without_apply;
extern const char *FLAG_chat_template;
extern const char *FLAG_db;
extern const char *FLAG_db_startup_sql;
@@ -36,6 +37,18 @@ extern const char *FLAG_prompt;
extern const char *FLAG_url_prefix;
extern const char *FLAG_www_root;
extern double FLAG_token_rate;
extern const char *FLAG_lora;

// LoRA adapter info structure to match llama.cpp
struct llamafile_lora_adapter_info {
    const char *path;
    const char *name; // Model/adapter name for identification
    float scale;
};

#define MAX_LORA_ADAPTERS 8
extern struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS];
extern int FLAG_lora_adapters_count;
extern float FLAG_decay_growth;
extern float FLAG_frequency_penalty;
extern float FLAG_presence_penalty;
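For orientation, the sketch below shows one way the flags and struct declared above could be consumed at model-load time. It is illustrative only: the actual server wiring is not shown in this excerpt, and the `llama_lora_adapter_*` calls and include paths are assumed from the llama.cpp API that `client.h` forward-declares later in this diff.

```cpp
// Hypothetical consumer of FLAG_lora_adapters (not part of this diff).
// Assumes llama.cpp's llama_lora_adapter_init/llama_lora_adapter_set API.
#include "llama.cpp/llama.h"
#include "llamafile/llamafile.h"

static bool load_lora_adapters(llama_model *model, llama_context *ctx) {
    for (int i = 0; i < FLAG_lora_adapters_count; ++i) {
        llama_lora_adapter *adapter =
            llama_lora_adapter_init(model, FLAG_lora_adapters[i].path);
        if (!adapter)
            return false; // adapter file missing or incompatible with the model
        // --lora-init-without-apply loads adapters but defers activation so the
        // /lora-adapters endpoint can hot-swap them later.
        if (!FLAG_lora_init_without_apply)
            llama_lora_adapter_set(ctx, adapter, FLAG_lora_adapters[i].scale);
    }
    return true;
}
```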
2 changes: 2 additions & 0 deletions llamafile/server/client.cpp
@@ -705,6 +705,8 @@ Client::dispatcher()
        return slotz();
    if (p1 == "flagz")
        return flagz();
    if (p1 == "lora-adapters")
        return lora_adapters();

#if 0
// TODO: implement frontend for database
14 changes: 14 additions & 0 deletions llamafile/server/client.h
@@ -25,6 +25,7 @@
#include <optional>
#include <string>
#include <sys/resource.h>
#include "llama.cpp/common.h"

#define HasHeader(H) (!!msg_.headers[H].a)
#define HeaderData(H) (ibuf_.p + msg_.headers[H].a)
@@ -35,6 +36,11 @@
SlicesEqualCase(S, strlen(S), HeaderData(H), HeaderLength(H))

struct llama_model;
struct llama_lora_adapter;

namespace jt {
struct Json;
}

namespace lf {
namespace server {
@@ -121,6 +127,11 @@ struct Client

    bool slotz() __wur;
    bool flagz() __wur;
    bool lora_adapters() __wur;
    bool handle_apply_adapters(jt::Json&) __wur;
    bool handle_load_adapter(jt::Json&) __wur;
    bool handle_clear_adapters() __wur;
    bool handle_upstream_lora_apply(jt::Json&) __wur;
    bool db_chat(int64_t) __wur;
    bool db_chats() __wur;
    bool db_message(int64_t) __wur;
@@ -129,3 +140,6 @@

} // namespace server
} // namespace lf

// Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp)
// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead
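To make the hot-swap behavior described above concrete, here is a rough sketch of what a thread-safe scale update behind `POST /lora-adapters` could look like. The mutex, the adapter vector, and the pre-parsed update struct are assumptions for illustration; the real handlers (`lora_adapters()`, `handle_apply_adapters()`) parse `jt::Json` and live in the server implementation, which this diff only declares.

```cpp
// Illustrative hot-swap path (not the actual server code). Assumes llama.cpp's
// llama_lora_adapter_clear/llama_lora_adapter_set API and that the request body
// has already been parsed into (id, scale) pairs.
#include <mutex>
#include <vector>
#include "llama.cpp/llama.h"

struct lora_scale_update {
    int id;      // index into the adapters loaded at startup
    float scale; // 0 effectively disables the adapter
};

static std::mutex g_lora_mutex;                      // guards adapter state
static std::vector<llama_lora_adapter *> g_adapters; // filled at startup

static void apply_lora_scales(llama_context *ctx,
                              const std::vector<lora_scale_update> &updates) {
    std::lock_guard<std::mutex> lock(g_lora_mutex); // serialize with other requests
    llama_lora_adapter_clear(ctx);                  // drop the currently applied set
    for (const lora_scale_update &u : updates)
        if (u.id >= 0 && u.id < (int)g_adapters.size() && u.scale != 0.0f)
            llama_lora_adapter_set(ctx, g_adapters[(size_t)u.id], u.scale);
}
```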