diff --git a/.gitignore b/.gitignore index d928dde4e9..95709537cd 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ /trace.json /*.log +.models +.vscode \ No newline at end of file diff --git a/README.md b/README.md index e19d0a6014..259b462ecb 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ ever leaves your computer. 2. Open your computer's terminal. 3. If you're using macOS, Linux, or BSD, you'll need to grant permission -for your computer to execute this new file. (You only need to do this -once.) + for your computer to execute this new file. (You only need to do this + once.) ```sh chmod +x llava-v1.5-7b-q4.llamafile @@ -48,10 +48,10 @@ chmod +x llava-v1.5-7b-q4.llamafile ``` 6. Your browser should open automatically and display a chat interface. -(If it doesn't, just open your browser and point it at http://localhost:8080) + (If it doesn't, just open your browser and point it at http://localhost:8080) 7. When you're done chatting, return to your terminal and hit -`Control-C` to shut down llamafile. + `Control-C` to shut down llamafile. **Having trouble? See the "Gotchas" section below.** @@ -103,25 +103,25 @@ The response that's printed should look like the following: ```json { - "choices" : [ - { - "finish_reason" : "stop", - "index" : 0, - "message" : { - "content" : "There once was a programmer named Mike\nWho wrote code that would often choke\nHe used try and except\nTo handle each step\nAnd his program ran without any hike.", - "role" : "assistant" - } + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "There once was a programmer named Mike\nWho wrote code that would often choke\nHe used try and except\nTo handle each step\nAnd his program ran without any hike.", + "role": "assistant" } - ], - "created" : 1704199256, - "id" : "chatcmpl-Dt16ugf3vF8btUZj9psG7To5tc4murBU", - "model" : "LLaMA_CPP", - "object" : "chat.completion", - "usage" : { - "completion_tokens" : 38, - "prompt_tokens" : 78, - "total_tokens" : 116 - } + } + ], + "created": 1704199256, + "id": "chatcmpl-Dt16ugf3vF8btUZj9psG7To5tc4murBU", + "model": "LLaMA_CPP", + "object": "chat.completion", + "usage": { + "completion_tokens": 38, + "prompt_tokens": 78, + "total_tokens": 116 + } } ``` @@ -176,32 +176,102 @@ llamafile --server --v2 --help llamafile --server --v2 ``` +## LoRA Adapter Support + +Llamafile supports LoRA (Low-Rank Adaptation) adapters, allowing you to fine-tune models with adapter layers applied on top of the base model. This is compatible with adapters created for llama.cpp. + +### Using LoRA Adapters + +To use LoRA adapters with llamafile server, use the `--lora` and `--lora-scaled` flags: + +```bash +# Single adapter with default scale (1.0) +llamafile -m base_model.gguf --lora adapter.gguf --server + +# Single adapter with custom scale +llamafile -m base_model.gguf --lora-scaled adapter.gguf 0.8 --server + +# Multiple adapters with different scales +llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server +``` + +### LoRA Adapter Flags + +- `--lora [FNAME]`: Apply a LoRA adapter from the specified file (default scale: 1.0) +- `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with a custom scaling factor +- `--lora-init-without-apply`: Load LoRA adapters without applying (lora hot-swapping) + +### Dynamic LoRA Adapter Management (Hot-Swapping) + +When running llamafile in server mode, you can dynamically adjust LoRA adapter scales during inference without restarting the server. 
This enables hot-swapping of adapter configurations on the fly. + +#### API Endpoints + +- **GET** `/lora-adapters`: Returns current LoRA adapter configuration +- **POST** `/lora-adapters`: Updates LoRA adapter scales with JSON payload + +#### Examples + +**View current LoRA adapter configuration:** + +```bash +curl http://localhost:8080/lora-adapters +``` + +**Update LoRA adapter scales:** + +```bash +curl -X POST http://localhost:8080/lora-adapters \ + -H "Content-Type: application/json" \ + -d '[ + {"id": 0, "scale": 0.8}, + {"id": 1, "scale": 1.2} + ]' +``` + +The API returns JSON responses with the current adapter configuration: + +```json +[ + { "id": 0, "path": "adapter1.gguf", "scale": 0.8 }, + { "id": 1, "path": "adapter2.gguf", "scale": 1.2 } +] +``` + +### Important Notes + +- LoRA adapters are automatically applied to each inference slot when they start +- Memory mapping (`mmap`) is automatically disabled when using LoRA adapters for compatibility +- Multiple adapters can be combined by using multiple `--lora` and `--lora-scaled` flags +- The base model specified by `-m` serves as the foundation for the LoRA adapter(s) +- Scale changes via the API take effect immediately for new inference requests +- Hot-swapping allows real-time fine-tuning of model behavior without server restart + ## Other example llamafiles We also provide example llamafiles for other models, so you can easily try out llamafile with different kinds of LLMs. -| Model | Size | License | llamafile | other quants | -| --- | --- | --- | --- | --- | -| LLaMA 3.2 1B Instruct | 1.11 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-1B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile) | -| LLaMA 3.2 3B Instruct | 2.62 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-3B-Instruct.Q6\_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/Llama-3.2-3B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile) | -| LLaMA 3.1 8B Instruct | 5.23 GB | [LLaMA 3.1](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.1-8B-Instruct.Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile) | -| Gemma 3 1B Instruct | 1.32 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-1b-it.Q6\_K.llamafile](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile/resolve/main/google_gemma-3-1b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile) | -| Gemma 3 4B Instruct | 3.50 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-4b-it.Q6\_K.llamafile](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile/resolve/main/google_gemma-3-4b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile) | -| Gemma 3 12B Instruct | 7.61 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | 
[gemma-3-12b-it.Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile/resolve/main/google_gemma-3-12b-it-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile) | -| QwQ 32B | 7.61 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen\_QwQ-32B-Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/QwQ-32B-llamafile/resolve/main/Qwen_QwQ-32B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/QwQ-32B-llamafile) | -| R1 Distill Qwen 14B | 9.30 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Qwen-14B-Q4\_K\_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile)| -| R1 Distill Llama 8B | 5.23 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Llama-8B-Q4\_K\_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile)| -| LLaVA 1.5 | 3.97 GB | [LLaMA 2](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) | [llava-v1.5-7b-q4.llamafile](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/llava-v1.5-7b-q4.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile) | -| Mistral-7B-Instruct v0.3| 4.42 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mistral-7b-instruct-v0.3.Q4\_0.llamafile](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile/resolve/main/Mistral-7B-Instruct-v0.3.Q4_0.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile) | -| Granite 3.2 8B Instruct | 5.25 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [granite-3.2-8b-instruct-Q4\_K\_M.llamafile](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile/resolve/main/granite-3.2-8b-instruct-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile) | -| Phi-3-mini-4k-instruct | 7.67 GB | [Apache 2.0](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/blob/main/LICENSE) | [Phi-3-mini-4k-instruct.F16.llamafile](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/resolve/main/Phi-3-mini-4k-instruct.F16.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile) | -| Mixtral-8x7B-Instruct | 30.03 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mixtral-8x7b-instruct-v0.1.Q5\_K\_M.llamafile](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile) | -| OLMo-7B | 5.68 GB | [Apache 2.0](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/blob/main/LICENSE) | [OLMo-7B-0424.Q6\_K.llamafile](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/resolve/main/OLMo-7B-0424.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile) | -| *Text Embedding Models* | | | | | -| E5-Mistral-7B-Instruct | 5.16 GB | [MIT](https://choosealicense.com/licenses/mit/) | 
[e5-mistral-7b-instruct-Q5_K_M.llamafile](https://huggingface.co/Mozilla/e5-mistral-7b-instruct/resolve/main/e5-mistral-7b-instruct-Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/e5-mistral-7b-instruct) | -| mxbai-embed-large-v1 | 0.7 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mxbai-embed-large-v1-f16.llamafile](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile/resolve/main/mxbai-embed-large-v1-f16.llamafile?download=true) | [See HF Repo](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile) | - +| Model | Size | License | llamafile | other quants | +| ------------------------ | -------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | +| LLaMA 3.2 1B Instruct | 1.11 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-1B-Instruct.Q6_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/blob/main/Llama-3.2-1B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile) | +| LLaMA 3.2 3B Instruct | 2.62 GB | [LLaMA 3.2](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.2-3B-Instruct.Q6_K.llamafile](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile/blob/main/Llama-3.2-3B-Instruct.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Llama-3.2-3B-Instruct-llamafile) | +| LLaMA 3.1 8B Instruct | 5.23 GB | [LLaMA 3.1](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/blob/main/LICENSE) | [Llama-3.1-8B-Instruct.Q4_K_M.llamafile](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile/resolve/main/Meta-Llama-3.1-8B-Instruct.Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Meta-Llama-3.1-8B-Instruct-llamafile) | +| Gemma 3 1B Instruct | 1.32 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-1b-it.Q6_K.llamafile](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile/resolve/main/google_gemma-3-1b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-1b-it-llamafile) | +| Gemma 3 4B Instruct | 3.50 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-4b-it.Q6_K.llamafile](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile/resolve/main/google_gemma-3-4b-it-Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-4b-it-llamafile) | +| Gemma 3 12B Instruct | 7.61 GB | [Gemma 3](https://ai.google.dev/gemma/terms) | [gemma-3-12b-it.Q4_K_M.llamafile](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile/resolve/main/google_gemma-3-12b-it-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/gemma-3-12b-it-llamafile) | +| QwQ 32B | 7.61 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [Qwen_QwQ-32B-Q4_K_M.llamafile](https://huggingface.co/Mozilla/QwQ-32B-llamafile/resolve/main/Qwen_QwQ-32B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/QwQ-32B-llamafile) | +| R1 Distill Qwen 14B | 9.30 GB | [MIT](https://choosealicense.com/licenses/mit/) | 
[DeepSeek-R1-Distill-Qwen-14B-Q4_K_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Qwen-14B-llamafile) | +| R1 Distill Llama 8B | 5.23 GB | [MIT](https://choosealicense.com/licenses/mit/) | [DeepSeek-R1-Distill-Llama-8B-Q4_K_M](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/DeepSeek-R1-Distill-Llama-8B-llamafile) | +| LLaVA 1.5 | 3.97 GB | [LLaMA 2](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) | [llava-v1.5-7b-q4.llamafile](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile/resolve/main/llava-v1.5-7b-q4.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/llava-v1.5-7b-llamafile) | +| Mistral-7B-Instruct v0.3 | 4.42 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mistral-7b-instruct-v0.3.Q4_0.llamafile](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile/resolve/main/Mistral-7B-Instruct-v0.3.Q4_0.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.3-llamafile) | +| Granite 3.2 8B Instruct | 5.25 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [granite-3.2-8b-instruct-Q4_K_M.llamafile](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile/resolve/main/granite-3.2-8b-instruct-Q4_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/granite-3.2-8b-instruct-llamafile) | +| Phi-3-mini-4k-instruct | 7.67 GB | [Apache 2.0](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/blob/main/LICENSE) | [Phi-3-mini-4k-instruct.F16.llamafile](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile/resolve/main/Phi-3-mini-4k-instruct.F16.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Phi-3-mini-4k-instruct-llamafile) | +| Mixtral-8x7B-Instruct | 30.03 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile/resolve/main/mixtral-8x7b-instruct-v0.1.Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/Mixtral-8x7B-Instruct-v0.1-llamafile) | +| OLMo-7B | 5.68 GB | [Apache 2.0](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/blob/main/LICENSE) | [OLMo-7B-0424.Q6_K.llamafile](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile/resolve/main/OLMo-7B-0424.Q6_K.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/OLMo-7B-0424-llamafile) | +| _Text Embedding Models_ | | | | | +| E5-Mistral-7B-Instruct | 5.16 GB | [MIT](https://choosealicense.com/licenses/mit/) | [e5-mistral-7b-instruct-Q5_K_M.llamafile](https://huggingface.co/Mozilla/e5-mistral-7b-instruct/resolve/main/e5-mistral-7b-instruct-Q5_K_M.llamafile?download=true) | [See HF repo](https://huggingface.co/Mozilla/e5-mistral-7b-instruct) | +| mxbai-embed-large-v1 | 0.7 GB | [Apache 2.0](https://choosealicense.com/licenses/apache-2.0/) | [mxbai-embed-large-v1-f16.llamafile](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile/resolve/main/mxbai-embed-large-v1-f16.llamafile?download=true) | [See HF Repo](https://huggingface.co/Mozilla/mxbai-embed-large-v1-llamafile) | Here is an example for the Mistral command-line llamafile: @@ -211,9 +281,9 @@ Here is an 
example for the Mistral command-line llamafile: And here is an example for WizardCoder-Python command-line llamafile: -```sh +````sh ./wizardcoder-python-13b.llamafile --temp 0 -e -r '```\n' -p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n' -``` +```` And here's an example for the LLaVA command-line llamafile: @@ -246,38 +316,38 @@ This is all accomplished by combining llama.cpp with Cosmopolitan Libc, which provides some useful capabilities: 1. llamafiles can run on multiple CPU microarchitectures. We -added runtime dispatching to llama.cpp that lets new Intel systems use -modern CPU features without trading away support for older computers. + added runtime dispatching to llama.cpp that lets new Intel systems use + modern CPU features without trading away support for older computers. 2. llamafiles can run on multiple CPU architectures. We do -that by concatenating AMD64 and ARM64 builds with a shell script that -launches the appropriate one. Our file format is compatible with WIN32 -and most UNIX shells. It's also able to be easily converted (by either -you or your users) to the platform-native format, whenever required. + that by concatenating AMD64 and ARM64 builds with a shell script that + launches the appropriate one. Our file format is compatible with WIN32 + and most UNIX shells. It's also able to be easily converted (by either + you or your users) to the platform-native format, whenever required. 3. llamafiles can run on six OSes (macOS, Windows, Linux, -FreeBSD, OpenBSD, and NetBSD). If you make your own llama files, you'll -only need to build your code once, using a Linux-style toolchain. The -GCC-based compiler we provide is itself an Actually Portable Executable, -so you can build your software for all six OSes from the comfort of -whichever one you prefer most for development. + FreeBSD, OpenBSD, and NetBSD). If you make your own llama files, you'll + only need to build your code once, using a Linux-style toolchain. The + GCC-based compiler we provide is itself an Actually Portable Executable, + so you can build your software for all six OSes from the comfort of + whichever one you prefer most for development. 4. The weights for an LLM can be embedded within the llamafile. -We added support for PKZIP to the GGML library. This lets uncompressed -weights be mapped directly into memory, similar to a self-extracting -archive. It enables quantized weights distributed online to be prefixed -with a compatible version of the llama.cpp software, thereby ensuring -its originally observed behaviors can be reproduced indefinitely. + We added support for PKZIP to the GGML library. This lets uncompressed + weights be mapped directly into memory, similar to a self-extracting + archive. It enables quantized weights distributed online to be prefixed + with a compatible version of the llama.cpp software, thereby ensuring + its originally observed behaviors can be reproduced indefinitely. 5. Finally, with the tools included in this project you can create your -*own* llamafiles, using any compatible model weights you want. You can -then distribute these llamafiles to other people, who can easily make -use of them regardless of what kind of computer they have. + _own_ llamafiles, using any compatible model weights you want. You can + then distribute these llamafiles to other people, who can easily make + use of them regardless of what kind of computer they have. 
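+
+Here's a rough sketch of what item 5 looks like in practice. File names below
+are hypothetical, and this assumes the `zipalign` tool included with this
+project along with its `.args` convention for embedding default flags:
+
+```sh
+# start from the bare llamafile runtime (no weights embedded)
+cp /usr/local/bin/llamafile mymodel.llamafile
+
+# optional .args file with default flags, one argument per line
+cat > .args <<'EOF'
+-m
+mymodel.gguf
+EOF
+
+# store the weights (and .args) uncompressed inside the executable
+# so they can be mapped directly into memory at run time
+zipalign -j0 mymodel.llamafile mymodel.gguf .args
+
+# the result is a single self-contained portable executable
+./mymodel.llamafile
+```
+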
## Using llamafile with external weights Even though our example llamafiles have the weights built-in, you don't -*have* to use llamafile that way. Instead, you can download *just* the +_have_ to use llamafile that way. Instead, you can download _just_ the llamafile software (without any weights included) from our releases page. You can then use it alongside any external weights you may have on hand. External weights are particularly useful for Windows users because they @@ -294,7 +364,6 @@ curl -L -o mistral.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1 Windows users may need to change `./llamafile.exe` to `.\llamafile.exe` when running the above command. - ## Gotchas and troubleshooting On any platform, if your llamafile process is immediately killed, check @@ -309,13 +378,12 @@ If you use zsh and have trouble running llamafile, try saying `sh -c ./llamafile`. This is due to a bug that was fixed in zsh 5.9+. The same is the case for Python `subprocess`, old versions of Fish, etc. - #### Mac error "... cannot be opened because the developer cannot be verified" 1. Immediately launch System Settings, then go to Privacy & Security. llamafile should be listed at the bottom, with a button to Allow. -2. If not, then change your command in the Terminal to be `sudo spctl --master-disable; [llama launch command]; sudo spctl --master-enable`. This is because `--master-disable` disables _all_ checking, so you need to turn it back on after quitting llama. +2. If not, then change your command in the Terminal to be `sudo spctl --master-disable; [llama launch command]; sudo spctl --master-enable`. This is because `--master-disable` disables _all_ checking, so you need to turn it back on after quitting llama. -### Linux +### Linux On some Linux systems, you might get errors relating to `run-detectors` or WINE. This is due to `binfmt_misc` registrations. You can fix that by @@ -330,6 +398,7 @@ sudo sh -c "echo ':APE-jart:M::jartsr::/usr/bin/ape:' >/proc/sys/fs/binfmt_misc/ ``` ### Windows + As mentioned above, on Windows you may need to rename your llamafile by adding `.exe` to the filename. @@ -406,8 +475,8 @@ systems, llamafile extracts a small loader program named `ape` to `$TMPDIR/.llamafile` or `~/.ape-1.9` which is used to map your model into memory. -[1] Darwin kernel versions 15.6+ *should* be supported, but we currently - have no way of testing that. +[1] Darwin kernel versions 15.6+ _should_ be supported, but we currently +have no way of testing that. ## Supported CPUs @@ -464,7 +533,7 @@ On Linux, NVIDIA users will need to install the CUDA SDK (ideally using the shell script installer) and ROCm users need to install the HIP SDK. They're detected by looking to see if `nvcc` or `hipcc` are on the PATH. -If you have both an AMD GPU *and* an NVIDIA GPU in your machine, then +If you have both an AMD GPU _and_ an NVIDIA GPU in your machine, then you may need to qualify which one you want used, by passing either `--gpu amd` or `--gpu nvidia`. @@ -489,12 +558,12 @@ Here's an example of how to generate code for a libc function using the llama.cpp command line interface, utilizing WizardCoder-Python-13B weights: -```sh +````sh llamafile \ -m wizardcoder-python-13b-v1.0.Q8_0.gguf \ --temp 0 -r '}\n' -r '```\n' \ -e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n' -``` +```` Here's a similar example that instead utilizes Mistral-7B-Instruct weights for prose composition: @@ -648,12 +717,13 @@ commands will display that information when passing the `--help` flag. 
## Running llamafile with models downloaded by third-party applications -This section answers the question *"I already have a model downloaded locally by application X, can I use it with llamafile?"*. The general answer is "yes, as long as those models are locally stored in GGUF format" but its implementation can be more or less hacky depending on the application. A few examples (tested on a Mac) follow. +This section answers the question _"I already have a model downloaded locally by application X, can I use it with llamafile?"_. The general answer is "yes, as long as those models are locally stored in GGUF format" but its implementation can be more or less hacky depending on the application. A few examples (tested on a Mac) follow. ### LM Studio + [LM Studio](https://lmstudio.ai/) stores downloaded models in `~/.cache/lm-studio/models`, in subdirectories with the same name of the models (following HuggingFace's `account_name/model_name` format), with the same filename you saw when you chose to download the file. - So if you have downloaded e.g. the `llama-2-7b.Q2_K.gguf` file for `TheBloke/Llama-2-7B-GGUF`, you can run llamafile as follows: +So if you have downloaded e.g. the `llama-2-7b.Q2_K.gguf` file for `TheBloke/Llama-2-7B-GGUF`, you can run llamafile as follows: ``` cd ~/.cache/lm-studio/models/TheBloke/Llama-2-7B-GGUF @@ -666,7 +736,7 @@ When you download a new model with [ollama](https://ollama.com), all its metadat The manifest maps each file related to the model (e.g. GGUF weights, license, prompt template, etc) to a sha256 digest. The digest corresponding to the element whose `mediaType` is `application/vnd.ollama.image.model` is the one referring to the model's GGUF file. -Each sha256 digest is also used as a filename in the `~/.ollama/models/blobs` directory (if you look into that directory you'll see *only* those sha256-* filenames). This means you can directly run llamafile by passing the sha256 digest as the model filename. So if e.g. the `llama3:latest` GGUF file digest is `sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29`, you can run llamafile as follows: +Each sha256 digest is also used as a filename in the `~/.ollama/models/blobs` directory (if you look into that directory you'll see _only_ those sha256-\* filenames). This means you can directly run llamafile by passing the sha256 digest as the model filename. So if e.g. the `llama3:latest` GGUF file digest is `sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29`, you can run llamafile as follows: ``` cd ~/.ollama/models/blobs @@ -815,5 +885,4 @@ should that be desired. The llamafile logo on this page was generated with the assistance of DALL·E 3. - [![Star History Chart](https://api.star-history.com/svg?repos=Mozilla-Ocho/llamafile&type=Date)](https://star-history.com/#Mozilla-Ocho/llamafile&Date) diff --git a/RELEASE.md b/RELEASE.md index 76a1c5a1ae..8af0a68a40 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,6 +2,43 @@ There are a few steps in making a Llamafile release which will be detailed in this document. +## What's New in This Release + +### LoRA Adapter Support + +This release adds comprehensive support for LoRA (Low-Rank Adaptation) adapters, enabling fine-tuning capabilities compatible with llama.cpp. 
Key features include: + +- **Multiple LoRA Adapter Support**: Load and apply multiple LoRA adapters simultaneously with individual scaling factors +- **Dynamic Hot-Swapping API**: Adjust LoRA adapter scales in real-time during inference without restarting the server +- **Server Integration**: Full integration with the llamafile server (`--server` mode) for LoRA-enhanced inference +- **REST API Endpoints**: + - `GET /lora-adapters`: View current adapter configuration + - `POST /lora-adapters`: Update adapter scales dynamically +- **Compatible Flags**: + - `--lora [FNAME]`: Apply a LoRA adapter with default scale (1.0) + - `--lora-scaled [FNAME] [SCALE]`: Apply a LoRA adapter with custom scaling factor + - `--lora-init-without-apply`: Load LoRA adapters without applying (lora hot-swapping) +- **Automatic Optimizations**: Memory mapping is automatically disabled when using LoRA adapters for optimal compatibility +- **Thread-Safe Operations**: Hot-swapping includes proper mutex locking for concurrent access safety +- **Clean Resource Management**: Proper loading, application, and cleanup of LoRA adapters across server lifecycle + +Example usage: + +```bash +# Single adapter with default scale +llamafile -m base_model.gguf --lora adapter.gguf --server + +# Multiple adapters with different scales +llamafile -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5 --server + +# Dynamic scale adjustment via API +curl -X POST http://localhost:8080/lora-adapters \ + -H "Content-Type: application/json" \ + -d '[{"id": 0, "scale": 0.8}, {"id": 1, "scale": 1}]' +``` + +This implementation follows llama.cpp patterns for maximum compatibility and provides a foundation for advanced fine-tuning workflows with real-time adaptation capabilities. + The two primary artifacts of the release are the `llamafile-.zip` and the binaries for the GitHub release. ## Release Process @@ -10,13 +47,13 @@ Note: Step 2 and 3 are only needed if you are making a new release of the ggml-c 1. Update the version number in `version.h` 2. Build the ggml-cuda.so and ggml-rocm.so shared libraries on Linux. You need to do this for Llamafile and LocalScore. Llamafile uses TINYBLAS as a default and LocalScore uses CUBLAS as a default for CUDA. - - For Llamafile you can do this by running the script `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively. - - For LocalScore you can do this by running the script `./localscore/cuda.sh`. - - The files will be built and placed your home directory. + - For Llamafile you can do this by running the script `./llamafile/cuda.sh` and `./llamafile/rocm.sh` respectively. + - For LocalScore you can do this by running the script `./localscore/cuda.sh`. + - The files will be built and placed your home directory. 3. Build the ggml-cuda.dll and ggml-rocm.dll shared libraries on Windows. You need to do this for Llamafile and LocalScore. - - You can do this by running the script `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively. - - For LocalScore you can do this by running the script `./localscore/cuda.bat`. - - The files will be built and placed in the `build/release` directory. + - You can do this by running the script `./llamafile/cuda.bat` and `./llamafile/rocm.bat` respectively. + - For LocalScore you can do this by running the script `./localscore/cuda.bat`. + - The files will be built and placed in the `build/release` directory. 4. Build the project with `make -j8` 5. 
Install the built project to your /usr/local/bin directory with `sudo make install PREFIX=/usr/local` @@ -126,4 +163,4 @@ You can use the script to create the appropriately named binaries: `./llamafile/release.sh -v -s -d ` -Make sure to move the llamafile-.zip file to the as well, and you are good to release after you've tested. \ No newline at end of file +Make sure to move the llamafile-.zip file to the as well, and you are good to release after you've tested. diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp index c0e3bb3b74..17fa46e229 100644 --- a/llamafile/flags.cpp +++ b/llamafile/flags.cpp @@ -53,6 +53,7 @@ bool FLAG_tinyblas = false; bool FLAG_trace = false; bool FLAG_unsecure = false; bool FLAG_v2 = false; +bool FLAG_lora_init_without_apply = false; const char *FLAG_chat_template = ""; const char *FLAG_db = nullptr; const char *FLAG_db_startup_sql = "PRAGMA journal_mode=WAL;" @@ -65,6 +66,11 @@ const char *FLAG_model = nullptr; const char *FLAG_prompt = nullptr; const char *FLAG_url_prefix = ""; const char *FLAG_www_root = "/zip/www"; +const char *FLAG_lora = nullptr; + +// Multiple LoRA adapters support +struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS] = {0}; +int FLAG_lora_adapters_count = 0; double FLAG_token_rate = 1; float FLAG_decay_growth = .01; float FLAG_frequency_penalty = 0; @@ -385,6 +391,51 @@ void llamafile_get_flags(int argc, char **argv) { continue; } + ////////////////////////////////////////////////////////////////////// + // LoRA flags + + if (!strcmp(flag, "--lora")) { + if (i == argc) + missing("--lora"); + if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) { + error("too many LoRA adapters (max 8)"); + } + FLAG_lora_adapters[FLAG_lora_adapters_count].path = argv[i++]; + FLAG_lora_adapters[FLAG_lora_adapters_count].scale = 1.0f; + FLAG_lora_adapters_count++; + + // Keep FLAG_lora for backward compatibility + if (!FLAG_lora) { + FLAG_lora = FLAG_lora_adapters[0].path; + } + continue; + } + + if (!strcmp(flag, "--lora-scaled")) { + if (i == argc) + missing("--lora-scaled"); + const char* lora_adapter = argv[i++]; + if (i == argc) + missing("--lora-scaled scale value"); + if (FLAG_lora_adapters_count >= MAX_LORA_ADAPTERS) { + error("too many LoRA adapters (max 8)"); + } + FLAG_lora_adapters[FLAG_lora_adapters_count].path = lora_adapter; + FLAG_lora_adapters[FLAG_lora_adapters_count].scale = atof(argv[i++]); + FLAG_lora_adapters_count++; + + // Keep FLAG_lora for backward compatibility + if (!FLAG_lora) { + FLAG_lora = FLAG_lora_adapters[0].path; + } + continue; + } + + if (!strcmp(flag, "--lora-init-without-apply")) { + FLAG_lora_init_without_apply = true; + continue; + } + ////////////////////////////////////////////////////////////////////// // model flags diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h index b74dda60dd..7226ec051d 100644 --- a/llamafile/llamafile.h +++ b/llamafile/llamafile.h @@ -24,6 +24,7 @@ extern bool FLAG_trace; extern bool FLAG_trap; extern bool FLAG_unsecure; extern bool FLAG_v2; +extern bool FLAG_lora_init_without_apply; extern const char *FLAG_chat_template; extern const char *FLAG_db; extern const char *FLAG_db_startup_sql; @@ -36,6 +37,18 @@ extern const char *FLAG_prompt; extern const char *FLAG_url_prefix; extern const char *FLAG_www_root; extern double FLAG_token_rate; +extern const char *FLAG_lora; + +// LoRA adapter info structure to match llama.cpp +struct llamafile_lora_adapter_info { + const char* path; + const char* name; // Model/adapter name for identification + float scale; +}; + 
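+// Illustrative note (hypothetical file names): flags.cpp fills this table in
+// order as --lora / --lora-scaled flags are parsed. For example,
+//   --lora a.gguf --lora-scaled b.gguf 0.5
+// produces entry 0 with path "a.gguf" and scale 1.0, entry 1 with path
+// "b.gguf" and scale 0.5, sets FLAG_lora_adapters_count to 2, and keeps
+// FLAG_lora pointing at the first path for backward compatibility.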
+#define MAX_LORA_ADAPTERS 8 +extern struct llamafile_lora_adapter_info FLAG_lora_adapters[MAX_LORA_ADAPTERS]; +extern int FLAG_lora_adapters_count; extern float FLAG_decay_growth; extern float FLAG_frequency_penalty; extern float FLAG_presence_penalty; diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp index e142a5a219..364348a54b 100644 --- a/llamafile/server/client.cpp +++ b/llamafile/server/client.cpp @@ -705,6 +705,8 @@ Client::dispatcher() return slotz(); if (p1 == "flagz") return flagz(); + if (p1 == "lora-adapters") + return lora_adapters(); #if 0 // TODO: implement frontend for database diff --git a/llamafile/server/client.h b/llamafile/server/client.h index b9e00da41b..f82eed4225 100644 --- a/llamafile/server/client.h +++ b/llamafile/server/client.h @@ -25,6 +25,7 @@ #include #include #include +#include "llama.cpp/common.h" #define HasHeader(H) (!!msg_.headers[H].a) #define HeaderData(H) (ibuf_.p + msg_.headers[H].a) @@ -35,6 +36,11 @@ SlicesEqualCase(S, strlen(S), HeaderData(H), HeaderLength(H)) struct llama_model; +struct llama_lora_adapter; + +namespace jt { +struct Json; +} namespace lf { namespace server { @@ -121,6 +127,11 @@ struct Client bool slotz() __wur; bool flagz() __wur; + bool lora_adapters() __wur; + bool handle_apply_adapters(jt::Json&) __wur; + bool handle_load_adapter(jt::Json&) __wur; + bool handle_clear_adapters() __wur; + bool handle_upstream_lora_apply(jt::Json&) __wur; bool db_chat(int64_t) __wur; bool db_chats() __wur; bool db_message(int64_t) __wur; @@ -129,3 +140,6 @@ struct Client } // namespace server } // namespace lf + +// Global LoRA adapter storage - extern declarations (outside namespace to match definitions in prog.cpp) +// Remove the custom lora_adapter_container - we'll use llama.cpp's structure instead diff --git a/llamafile/server/lora_adapters.cpp b/llamafile/server/lora_adapters.cpp new file mode 100644 index 0000000000..c048a2a4ef --- /dev/null +++ b/llamafile/server/lora_adapters.cpp @@ -0,0 +1,310 @@ +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// Copyright 2024 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
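+//
+// This file implements the /lora-adapters endpoint for the llamafile server:
+// GET returns the currently loaded adapters as an array of {id, path, scale}
+// objects, and POST accepts an array of {id, scale} objects, updates the
+// scales in g_lora_adapters, re-applies the adapters to every active slot,
+// and marks those slots for refresh so the change takes effect on the next
+// request. Helpers for loading, clearing, and bulk-applying adapters also
+// live here.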
+ +#include "client.h" +#include "llama.cpp/llama.h" +#include "llama.cpp/common.h" +#include "llamafile/json.h" +#include "llamafile/llamafile.h" +#include "llamafile/server/log.h" +#include "llamafile/server/server.h" +#include "llamafile/server/worker.h" +#include "llamafile/server/slots.h" +#include "llamafile/server/slot.h" +#include + +using jt::Json; + +// External declarations for global LoRA adapter storage from prog.cpp (outside namespace) +extern std::vector g_lora_adapters; + +namespace lf { +namespace server { + +bool +Client::lora_adapters() +{ + // Support both GET and POST methods + if (msg_.method == kHttpGet) { + // GET: Return current adapter configuration (upstream llama.cpp format) + Json json; + json.setArray(); + std::vector& json_array = json.getArray(); + + for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { + Json adapter; + adapter.setObject(); + adapter["id"] = (int)i; + adapter["path"] = ::g_lora_adapters[i].path; + adapter["scale"] = ::g_lora_adapters[i].scale; + json_array.push_back(adapter); + } + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, json.toString()); + + } else if (msg_.method == kHttpPost) { + // POST: Apply LoRA adapters by ID and scale (upstream llama.cpp format) + + // Validate content type + if (!HasHeader(kHttpContentType) || + !IsMimeType(HeaderData(kHttpContentType), + HeaderLength(kHttpContentType), + "application/json")) { + return send_error(400, "Content-Type must be application/json"); + } + + // Read the payload + if (!read_payload()) + return false; + + // Parse JSON payload - expecting an array of {id, scale} objects + auto [status, json] = Json::parse(std::string(payload_)); + if (status != Json::success) + return send_error(400, Json::StatusToString(status)); + if (!json.isArray()) + return send_error(400, "Request body must be an array"); + + // Apply the LoRA configuration + return handle_upstream_lora_apply(json); + + } else { + return send_error(405, "Method Not Allowed"); + } +} + +bool +Client::handle_apply_adapters(Json& json) +{ + // Get active slots and apply current adapters to them + if (::g_lora_adapters.empty()) { + Json response; + response["success"] = false; + response["message"] = "No adapters loaded to apply"; + + char* p = append_http_response_message(obuf_.p, 400); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Apply adapters to all slots via the server using llama.cpp unified function + SLOG("applying %d LoRA adapter(s) to all active slots using llama.cpp unified function", + (int)::g_lora_adapters.size()); + + // Apply to all active slots + Slots* slots = worker_->server_->slots_; + pthread_mutex_lock(&slots->lock_); + + for (size_t i = 0; i < slots->slots_.size(); ++i) { + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("applying LoRA adapters to slot #%d", slot->id_); + llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); + + // CRITICAL: Mark slot for refresh to handle LoRA changes properly + // The slot's prefill() mechanism will intelligently preserve system prompts + // and only re-evaluate what's necessary when the next request comes in + slot->mark_for_refresh(); + SLOG("marked slot #%d for refresh after LoRA application", slot->id_); + } + } + + pthread_mutex_unlock(&slots->lock_); + + Json response; + response["success"] = true; + response["message"] = "Adapters applied to active slots"; + 
response["adapters_applied"] = (int)::g_lora_adapters.size(); + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_load_adapter(Json& json) +{ + // Load a new adapter from file + if (!json.contains("path")) { + return send_error(400, "Missing 'path' field for load operation"); + } + + std::string adapter_path = json["path"].getString(); + float scale = json.contains("scale") ? json["scale"].getNumber() : 1.0f; + + // Check if file exists + if (!std::filesystem::exists(adapter_path)) { + Json response; + response["success"] = false; + response["message"] = "Adapter file not found: " + adapter_path; + + char* p = append_http_response_message(obuf_.p, 404); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Load the adapter + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("loading LoRA adapter from %s with scale %s", adapter_path.c_str(), scale_buf); + + struct llama_lora_adapter* adapter = llama_lora_adapter_init(model_, adapter_path.c_str()); + if (!adapter) { + Json response; + response["success"] = false; + response["message"] = "Failed to load adapter from " + adapter_path; + + char* p = append_http_response_message(obuf_.p, 500); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); + } + + // Create the adapter container + llama_lora_adapter_container adapter_container; + adapter_container.path = adapter_path; + adapter_container.scale = scale; + adapter_container.adapter = adapter; + + // Store the adapter + int index = (int)::g_lora_adapters.size(); + ::g_lora_adapters.push_back(adapter_container); + + SLOG("successfully loaded LoRA adapter #%d from %s", index, adapter_path.c_str()); + + Json response; + response["success"] = true; + response["message"] = "Adapter loaded successfully"; + response["index"] = index; + response["path"] = adapter_path; + response["scale"] = scale; + response["total_adapters"] = (int)::g_lora_adapters.size(); + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_clear_adapters() +{ + // Clear all loaded adapters + SLOG("clearing all %d LoRA adapter(s)", (int)::g_lora_adapters.size()); + + int cleared_count = (int)::g_lora_adapters.size(); + for (auto& la : ::g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); + } + } + + ::g_lora_adapters.clear(); + + SLOG("cleared %d LoRA adapter(s)", cleared_count); + + Json response; + response["success"] = true; + response["message"] = "All adapters cleared"; + response["cleared_count"] = cleared_count; + response["remaining_count"] = 0; + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +bool +Client::handle_upstream_lora_apply(Json& json) +{ + // Handle upstream llama.cpp LoRA API format: array of {id, scale} objects + std::vector& json_array = json.getArray(); + SLOG("applying LoRA configuration with %d entries", (int)json_array.size()); + + // Process each entry in the array + for (size_t i = 0; i < json_array.size(); i++) { + Json& entry = json_array[i]; + + if (!entry.isObject()) { + return send_error(400, "Each entry 
must be an object with 'id' and 'scale' fields"); + } + + if (!entry.contains("id") || !entry.contains("scale")) { + return send_error(400, "Each entry must have 'id' and 'scale' fields"); + } + + int id = entry["id"].getNumber(); + float scale = entry["scale"].getNumber(); + + // Validate ID range + if (id < 0 || id >= (int)::g_lora_adapters.size()) { + return send_error(400, "Invalid adapter ID"); + } + + // Update the adapter configuration + ::g_lora_adapters[id].scale = scale; + + char scale_buf[32]; + snprintf(scale_buf, sizeof(scale_buf), "%.2f", scale); + SLOG("set LoRA adapter %d ('%s') scale to %s", + id, ::g_lora_adapters[id].path.c_str(), scale_buf); + } + + // Re-apply LoRA adapters to all active slots with updated scales using llama.cpp unified function + SLOG("re-applying LoRA adapters to all active slots using llama.cpp unified function"); + Slots* slots = worker_->server_->slots_; + + // Lock the slots to prevent concurrent access during LoRA re-application + pthread_mutex_lock(&slots->lock_); + + for (size_t i = 0; i < slots->slots_.size(); ++i) { + Slot* slot = slots->slots_[i].get(); + if (slot->ctx_) { + SLOG("re-applying LoRA adapters to slot #%d", slot->id_); + llama_lora_adapters_apply(slot->ctx_, ::g_lora_adapters); + + // CRITICAL: Mark slot for refresh to handle LoRA changes properly + // The slot's prefill() mechanism will intelligently preserve system prompts + // and only re-evaluate what's necessary when the next request comes in + slot->mark_for_refresh(); + SLOG("marked slot #%d for refresh after LoRA update", slot->id_); + } + } + + pthread_mutex_unlock(&slots->lock_); + SLOG("finished re-applying LoRA adapters to all slots"); + + // Return updated adapter configuration + Json response; + response.setArray(); + std::vector& response_array = response.getArray(); + + for (size_t i = 0; i < ::g_lora_adapters.size(); i++) { + Json adapter; + adapter.setObject(); + adapter["id"] = (int)i; + adapter["path"] = ::g_lora_adapters[i].path; + adapter["scale"] = ::g_lora_adapters[i].scale; + response_array.push_back(adapter); + } + + char* p = append_http_response_message(obuf_.p, 200); + p = stpcpy(p, "Content-Type: application/json\r\n"); + return send_response(obuf_.p, p, response.toString()); +} + +} // namespace server +} // namespace lf diff --git a/llamafile/server/main.1 b/llamafile/server/main.1 index e5d01adc2a..60c6d3adb7 100644 --- a/llamafile/server/main.1 +++ b/llamafile/server/main.1 @@ -29,6 +29,23 @@ recommended that you run multiple instances of llamafiler behind a reverse proxy such as NGINX or Redbean. .It Fl mm Ar FNAME , Fl Fl mmproj Ar FNAME Path of vision model weights. +.It Fl Fl lora Ar FNAME +Path to LoRA adapter weights. This flag may be repeated to load multiple +LoRA adapters. Each adapter will be applied with a default scale of 1.0. +The base model specified by +.Fl m +will be used as the foundation for all LoRA adaptations. +.It Fl Fl lora-scaled Ar FNAME Ar SCALE +Path to LoRA adapter weights with custom scaling factor. The +.Ar SCALE +parameter is a floating point number that controls the strength of the +LoRA adaptation (e.g., 0.5 for half strength, 1.5 for enhanced strength). +This flag may be repeated to load multiple scaled LoRA adapters. +.It Fl Fl lora-init-without-apply +Load LoRA adapters at startup without automatically applying them. When +this flag is used, adapters are initialized but not active until +explicitly applied via the API. This is useful for dynamic LoRA adapter +management through the HTTP endpoints. 
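+.Pp
+For example, the following command (with hypothetical file names) loads two
+adapters at startup but keeps them inactive until they are applied through
+the /lora-adapters HTTP endpoint:
+.Pp
+.Dl "llamafiler -m base_model.gguf --lora-init-without-apply --lora adapter1.gguf --lora-scaled adapter2.gguf 0.5"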
.It Fl Fl db Ar FILE Specifies path of sqlite3 database. .Pp @@ -215,6 +232,14 @@ Here's an example of how you might start this server: .Pp .Dl "llamafiler -m all-MiniLM-L6-v2.F32.gguf" .Pp +Here's how to start with a LoRA adapter: +.Pp +.Dl "llamafiler -m base_model.gguf --lora adapter.gguf" +.Pp +Here's how to use multiple LoRA adapters with custom scaling: +.Pp +.Dl "llamafiler -m base_model.gguf --lora adapter1.gguf --lora-scaled adapter2.gguf 0.8" +.Pp Here's how to send a tokenization request: .Pp .Dl "curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world" diff --git a/llamafile/server/main.1.asc b/llamafile/server/main.1.asc index ab99e21913..b12ee187eb 100644 --- a/llamafile/server/main.1.asc +++ b/llamafile/server/main.1.asc @@ -1,269 +1,292 @@ -LLAMAFILER(1) General Commands Manual LLAMAFILER(1) - -NAME - llamafiler — fast reliable large language model server - -SYNOPSIS - llamafiler -m model.gguf [flags...] - -DESCRIPTION - llamafiler llamafiler is an HTTP server for Large Language Models - (LLMs). It includes a web GUI for both chatbot and text completion. It - can be your OpenAI API compatible embeddings / completions / chat com‐ - pletions server. It's able to more intelligently recycle context win‐ - dows across multiple slots serving multiple clients. - -OPTIONS - The following options are available: - - --version - Print version and exit. - - -h, --help - Show help message and exit. - - -m FNAME, --model FNAME - Path of GGUF model weights. Each server process is currently - limited to serving only one model. If you need to host multiple - models, then it's recommended that you run multiple instances - of llamafiler behind a reverse proxy such as NGINX or Redbean. - - -mm FNAME, --mmproj FNAME - Path of vision model weights. - - --db FILE - Specifies path of sqlite3 database. - - The default is ~/.llamafile/llamafile.sqlite3 - - -ngl N, --gpu-layers N, --n-gpu-layers N - Specifies number of layers to offload to GPU. - - This flag must be passed in order to use GPU on systems with - NVIDIA or AMD GPUs. If you're confident that you have enough - VRAM, then you can pass -ngl 999 to enable full offloading, - since this number is automatically downtuned to however many - number of layers the model has. If VRAM is limited, then the - --verbose flag may be passed to learn how many layers the model - has, e.g. 35, which can then be down-tuned until the out of - memory error goes away. - - On Apple Silicon systems with Metal, GPU offloading is enabled - by default. Since these GPUs use unified memory, they're - treated as having a single layer; therefore, using values - higher than 1 will be treated as 1. You can pass -ngl 0 to dis‐ - able GPU offloading and run in CPU mode on Apple Metal systems. - - -l HOSTPORT, --listen HOSTPORT - Specifies the local [HOST:]PORT on which the HTTP server should - listen. By default this is 0.0.0.0:8080 which means llamafiler - will bind to port 8080 on every locally available IPv4 network - interface. This option may currently only be specified once. - - -c TOKENS, --ctx-size TOKENS - Specifies context size. This specifies how long a completion - can get before it runs out of space. It defaults to 8k which - means 8192 tokens. Many models support a larger context size, - like 128k, but that'll need much more RAM or VRAM per slot. If - this value is larger than the trained context size of the - model, it'll be tuned down to the maximum. If this value is 0 - or negative, the maximum number of tokens will be used. 
- - -s COUNT, --slots COUNT - Specifies how many slots to maintain. This defaults to 1. Slots - are used by chat completions requests. When such a request - comes in, the client needs to take control of a slot. When the - completion is finished, the slot is relinquished back to the - server. HTTP clients will wait for a slot to be relinquished if - none are available. Tuning this parameter to nicely fit avail‐ - able RAM or VRAM can help you manage your server resources, and - control how much completion parallelism can happen. Please - note that --ctx-size has a strong influence on how many slots - can be created. - - --decay-delay INT - Number of seconds a context window slot needs to be inactive - before the system starts to strongly consider giving it to - other clients. The default is 300 which is five minutes. - - --decay-growth FLOAT - Sets slot decay growth factor. Context window slots are as‐ - signed in a least recently used fashion, based on the formula - age + e sup {growth * (age - delay)} - - -p TEXT, --prompt TEXT, --system-prompt TEXT - Specifies system prompt. This value is passed along to the web - frontend. - - --no-display-prompt - Hide system prompt from web user interface. - - --nologo - Hide llamafile logo icon from web ui. - - --url-prefix URLPREFIX - Specifies a URL prefix (subdirectory) under which the HTTP - server will make the API accessible, e.g. /lamafiler. Useful - when running llamafiler behind a reverse proxy such as NGINX or - Redbean. By default, this is set to / (root). - - --verbose - Enable logging of diagnostic information. This flag is useful - for learning more about the model and hardware. It can also be - helpful for troubleshooting errors. We currently recommend that - this flag be avoided in production since the llama.cpp logger - may disrupt thread cancelation. - - -w N, --workers N - Number of HTTP client handling threads. - - --trust CIDR - Adds a network to the trusted network list. This argument is - specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By - default, all clients are untrusted, which means they're subject - to token bucket throttling, and additional security precautions - that may cause request handling to go slightly slower. There‐ - fore this flag is important to use if you want to accurately - benchmark llamafiler, since the server will otherwise see the - benchmark as a DDOS and deprioritize its traffic accordingly. - - --ip-header STR - If this flag is passed a value, e.g. X-Forwarded-For, then any - trusted may send this header to your llamafile server to let it - know what the true effective client IPv4 address actually is. - After this happens the default security restrictions, e.g. to‐ - ken bucket, will be measured and applied against that IPv4 ad‐ - dress and its adjacent networks. - - --token-rate N - Specifies how many times per second a token is dropped in each - bucket. This setting is used to define a limitation on how - many TCP connects and HTTP messages each chunk of the IPv4 ad‐ - dress space is permitted to send to llamafiler over a sustained - period of time. The default token rate is 1, which means that, - on a long enough timeline, a class-C network will be depriori‐ - tized if it sends more than one request per second. No real - penalty actually applies though until the server runs out of - resources, e.g. HTTP request workers. 
- - --token-burst N - Specifies how many HTTP requests and TCP connects a given slice - of the IPv4 address space is permitted to send within a short - period of time, before token bucket restrictions kick in, and - cause the client to be deprioritized. By default, this value is - set to 100. It may be tuned to any value between 1 and 127 in‐ - clusive. - - --token-cidr N - Specifies IPv4 address space granularity of token bucket algo‐ - rithm, in network bits. By default, this value is set to 24 - which means individual IPv4 addresses are viewed as being rep‐ - resentative members of a class-C network, or in other words, - each group of 256 IPv4 addresses is lumped together. If one IP - in the group does something bad, then bad things happen to all - the other IPv4 addresses in that granule. This number may be - set to any integer between 3 and 32 inclusive. Specifying a - higher number will trade away system memory to increase network - specificity. For example, using 32 means that 4 billion indi‐ - vidual token buckets will be created. By default, a background - thread drops one token in each bucket every second, so that - could potentially be a lot of busy work. A value of three means - that everyone on the Internet who talks to your server will - have to fight over only eight token buckets in total. - - --unsecure - Disables sandboxing. By default, llamafiler puts itself in a - SECCOMP BPF sandbox, so that even if your server gets hacked in - the worst possible way (some kind of C++ memory bug) then - there's very little damage an attacker will be able to do. This - works by restricting system calls using Cosmopolitan Libc's im‐ - plementation of pledge() which is currently only supported on - Linux (other OSes will simply be unsecured by default). The - pledge security policy that's used by default is "stdio anet" - which means that only relatively harmless system calls like - read(), write(), and accept() are allowed once the server has - finished initializing. It's not possible for remotely executed - code to do things like launch subprocesses, read or write to - the filesystem, or initiate a new connection to a server. - - -k N, --keepalive N - Specifies the TCP keepalive interval in seconds. This value is - passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're - supported by the host operating system. If this value is - greater than 0, then the the SO_KEEPALIVE and TCP_NODELAY op‐ - tions are enabled on network sockets, if supported by the host - operating system. The default keepalive is 5. - - --http-obuf-size N - Size of HTTP output buffer size, in bytes. Default is 1048576. - - --http-ibuf-size N - Size of HTTP input buffer size, in bytes. Default is 1048576. - - --chat-template NAME - Specifies or overrides chat template for model. - - Normally the GGUF metadata tokenizer.chat_template will specify - this value for instruct models. This flag may be used to either - override the chat template, or specify one when the GGUF meta‐ - data field is absent, which effectively forces the web ui to - enable chatbot mode. - - Supported chat template names are: chatml, llama2, llama3, mis‐ - tral (alias for llama2), phi3, zephyr, monarch, gemma, gemma2 - (alias for gemma), orion, openchat, vicuna, vicuna-orca, - deepseek, command-r, chatglm3, chatglm4, minicpm, deepseek2, or - exaone3. - - It is also possible to pass the jinja2 template itself to this - argument. 
Since llamafiler doesn't currently support jinja2, a - heuristic will be used to guess which of the above templates - the template represents. - - --completion-mode - Forces web ui to operate in completion mode, rather than chat - mode. Normally the web ui chooses its mode based on the GGUF - metadata. Base models normally don't define tokenizer.chat_tem‐ - plate whereas instruct models do. If it's a base model, then - the web ui will automatically use completion mode only, without - needing to specify this flag. This flag is useful in cases - where a prompt template is defined by the gguf, but it is de‐ - sirable for the chat interface to be disabled. - - --db-startup-sql CODE - Specifies SQL code that should be executed whenever connecting - to the SQLite database. The default is the following code, - which enables the write-ahead log. - - PRAGMA journal_mode=WAL; - PRAGMA synchronous=NORMAL; - - --reserve-tokens N - Percent of context window to reserve for predicted tokens. When - the server runs out of context window, old chat messages will - be forgotten until this percent of the context is empty. The - default is 15%. If this is specified as a floating point num‐ - ber, e.g. 0.15, then it'll be multiplied by 100 to get the per‐ - cent. - -EXAMPLES - Here's an example of how you might start this server: - - llamafiler -m all-MiniLM-L6-v2.F32.gguf - - Here's how to send a tokenization request: - - curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world - - Here's how to send a embedding request: - - curl -v http://127.0.0.1:8080/embedding?content=hello+world - -DOCUMENTATION - Read our Markdown documentation for additional help and tutorials. See - llamafile/server/doc/index.md in the source repository on GitHub. - -SEE ALSO - llamafile(1), whisperfile(1) - -Mozilla Ocho November 30, 2024 LLAMAFILER(1) +LLAMAFILER(1) General Commands Manual LLAMAFILER(1) + +NNAAMMEE + llllaammaaffiilleerr – fast reliable large language model server + +SSYYNNOOPPSSIISS + llllaammaaffiilleerr --mm _m_o_d_e_l_._g_g_u_f [flags...] + +DDEESSCCRRIIPPTTIIOONN + llllaammaaffiilleerr llamafiler is an HTTP server for Large Language Models (LLMs). + It includes a web GUI for both chatbot and text completion. It can be your + OpenAI API compatible embeddings / completions / chat completions server. + It's able to more intelligently recycle context windows across multiple + slots serving multiple clients. + +OOPPTTIIOONNSS + The following options are available: + + ----vveerrssiioonn + Print version and exit. + + --hh, ----hheellpp + Show help message and exit. + + --mm _F_N_A_M_E, ----mmooddeell _F_N_A_M_E + Path of GGUF model weights. Each server process is currently + limited to serving only one model. If you need to host multiple + models, then it's recommended that you run multiple instances of + llamafiler behind a reverse proxy such as NGINX or Redbean. + + --mmmm _F_N_A_M_E, ----mmmmpprroojj _F_N_A_M_E + Path of vision model weights. + + ----lloorraa _F_N_A_M_E + Path to LoRA adapter weights. This flag may be repeated to load + multiple LoRA adapters. Each adapter will be applied with a default + scale of 1.0. The base model specified by --mm will be used as the + foundation for all LoRA adaptations. + + ----lloorraa--ssccaalleedd _F_N_A_M_E _S_C_A_L_E + Path to LoRA adapter weights with custom scaling factor. The _S_C_A_L_E + parameter is a floating point number that controls the strength of + the LoRA adaptation (e.g., 0.5 for half strength, 1.5 for enhanced + strength). 
This flag may be repeated to load multiple scaled LoRA + adapters. + + ----lloorraa--iinniitt--wwiitthhoouutt--aappppllyy + Load LoRA adapters at startup without automatically applying them. + When this flag is used, adapters are initialized but not active + until explicitly applied via the API. This is useful for dynamic + LoRA adapter management through the HTTP endpoints. + + ----ddbb _F_I_L_E + Specifies path of sqlite3 database. + + The default is _~_/_._l_l_a_m_a_f_i_l_e_/_l_l_a_m_a_f_i_l_e_._s_q_l_i_t_e_3 + + --nnggll _N, ----ggppuu--llaayyeerrss _N, ----nn--ggppuu--llaayyeerrss _N + Specifies number of layers to offload to GPU. + + This flag must be passed in order to use GPU on systems with NVIDIA + or AMD GPUs. If you're confident that you have enough VRAM, then + you can pass --nnggll _9_9_9 to enable full offloading, since this number + is automatically downtuned to however many number of layers the + model has. If VRAM is limited, then the ----vveerrbboossee flag may be + passed to learn how many layers the model has, e.g. 35, which can + then be down-tuned until the out of memory error goes away. + + On Apple Silicon systems with Metal, GPU offloading is enabled by + default. Since these GPUs use unified memory, they're treated as + having a single layer; therefore, using values higher than 1 will + be treated as 1. You can pass --nnggll _0 to disable GPU offloading and + run in CPU mode on Apple Metal systems. + + --ll _H_O_S_T_P_O_R_T, ----lliisstteenn _H_O_S_T_P_O_R_T + Specifies the local [HOST:]PORT on which the HTTP server should + listen. By default this is 0.0.0.0:8080 which means llamafiler + will bind to port 8080 on every locally available IPv4 network + interface. This option may currently only be specified once. + + --cc _T_O_K_E_N_S, ----ccttxx--ssiizzee _T_O_K_E_N_S + Specifies context size. This specifies how long a completion can + get before it runs out of space. It defaults to 8k which means 8192 + tokens. Many models support a larger context size, like 128k, but + that'll need much more RAM or VRAM per slot. If this value is + larger than the trained context size of the model, it'll be tuned + down to the maximum. If this value is 0 or negative, the maximum + number of tokens will be used. + + --ss _C_O_U_N_T, ----sslloottss _C_O_U_N_T + Specifies how many slots to maintain. This defaults to 1. Slots are + used by chat completions requests. When such a request comes in, + the client needs to take control of a slot. When the completion is + finished, the slot is relinquished back to the server. HTTP clients + will wait for a slot to be relinquished if none are available. + Tuning this parameter to nicely fit available RAM or VRAM can help + you manage your server resources, and control how much completion + parallelism can happen. Please note that ----ccttxx--ssiizzee has a strong + influence on how many slots can be created. + + ----ddeeccaayy--ddeellaayy _I_N_T + Number of seconds a context window slot needs to be inactive before + the system starts to strongly consider giving it to other clients. + The default is 300 which is five minutes. + + ----ddeeccaayy--ggrroowwtthh _F_L_O_A_T + Sets slot decay growth factor. Context window slots are assigned in + a least recently used fashion, based on the formula _a_g_e + _e^(_g_r_o_w_t_h + * (_a_g_e − _d_e_l_a_y)) + + --pp _T_E_X_T, ----pprroommpptt _T_E_X_T, ----ssyysstteemm--pprroommpptt _T_E_X_T + Specifies system prompt. This value is passed along to the web + frontend. 
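The slot decay score documented above under `--decay-growth`, age + e^(growth × (age − delay)), is easier to read with concrete numbers: while a slot has been idle for less than `--decay-delay` seconds the exponential term stays below 1 and slots effectively compete on age alone (least recently used), but once the delay is exceeded the term dominates and the slot quickly becomes the preferred candidate to hand to another client. A minimal sketch of the formula follows; the 0.05 growth factor is an illustrative value, not the server's default.

```cpp
// Sketch of the documented eviction score: age + e^(growth * (age - delay)).
// Below --decay-delay the exponential term is < 1, so ordering is roughly
// LRU; past the delay the exponential term takes over.
#include <cmath>
#include <cstdio>

static double decay_score(double age, double delay, double growth) {
    return age + std::exp(growth * (age - delay));
}

int main() {
    const double delay = 300;    // --decay-delay default (seconds)
    const double growth = 0.05;  // --decay-growth, illustrative value only
    for (double age : {60.0, 299.0, 330.0, 420.0})
        std::printf("idle %3.0f s -> score %.2f\n",
                    age, decay_score(age, delay, growth));
    return 0;
}
```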
+ + ----nnoo--ddiissppllaayy--pprroommpptt + Hide system prompt from web user interface. + + ----nnoollooggoo + Hide llamafile logo icon from web ui. + + ----uurrll--pprreeffiixx _U_R_L_P_R_E_F_I_X + Specifies a URL prefix (subdirectory) under which the HTTP server + will make the API accessible, e.g. /lamafiler. Useful when running + llamafiler behind a reverse proxy such as NGINX or Redbean. By + default, this is set to / (root). + + ----vveerrbboossee + Enable logging of diagnostic information. This flag is useful for + learning more about the model and hardware. It can also be helpful + for troubleshooting errors. We currently recommend that this flag + be avoided in production since the llama.cpp logger may disrupt + thread cancelation. + + --ww _N, ----wwoorrkkeerrss _N + Number of HTTP client handling threads. + + ----ttrruusstt _C_I_D_R + Adds a network to the trusted network list. This argument is + specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By + default, all clients are untrusted, which means they're subject to + token bucket throttling, and additional security precautions that + may cause request handling to go slightly slower. Therefore this + flag is important to use if you want to accurately benchmark + llamafiler, since the server will otherwise see the benchmark as a + DDOS and deprioritize its traffic accordingly. + + ----iipp--hheeaaddeerr _S_T_R + If this flag is passed a value, e.g. X-Forwarded-For, then any + trusted may send this header to your llamafile server to let it + know what the true effective client IPv4 address actually is. After + this happens the default security restrictions, e.g. token bucket, + will be measured and applied against that IPv4 address and its + adjacent networks. + + ----ttookkeenn--rraattee _N + Specifies how many times per second a token is dropped in each + bucket. This setting is used to define a limitation on how many + TCP connects and HTTP messages each chunk of the IPv4 address space + is permitted to send to llamafiler over a sustained period of time. + The default token rate is 1, which means that, on a long enough + timeline, a class-C network will be deprioritized if it sends more + than one request per second. No real penalty actually applies + though until the server runs out of resources, e.g. HTTP request + workers. + + ----ttookkeenn--bbuurrsstt _N + Specifies how many HTTP requests and TCP connects a given slice of + the IPv4 address space is permitted to send within a short period + of time, before token bucket restrictions kick in, and cause the + client to be deprioritized. By default, this value is set to 100. + It may be tuned to any value between 1 and 127 inclusive. + + ----ttookkeenn--cciiddrr _N + Specifies IPv4 address space granularity of token bucket algorithm, + in network bits. By default, this value is set to 24 which means + individual IPv4 addresses are viewed as being representative + members of a class-C network, or in other words, each group of 256 + IPv4 addresses is lumped together. If one IP in the group does + something bad, then bad things happen to all the other IPv4 + addresses in that granule. This number may be set to any integer + between 3 and 32 inclusive. Specifying a higher number will trade + away system memory to increase network specificity. For example, + using 32 means that 4 billion individual token buckets will be + created. By default, a background thread drops one token in each + bucket every second, so that could potentially be a lot of busy + work. 
A value of three means that everyone on the Internet who + talks to your server will have to fight over only eight token + buckets in total. + + ----uunnsseeccuurree + Disables sandboxing. By default, llamafiler puts itself in a + SECCOMP BPF sandbox, so that even if your server gets hacked in the + worst possible way (some kind of C++ memory bug) then there's very + little damage an attacker will be able to do. This works by + restricting system calls using Cosmopolitan Libc's implementation + of pledge() which is currently only supported on Linux (other OSes + will simply be unsecured by default). The pledge security policy + that's used by default is "stdio anet" which means that only + relatively harmless system calls like read(), write(), and accept() + are allowed once the server has finished initializing. It's not + possible for remotely executed code to do things like launch + subprocesses, read or write to the filesystem, or initiate a new + connection to a server. + + --kk _N, ----kkeeeeppaalliivvee _N + Specifies the TCP keepalive interval in seconds. This value is + passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're + supported by the host operating system. If this value is greater + than 0, then the the SO_KEEPALIVE and TCP_NODELAY options are + enabled on network sockets, if supported by the host operating + system. The default keepalive is 5. + + ----hhttttpp--oobbuuff--ssiizzee _N + Size of HTTP output buffer size, in bytes. Default is 1048576. + + ----hhttttpp--iibbuuff--ssiizzee _N + Size of HTTP input buffer size, in bytes. Default is 1048576. + + ----cchhaatt--tteemmppllaattee _N_A_M_E + Specifies or overrides chat template for model. + + Normally the GGUF metadata tokenizer.chat_template will specify + this value for instruct models. This flag may be used to either + override the chat template, or specify one when the GGUF metadata + field is absent, which effectively forces the web ui to enable + chatbot mode. + + Supported chat template names are: chatml, llama2, llama3, mistral + (alias for llama2), phi3, zephyr, monarch, gemma, gemma2 (alias for + gemma), orion, openchat, vicuna, vicuna-orca, deepseek, command-r, + chatglm3, chatglm4, minicpm, deepseek2, or exaone3. + + It is also possible to pass the jinja2 template itself to this + argument. Since llamafiler doesn't currently support jinja2, a + heuristic will be used to guess which of the above templates the + template represents. + + ----ccoommpplleettiioonn--mmooddee + Forces web ui to operate in completion mode, rather than chat mode. + Normally the web ui chooses its mode based on the GGUF metadata. + Base models normally don't define tokenizer.chat_template whereas + instruct models do. If it's a base model, then the web ui will + automatically use completion mode only, without needing to specify + this flag. This flag is useful in cases where a prompt template is + defined by the gguf, but it is desirable for the chat interface to + be disabled. + + ----ddbb--ssttaarrttuupp--ssqqll _C_O_D_E + Specifies SQL code that should be executed whenever connecting to + the SQLite database. The default is the following code, which + enables the write-ahead log. + + PRAGMA journal_mode=WAL; + PRAGMA synchronous=NORMAL; + + ----rreesseerrvvee--ttookkeennss _N + Percent of context window to reserve for predicted tokens. When the + server runs out of context window, old chat messages will be + forgotten until this percent of the context is empty. The default + is 15%. 
If this is specified as a floating point number, e.g. 0.15,
+       then it'll be multiplied by 100 to get the percent.
+
+EEXXAAMMPPLLEESS
+     Here's an example of how you might start this server:
+
+           llamafiler -m all-MiniLM-L6-v2.F32.gguf
+
+     Here's how to start with a LoRA adapter:
+
+           llamafiler -m base_model.gguf --lora adapter.gguf
+
+     Here's how to use multiple LoRA adapters with custom scaling:
+
+           llamafiler -m base_model.gguf --lora adapter1.gguf --lora-scaled
+           adapter2.gguf 0.8
+
+     Here's how to send a tokenization request:
+
+           curl -v http://127.0.0.1:8080/tokenize?prompt=hello+world
+
+     Here's how to send an embedding request:
+
+           curl -v http://127.0.0.1:8080/embedding?content=hello+world
+
+DDOOCCUUMMEENNTTAATTIIOONN
+     Read our Markdown documentation for additional help and tutorials. See
+     llamafile/server/doc/index.md in the source repository on GitHub.
+
+SSEEEE AALLSSOO
+     llamafile(1), whisperfile(1)
+
+Mozilla Ocho                   November 30, 2024                   Mozilla Ocho
diff --git a/llamafile/server/prog.cpp b/llamafile/server/prog.cpp
index bd6e6b6a24..89ae3f12d7 100644
--- a/llamafile/server/prog.cpp
+++ b/llamafile/server/prog.cpp
@@ -26,9 +26,28 @@
 #include "llamafile/server/tokenbucket.h"
 #include "llamafile/server/utils.h"
 #include "llamafile/version.h"
+#include "llama.cpp/common.h"
 #include 
 #include 
 
+// Global LoRA adapter storage using llama.cpp structures
+std::vector<llama_lora_adapter_container> g_lora_adapters;
+
+// Function to get the first global LoRA adapter for backward compatibility
+extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter() {
+    return g_lora_adapters.empty() ? nullptr : g_lora_adapters[0].adapter;
+}
+
+// Function to get all LoRA adapters and their count
+extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters) {
+    int count = std::min((int)g_lora_adapters.size(), max_adapters);
+    for (int i = 0; i < count; i++) {
+        adapters[i] = g_lora_adapters[i].adapter;
+        scales[i] = g_lora_adapters[i].scale;
+    }
+    return count;
+}
+
 namespace lf {
 namespace server {
 
@@ -69,6 +88,8 @@ main(int argc, char* argv[])
     FLAG_log_disable = true;
 
     // load model
+    // --lora implies --no-mmap (as per llama.cpp server)
+    bool use_mmap = FLAG_mmap && (FLAG_lora_adapters_count == 0);
     llama_model_params mparams = {
         .n_gpu_layers = FLAG_n_gpu_layers,
         .split_mode = (enum llama_split_mode)FLAG_split_mode,
@@ -79,8 +100,8 @@ main(int argc, char* argv[])
         .progress_callback_user_data = nullptr,
         .kv_overrides = nullptr,
         .vocab_only = false,
-        .use_mmap = true,
-        .use_mlock = false,
+        .use_mmap = use_mmap,
+        .use_mlock = FLAG_mlock,
         .check_tensors = false,
     };
     llama_model* model = llama_load_model_from_file(FLAG_model, mparams);
@@ -89,6 +110,49 @@ main(int argc, char* argv[])
         exit(1);
     }
 
+    // load LoRA adapters if specified
+    if (FLAG_lora_adapters_count > 0) {
+        const char* apply_mode = FLAG_lora_init_without_apply ? "without applying" : "and applying";
+        SLOG("loading %d LoRA adapter(s) %s", FLAG_lora_adapters_count, apply_mode);
+
+        for (int i = 0; i < FLAG_lora_adapters_count; i++) {
+            char scale_buf[32];
+            snprintf(scale_buf, sizeof(scale_buf), "%.2f", FLAG_lora_adapters[i].scale);
+
+            // Generate model name from filename for identification
+            const char* path = FLAG_lora_adapters[i].path;
+            const char* filename = strrchr(path, '/');
+            filename = filename ?
filename + 1 : path; + + SLOG("loading LoRA adapter %d ('%s') from %s with scale %s", i + 1, + filename, path, scale_buf); + + llama_lora_adapter_container adapter_container; + adapter_container.path = std::string(path); + adapter_container.scale = FLAG_lora_adapters[i].scale; + adapter_container.adapter = llama_lora_adapter_init(model, path); + + if (!adapter_container.adapter) { + fprintf(stderr, "%s: failed to load LoRA adapter from %s\n", FLAG_model, path); + // Cleanup previously loaded adapters + for (auto& la : g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); + } + } + llama_free_model(model); + exit(1); + } + g_lora_adapters.push_back(adapter_container); + } + + if (FLAG_lora_init_without_apply) { + SLOG("all LoRA adapters loaded successfully but not applied (use /lora-adapters API to apply)"); + } else { + SLOG("all LoRA adapters loaded and applied successfully"); + } + } + // create slots Slots* slots = new Slots(model); if (!slots->start(FLAG_slots)) { @@ -120,6 +184,14 @@ main(int argc, char* argv[]) g_server->close(); delete g_server; delete slots; + + // Cleanup LoRA adapters + for (auto& la : g_lora_adapters) { + if (la.adapter) { + llama_lora_adapter_free(la.adapter); + } + } + llama_free_model(model); tokenbucket_destroy(); time_destroy(); diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index c57ca2541c..a081d69dfb 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -18,6 +18,7 @@ #include "slot.h" #include "llama.cpp/llava/clip.h" #include "llama.cpp/llava/llava.h" +#include "llama.cpp/common.h" #include "llamafile/image.h" #include "llamafile/llama.h" #include "llamafile/llamafile.h" @@ -32,6 +33,9 @@ #include #include +// External declaration for global LoRA adapter storage +extern std::vector g_lora_adapters; + namespace lf { namespace server { @@ -79,7 +83,7 @@ Slot::describe_error(int err) } } -Slot::Slot(int id, llama_model* model) : id_(id), model_(model) +Slot::Slot(int id, llama_model* model) : id_(id), model_(model), needs_refresh_(false) { dll_init(&elem_); last_used_ = time(0); @@ -125,6 +129,19 @@ Slot::start() system_fingerprint_ = generate_system_fingerprint(&cparams); if (!(ctx_ = llama_new_context_with_model(model_, cparams))) return false; + + // Apply LoRA adapters if available using llama.cpp's unified function + if (!::g_lora_adapters.empty() && !FLAG_lora_init_without_apply) { + SLOG("applying %d LoRA adapter(s) to slot #%d using llama.cpp unified function", + (int)::g_lora_adapters.size(), id_); + llama_lora_adapters_apply(ctx_, ::g_lora_adapters); + } else if (!::g_lora_adapters.empty() && FLAG_lora_init_without_apply) { + // When --lora-init-without-apply is set, explicitly clear any LoRA state + // to ensure no residual LoRA effects from model initialization + SLOG("clearing LoRA state for slot #%d (--lora-init-without-apply mode)", id_); + llama_lora_adapter_clear(ctx_); + } + if (FLAG_mmproj) if (!(clip_ctx_ = clip_model_load(FLAG_mmproj, FLAG_verbose))) return false; @@ -293,6 +310,15 @@ Slot::prefill(const std::vector& atoms, const ProgressCallback& progress) if (!ctx_) return uninitialized; + // Check if we need to refresh due to LoRA adapter changes + if (needs_refresh_) { + SLOG("Refreshing slot due to LoRA adapter changes"); + llama_kv_cache_clear(ctx_); + history_.clear(); + needs_refresh_ = false; + // Fall through to normal prefill logic with cleared state + } + // handle special case of empty prefill if (atoms.empty()) { llama_kv_cache_clear(ctx_); @@ -437,5 +463,11 
@@ Slot::dump(std::string* result) } } +void +Slot::mark_for_refresh() +{ + needs_refresh_ = true; +} + } // namespace server } // namespace lf diff --git a/llamafile/server/slot.h b/llamafile/server/slot.h index 7fdd7bf881..104aa7623c 100644 --- a/llamafile/server/slot.h +++ b/llamafile/server/slot.h @@ -26,8 +26,15 @@ struct llama_context; struct llama_model; +struct llama_lora_adapter; struct clip_ctx; +// Function to get the global LoRA adapter +extern "C" struct llama_lora_adapter* llamafiler_get_lora_adapter(); + +// Function to get multiple LoRA adapters with their scales +extern "C" int llamafiler_get_lora_adapters(struct llama_lora_adapter** adapters, float* scales, int max_adapters); + namespace lf { namespace server { @@ -58,6 +65,7 @@ struct Slot llama_context* ctx_ = nullptr; std::vector history_; std::string system_fingerprint_; + bool needs_refresh_ = false; ~Slot(); Slot(int, llama_model*); @@ -71,6 +79,7 @@ struct Slot int prefill(const std::vector&, const ProgressCallback& = nullptr); void tokenize(std::vector*, std::string_view, bool); void dump(std::string*); + void mark_for_refresh(); }; } // namespace server
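The pieces added above — the shared `g_lora_adapters` table in prog.cpp, `Slot::mark_for_refresh()`, and the `needs_refresh_` check in `Slot::prefill()` — are what a scale-update request would have to touch when adapters are managed dynamically (as the `--lora-init-without-apply` entry suggests). The HTTP handler itself is not part of this diff; the sketch below is hypothetical glue showing one way those pieces could be driven, and `update_lora_scales()` together with its `slots` argument are invented names.

```cpp
// Hypothetical glue, not code from this diff: one way a scale update could
// drive the pieces introduced above. Re-applying the adapters to each slot's
// llama_context (as Slot::start() does with llama_lora_adapters_apply) is
// deliberately left out of the sketch.
#include <utility>
#include <vector>

#include "llama.cpp/common.h"       // llama_lora_adapter_container
#include "llamafile/server/slot.h"  // lf::server::Slot

extern std::vector<llama_lora_adapter_container> g_lora_adapters;

namespace lf {
namespace server {

void
update_lora_scales(const std::vector<std::pair<int, float>>& updates,
                   const std::vector<Slot*>& slots)
{
    // record the new scales in the shared adapter table
    for (const auto& update : updates) {
        int id = update.first;
        if (id >= 0 && id < (int)g_lora_adapters.size())
            g_lora_adapters[id].scale = update.second;
    }

    // flag every slot; Slot::prefill() clears its KV cache and history on the
    // next request, so completions never mix old and new adapter strengths
    for (Slot* slot : slots)
        slot->mark_for_refresh();
}

} // namespace server
} // namespace lf
```

Marking rather than immediately resetting keeps the update cheap: the cost of clearing the KV cache is paid lazily by whichever request next claims the slot.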