2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ llm install llm-llama-server
```
## Usage

You'll need to be running a [llama-server](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) on port 8080 to use this plugin.
You'll need a running instance of [llama-server](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md). By default, the plugin connects to `http://localhost:8080`. You can point it at a different host and port by setting the `LLAMACPP_HOST` environment variable, for example: `export LLAMACPP_HOST="http://your-server-ip:8080"`.

You can `brew install llama.cpp` to obtain that binary. Then run it like this:
```bash
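As a rough sketch of the configuration path this README change describes, the snippet below drives the plugin against a remote llama-server through `llm`'s Python API. It assumes the plugin registers the model id `llama-server` (only `model_name` is visible in this diff) and that `LLAMACPP_HOST` is set before `llm` loads its plugins; the host URL is a placeholder.

```python
# Illustrative only: point the plugin at a remote llama-server from Python.
# Equivalent shell usage would be:
#   export LLAMACPP_HOST="http://your-server-ip:8080"
#   llm -m llama-server "Say hello in one sentence."
import os

# Must be set before llm loads this plugin, because the host is read in __init__.
os.environ["LLAMACPP_HOST"] = "http://your-server-ip:8080"

import llm

model = llm.get_model("llama-server")  # model id assumed from the plugin's registration
response = model.prompt("Say hello in one sentence.")
print(response.text())
```

Setting the variable in the shell before invoking `llm` is the more robust route, since plugin loading order inside a Python process is harder to control.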
11 changes: 9 additions & 2 deletions llm_llama_server.py
@@ -1,4 +1,5 @@
import llm
import os
from llm.default_plugins.openai_models import Chat, AsyncChat


@@ -7,10 +8,13 @@ class LlamaServer(Chat):
key = "sk-llama-server"

def __init__(self, **kwargs):
host = os.getenv("LLAMACPP_HOST", "http://localhost:8080")
api_base_url = f"{host.rstrip('/')}/v1"

super().__init__(
model_name="llama-server",
model_id=self.model_id,
api_base="http://localhost:8080/v1",
api_base=api_base_url,
**kwargs,
)

@@ -23,10 +27,13 @@ class AsyncLlamaServer(AsyncChat):
key = "sk-llama-server"

def __init__(self, **kwargs):
host = os.getenv("LLAMACPP_HOST", "http://localhost:8080")
api_base_url = f"{host.rstrip('/')}/v1"

super().__init__(
model_name="llama-server",
model_id=self.model_id,
api_base="http://localhost:8080/v1",
api_base=api_base_url,
**kwargs,
)

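To make the behavior of the two identical `__init__` additions easier to see, here is the host-resolution logic pulled out into a standalone helper with a couple of quick checks. The helper name `resolve_api_base` is illustrative and not part of the plugin.

```python
import os


def resolve_api_base(default: str = "http://localhost:8080") -> str:
    """Return the OpenAI-compatible /v1 base URL for llama-server."""
    host = os.getenv("LLAMACPP_HOST", default)
    # rstrip("/") tolerates a trailing slash in the configured host.
    return f"{host.rstrip('/')}/v1"


if __name__ == "__main__":
    os.environ.pop("LLAMACPP_HOST", None)
    assert resolve_api_base() == "http://localhost:8080/v1"

    os.environ["LLAMACPP_HOST"] = "http://your-server-ip:8080/"
    assert resolve_api_base() == "http://your-server-ip:8080/v1"
    print("host resolution behaves as expected")
```

Factoring the lookup this way would also remove the duplication between `LlamaServer` and `AsyncLlamaServer`, though the patch as written keeps the two blocks separate.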