diff --git a/README.md b/README.md
index 544ec26..b41d425 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ llm install llm-llama-server
 ```
 ## Usage
 
-You'll need to be running a [llama-server](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) on port 8080 to use this plugin.
+You'll need a running instance of [llama-server](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md). By default, the plugin connects to `http://localhost:8080`. You can point it at a different host and port by setting the `LLAMACPP_HOST` environment variable, for example: `export LLAMACPP_HOST="http://your-server-ip:8080"`.
 
 You can `brew install llama.cpp` to obtain that binary. Then run it like this:
 ```bash
diff --git a/llm_llama_server.py b/llm_llama_server.py
index 83306ca..79d4b62 100644
--- a/llm_llama_server.py
+++ b/llm_llama_server.py
@@ -1,4 +1,5 @@
 import llm
+import os
 
 from llm.default_plugins.openai_models import Chat, AsyncChat
 
@@ -7,10 +8,13 @@ class LlamaServer(Chat):
     key = "sk-llama-server"
 
     def __init__(self, **kwargs):
+        host = os.getenv("LLAMACPP_HOST", "http://localhost:8080")
+        api_base_url = f"{host.rstrip('/')}/v1"
+
         super().__init__(
             model_name="llama-server",
             model_id=self.model_id,
-            api_base="http://localhost:8080/v1",
+            api_base=api_base_url,
             **kwargs,
         )
 
@@ -23,10 +27,13 @@ class AsyncLlamaServer(AsyncChat):
     key = "sk-llama-server"
 
     def __init__(self, **kwargs):
+        host = os.getenv("LLAMACPP_HOST", "http://localhost:8080")
+        api_base_url = f"{host.rstrip('/')}/v1"
+
         super().__init__(
             model_name="llama-server",
             model_id=self.model_id,
-            api_base="http://localhost:8080/v1",
+            api_base=api_base_url,
             **kwargs,
         )
 
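
For reference, a minimal usage sketch of the new override. The server URL below is a placeholder, and the `llama-server` model id is assumed from the `model_name` used in the diff:

```bash
# Point the plugin at a llama-server on a non-default host/port
# (placeholder address; substitute your own server)
export LLAMACPP_HOST="http://192.168.1.50:8080"

# Prompt through the plugin; the "llama-server" model id is assumed here
llm -m llama-server "Say hello from the llama"
```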