Draft
162 commits
8f1cee1
add inference module, entrypoint
elikoga Nov 26, 2025
5f71d8a
feat: implement model management with download and load into gpu
elikoga Nov 26, 2025
d7b029b
add code review changes
elikoga Dec 2, 2025
45563ab
feat: dynamic port
elikoga Dec 2, 2025
b3ae3f8
move inference module
elikoga Dec 2, 2025
f8005ba
feat: implement model unloading functionality and API endpoint
elikoga Dec 2, 2025
53ce3a1
integrate with skvaider
elikoga Dec 2, 2025
db9bb68
refactor: remove ModelManager call in skvaider and run inference in a…
elikoga Dec 3, 2025
20a48d7
feat: update model configurations and enhance test assertions for emb…
elikoga Dec 3, 2025
0aaa63b
feat: update inference server port handling and improve test endpoint…
elikoga Dec 3, 2025
3bde3be
feat: readd support for Ollama backend and parameterize model names i…
elikoga Dec 3, 2025
6171556
feat: add health check for backends during lifespan test
elikoga Dec 3, 2025
315d6da
change http error code, re-add ollama to lifespan
elikoga Dec 3, 2025
31ee225
feat: refactor openai proxy file structure
elikoga Dec 3, 2025
1e2abc6
feat: update ModelManager initialization to use models directory from…
elikoga Jan 13, 2026
9953f5a
feat: rename load endpoint to get_running_model_or_load and update re…
elikoga Jan 13, 2026
e9c3a46
feat: update download_model to use models directory from ModelManager
elikoga Jan 19, 2026
7edb8a8
make /download endpoint input format more aligned with other api
elikoga Jan 19, 2026
c6cb0d8
unset default for context_size
elikoga Jan 19, 2026
85d854e
move health endpoint to /manager/health , fix tests
elikoga Jan 20, 2026
0f4a6d5
add filename to model configuration in test lifespan
elikoga Jan 20, 2026
fc1c515
add proxy request endpoint to interact with models
elikoga Jan 20, 2026
074f161
add filename to model configuration in test lifespan
elikoga Jan 20, 2026
1f2c95f
update backend configuration to use 'ollama' and adjust health check …
elikoga Jan 20, 2026
d97ef44
fix health check endpoint URL in model manager
elikoga Jan 20, 2026
0f0fafa
refactor: move model management logic from ModelManager to RunningModel
ctheune Jan 19, 2026
b1010b1
Slight cleanups
ctheune Jan 19, 2026
6b61a68
fix: properly cancel monitor task on model termination
ctheune Jan 19, 2026
0fa2cab
refactorings and get test to let manager start a model working again
ctheune Jan 20, 2026
158df6b
wording
ctheune Jan 20, 2026
df23b31
disable the web ui
ctheune Jan 20, 2026
d1dc350
xxx notes
ctheune Jan 20, 2026
85cb7ba
update devenv
ctheune Jan 20, 2026
c59bdf3
- handle llama-server crashes more cleanly and simplify the
ctheune Jan 20, 2026
32ee897
snapshot / inference cleanup:
ctheune Jan 20, 2026
8bf1bab
fix and refactor more tests
ctheune Jan 21, 2026
6c9648b
move logging configuration to the end of lifespan function because it…
elikoga Jan 21, 2026
6258d18
rip out ollama
ctheune Jan 21, 2026
2b6da44
ignore access_log
ctheune Jan 21, 2026
d2503dc
get tests clean again, devenv up delivers working environmnt
ctheune Jan 21, 2026
2cf766b
refactor proxy request handling to simplify model loading and endpoin…
elikoga Jan 21, 2026
466d3d0
increase timeout of llama-server crash test with bad arguments
elikoga Jan 21, 2026
d7e224c
increase timeout for AsyncClient in proxy_request and enhance error l…
elikoga Jan 21, 2026
7ab6536
improve error logging in proxy_request by using log.exception
elikoga Jan 22, 2026
f8ee8df
fix type issue in backend assignments
ctheune Jan 22, 2026
01cd126
ensure strict type checking mode
ctheune Jan 22, 2026
6d48d91
tried adding basedpyright but 2k errors are too much.
ctheune Jan 22, 2026
0322d74
edit endpoints
elikoga Jan 22, 2026
86d7c69
run only one request in skvaider monitor_health_and_update_models
elikoga Jan 22, 2026
ee87ccf
add test for multiple streaming requests with pool management
elikoga Jan 22, 2026
977a623
cleanup more typing
ctheune Jan 22, 2026
cb58275
allow choosing custom llama-server instances
ctheune Jan 22, 2026
f12402d
fix type annotation
ctheune Jan 22, 2026
b17c0a8
load model if it's not loaded anywhere anytime I see it during period…
elikoga Jan 22, 2026
a564976
cleanup
ctheune Jan 22, 2026
a705787
refactor: update model configuration to support multiple files and im…
elikoga Jan 22, 2026
9a1d356
update model configuration structure
elikoga Jan 22, 2026
96962b7
add test for downloading split model files
elikoga Jan 23, 2026
21cb117
Introduce strong typing assertions for the main code base.
ctheune Jan 23, 2026
2b1d445
clean up the llama-server commandline
ctheune Jan 23, 2026
e1e890c
ensure we use a reliable lookup for unqualified llama-server program …
ctheune Jan 23, 2026
a62568c
benchmark for embedding distances in different settings
ctheune Jan 23, 2026
a5bf91f
benchmark: add cosine similarity
ctheune Jan 23, 2026
4af967b
benchmark: heading structure
ctheune Jan 23, 2026
4ff20b4
benchmark: highlight "same precision" combinations
ctheune Jan 23, 2026
0547a73
Add model health monitoring and add tests for health checks
elikoga Jan 23, 2026
090e41d
benchmark: add cosine angles, add our current ollama baseline
ctheune Jan 23, 2026
25a35fb
benchmark: add more
ctheune Jan 23, 2026
6e00ce1
update stability research documentation
ctheune Jan 27, 2026
47714c2
update stability research docs
ctheune Jan 27, 2026
aa3365f
typo
ctheune Jan 27, 2026
489fb3e
update stability research docs
ctheune Jan 27, 2026
9f3f686
formatting
ctheune Jan 27, 2026
55debcb
typo
ctheune Jan 27, 2026
1f0b4d5
update stability research docs
ctheune Jan 27, 2026
c4b2704
stability research:
ctheune Jan 27, 2026
a15c733
stability research doc update: wrap up by providing links to our model
ctheune Jan 27, 2026
a40a506
formatting
ctheune Jan 27, 2026
e2fa044
wording
ctheune Jan 27, 2026
459c836
test fluctuations
ctheune Jan 27, 2026
5ba22f5
inference: cleanup, stability tests
ctheune Jan 27, 2026
5a86b9f
Add embedding verification support
elikoga Jan 28, 2026
0e8e862
allow embedding health check to batch request an have 1e-5 of numeric…
elikoga Jan 29, 2026
4a1c7e7
increase health check logging for embedding value mismatches
elikoga Jan 29, 2026
49016f5
increase tolerance for embedding value mismatches in health check
elikoga Jan 29, 2026
7d5d7fa
increase tolerance for embedding value mismatches in health check
elikoga Jan 29, 2026
6795ea4
revamp status management for models
ctheune Jan 29, 2026
15a96d7
test fixes
ctheune Jan 29, 2026
2c196d8
fix stupid merge mistake
ctheune Jan 29, 2026
07c80aa
Update embeddinggemma output stability test to use expected output fr…
elikoga Jan 29, 2026
6be6657
Update virtual environment path in pyproject.toml
elikoga Jan 29, 2026
6419dcc
Replace file rename with shutil.move to handle cross-filesystem issue…
elikoga Jan 29, 2026
960c173
Add step for uv venv prepare to pre-commit workflow
elikoga Jan 29, 2026
5a7c1df
increase test_embeddinggemma_output_stability timeout for GH actions
elikoga Jan 29, 2026
b9db5e5
increase test timeouts for github actions
elikoga Jan 29, 2026
8b3987c
Normalize model names to lowercase in inference endpoints and configu…
elikoga Jan 29, 2026
507d1b9
update llama-cpp to remove mentions of gpt-3.5-turbo in the output
elikoga Jan 29, 2026
1c11876
Change embeddinggemma output stability test to validate embedding val…
elikoga Jan 29, 2026
7a41f9a
add monitoring for vram usage
ctheune Jan 29, 2026
8a5c657
give CLAUDE.md a try
ctheune Jan 29, 2026
08664d6
extend memory management to allow inspecting real model usage
ctheune Jan 29, 2026
e2160f2
improve logging for memory usage info
ctheune Jan 30, 2026
9c3cbe5
use new memory calculations for placing models on backends
ctheune Jan 30, 2026
6139900
inference: improve recovery from timeouts when loading models
ctheune Jan 30, 2026
5c6d5df
proxy: serialise loading models per backend.
ctheune Jan 30, 2026
11f6cfd
increase model loading timeout
ctheune Jan 30, 2026
4d51ec0
asyncio: clean up task management and support unique/dedup tasks
ctheune Jan 30, 2026
9255d13
add info that this is a typed package to remove missing stub warnings
ctheune Jan 30, 2026
7ec49fb
improve logging and fix a logging error
ctheune Jan 30, 2026
a341177
document the task manager a bit
ctheune Jan 30, 2026
f6f9505
rename "warmup" to "reserved"
ctheune Feb 2, 2026
d50e03c
rename "load_model_with_options" to "load_model"
ctheune Feb 2, 2026
db8d7b7
proxy: don't try loading a model if the fitness has dropped to 0
ctheune Feb 2, 2026
1ebd205
proxy: first implementation of automatically unloading models
ctheune Feb 2, 2026
40e2c57
proxy: wrap up unloading models on demand and also add test coverage
ctheune Feb 3, 2026
6f0b7cf
proxy: implement backend availability check and retry logic
elikoga Feb 3, 2026
90e0e29
inference: ensure models can't be unloaded while being used
ctheune Feb 3, 2026
04b73cb
fix: handle case where task is already removed in cleanup callback
elikoga Feb 6, 2026
27768ed
fix: update proxy endpoint to return 540 status code for unavailable …
elikoga Feb 6, 2026
fcaee29
fix: ensure proper shutdown of fake llama server to avoid blocking
elikoga Feb 6, 2026
41936b5
prevent KeyError in cleanup callback by using pop with default
elikoga Feb 9, 2026
0d0dc61
tests: add wait_for_models_active function to ensure model instances …
elikoga Feb 9, 2026
1b00012
Add metrics endpoint
elikoga Feb 11, 2026
eaf732a
inference: simplify monitoring memory usage by
ctheune Feb 10, 2026
52af250
gateway: revamp model loading and request queueing
ctheune Feb 12, 2026
cda3731
fix: add resource and backend details to overload warning in Pool class
elikoga Feb 13, 2026
bb285e0
fix database access timeouts
ctheune Feb 13, 2026
3531756
increase health check intervals and timeouts for temporary resiliency
ctheune Feb 13, 2026
0fc4063
increase timeout for non-streaming
ctheune Feb 13, 2026
df8832c
add configuration validation scripts for skvaider
elikoga Feb 16, 2026
1749659
fix typo: skavider -> skvaider
elikoga Feb 16, 2026
dae18ff
read config file also from cli arg in check config
elikoga Feb 16, 2026
cf3b890
add debug logging and early exit for empty stdout in ROCmMemoryMonitor
elikoga Feb 18, 2026
bf19dba
add Nvidia memory monitor
elikoga Feb 18, 2026
817d1b4
Refactor test code, rely on DummyBackend and add reused patterns to c…
elikoga Mar 9, 2026
cd33195
refactor, move test mocks to conftest
elikoga Mar 12, 2026
1d3eaff
add test for model memory and placement
elikoga Mar 12, 2026
8a8e68d
add pool semaphore tests, fix bug if all are busy
elikoga Mar 12, 2026
57d0301
add test for size parsing in config
elikoga Mar 12, 2026
7eee65a
backend factory remove url param
elikoga Mar 12, 2026
4d5d7e8
fix devenv up in claude readme
elikoga Mar 12, 2026
71ed732
add slugify, task_manager tests
elikoga Mar 12, 2026
f6868d4
add tests for RAM usage and metrics endpoints
elikoga Mar 12, 2026
251868d
ci: update nix-quick-install-action to v34, add nix cache to pre-comm…
elikoga Mar 12, 2026
0d5fa78
ci: restore devenv.lock after venv setup to keep working tree clean
elikoga Mar 12, 2026
064783a
extract resource monitors to separate module
ctheune Feb 24, 2026
12c9985
extend gitignore
ctheune Mar 13, 2026
c7f8771
update devenv and pin nixos revision
ctheune Mar 13, 2026
007d8b2
switch runner to uvicorn and pass config file as cmdline arg
ctheune Mar 13, 2026
a5b9003
proxy: suppress messages when clients disconnect unexpectedly
ctheune Mar 13, 2026
69f92d2
proxy/auth: cache lookups to reduce a serious performance bottleneck
ctheune Mar 13, 2026
e2f885a
inference: provide separate llama and vllm based models and runners
ctheune Mar 17, 2026
5251cf0
health check: don't run health checks while the models are busy
ctheune Mar 17, 2026
a9b245a
ignore unknown models reported by the inference servers
ctheune Mar 17, 2026
6b43a20
minor cleanups, ensure we use ruff for pre-commit and editing
ctheune Mar 17, 2026
3e7cc04
resource monitoring: bugfix missing resource types
ctheune Mar 17, 2026
b55d4e3
proxy: ignore unhealthy backends when rebalancing models
ctheune Mar 17, 2026
8f7d383
proxy: increase log output, fix bug retrying unavailable backends
ctheune Mar 17, 2026
4a3c635
inference/proxy: ensure we pass through the content type header
ctheune Mar 17, 2026
bd6db76
inference: do not allow configs with duplicate ports
ctheune Mar 17, 2026
8fd66c3
snapshot: move tests towards a working state
ctheune Mar 17, 2026
a3e688f
wrap up getting the tests green again
ctheune Mar 17, 2026
18 changes: 17 additions & 1 deletion .github/workflows/pre-commit.yaml
@@ -9,6 +9,22 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: nixbuild/nix-quick-install-action@v34
- name: Restore and save Nix store
uses: nix-community/cache-nix-action@v6
with:
primary-key: nix-${{ runner.os }}-${{ hashFiles('**/devenv.nix', '**/devenv.lock') }}
restore-prefixes-first-match: nix-${{ runner.os }}-
gc-max-store-size: 5G
purge: true
purge-prefixes: nix-${{ runner.os }}-
purge-created: 0
purge-last-accessed: 0
purge-primary-key: never
- name: Install devenv.sh
run: nix profile install nixpkgs#devenv
- name: Set up Python venv
run: devenv shell uv sync && git restore devenv.lock
- uses: actions/setup-python@v4
- uses: pre-commit/action@v3.0.0
11 changes: 5 additions & 6 deletions .github/workflows/test.yaml
@@ -10,7 +10,7 @@ jobs:
runs-on: "${{ matrix.os }}"
steps:
- uses: actions/checkout@v5
- uses: nixbuild/nix-quick-install-action@v33
- uses: nixbuild/nix-quick-install-action@v34
- name: Restore and save Nix store
uses: nix-community/cache-nix-action@v6
with:
@@ -25,13 +25,12 @@
purge-primary-key: never
- name: Install devenv.sh
run: nix profile install nixpkgs#devenv
- name: Ollama Model Directorys
id: ollama-models
- name: Model Directories
id: models
uses: actions/cache@v4
with:
path: |
.ollama1
.ollama2
key: ${{ runner.os }}-ollama-models
.models
key: ${{ runner.os }}-models
- name: Run tests
run: devenv test
9 changes: 7 additions & 2 deletions .gitignore
@@ -5,6 +5,8 @@ build/
dist/
wheels/
*.egg-info
.claude/
.zed/

# Virtual environments
.venv
@@ -18,6 +20,7 @@ result
# test generated files
.coverage*
htmlcov
.models

# Devenv
.devenv*
@@ -29,6 +32,8 @@ devenv.local.nix
.aramaki-workdir

.DS_store
.access_log
.access_log*

.ollama?
.ollama*/models*
models/
models-2/
79 changes: 40 additions & 39 deletions .pre-commit-config.yaml
@@ -1,41 +1,42 @@
exclude: ^secrets/|^appenv$
repos:
- hooks:
- id: detect-private-key
- id: check-added-large-files
- exclude: "(?x)^(\n secrets/|environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: trailing-whitespace
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: end-of-file-fixer
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-yaml
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-json
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-xml
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-toml
repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
- hooks:
- args:
- --profile
- black
- --filter-files
id: isort
name: isort (python)
repo: https://github.com/pycqa/isort
rev: 6.0.1
- hooks:
- id: black
repo: https://github.com/psf/black
rev: 25.1.0
- hooks:
- args:
- --ignore
- E501
- --ignore
- F401
id: ruff
repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
- hooks:
- id: detect-private-key
- id: check-added-large-files
- exclude: "(?x)^(\n secrets/|environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: trailing-whitespace
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: end-of-file-fixer
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-yaml
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-json
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-xml
- exclude: "(?x)^(\n environments/.*/secret.*|\n .*\\.patch\n)$\n"
id: check-toml
repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
- hooks:
- args:
- --profile
- black
- --filter-files
id: isort
name: isort (python)
repo: https://github.com/pycqa/isort
rev: 7.0.0
- hooks:
- args:
- --ignore
- E501
- --ignore
- F401
id: ruff
- id: ruff-format
repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.14
- repo: https://github.com/DetachHead/basedpyright-pre-commit-mirror
rev: 1.37.1 # or whatever the latest version is at the time
hooks:
- id: basedpyright
148 changes: 148 additions & 0 deletions CLAUDE.md
@@ -0,0 +1,148 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Build and Test Commands

```bash
# Enter development shell (requires Nix + devenv)
devenv shell

# Run all tests
run-tests

# Run a single test file
uv run pytest src/skvaider/inference/tests/test_manager.py -vv

# Run a specific test
uv run pytest src/skvaider/inference/tests/test_manager.py::test_manager_start_model -vv

# Start all services in the background (terminal stays free)
devenv up -d

# Stop background services
devenv down

# Type checking, linting, formatting, etc. all in one:
pre-commit run -a

```

## Architecture

Skvaider is an OpenAI-compatible API proxy with two parts.


### The OpenAI compatible gateway facing application clients (`skvaider:app_factory()`)

Routes requests to inference backends with load balancing, authentication, health checks and resource management.

- **Entry point**: `src/skvaider/__init__.py`
- **Config file**: `config.toml`
- **Port**: 8000

Key components:
- `proxy/pool.py` - Request queue and backend load balancing
- `proxy/backends.py` - Backend interface (SkvaiderBackend)
- `routers/openai.py` - OpenAI-compatible endpoints (`/openai/v1/...`)
- `auth.py` - Token authentication via aramaki

### Inference server (`skvaider.inference:app_factory()`)

Runs local LLMs via llama-server subprocesses.

- **Entry point**: `src/skvaider/inference/__init__.py`
- **Config file**: `config-inference-{1,2}.toml` (via `SKVAIDER_CONFIG_FILE` env var)
- **Ports**: 8001, 8002

Key components:
- `inference/manager.py` - Model lifecycle (download, start, health check, terminate)
- `inference/routers/models.py` - Model management endpoints (`/models/{name}/load`, `/models/{name}/proxy/{path}`)
- `inference/routers/manager.py` - Health and VRAM usage endpoints

### Aramaki (`src/aramaki/`)

WebSocket-based distributed state management for authentication tokens.

Aramaki is intended to be split off later into a separate package. It is extremely important that no references (imports) from aramaki (`src/aramaki`) to the skvaider code base (`src/skvaider`) are
introduced under any circumstances.

- `manager.py` - WebSocket connections and subscriptions
- `collection.py` - Collection protocol and replication
- `db.py` - SQLite persistence

## Request Flow

1. Client → Proxy (`/openai/v1/chat/completions`)
2. Proxy authenticates via aramaki tokens
3. Pool assigns the request to the least-loaded backend, batching requests that arrive at the same time.
4. Backend proxies to inference server (`/models/{model}/proxy/v1/chat/completions`)
5. Proxy starts models as needed (llama-server subprocess). At least one reserved model instance should always be available. Additional models are stopped and started as needed.
6. Response streams back through the chain

## Model Status System

Models track two status dimensions (inspired by Ceph):
- `process_status`: stopped → starting → running → stopping
- `health_status`: "" → healthy/unhealthy

Combined into `status` set with "active" (running+healthy) or "inactive".
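The combination of the two dimensions could look roughly like this (an assumed sketch of the idea, not the project's real status code):

```
def combined_status(process_status: str, health_status: str) -> set[str]:
    # Collect both dimensions, then derive the combined flag:
    # "active" only when the process is running AND healthy.
    status = {process_status}
    if health_status:
        status.add(health_status)
    if process_status == "running" and health_status == "healthy":
        status.add("active")
    else:
        status.add("inactive")
    return status
```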

## Configuration

Pydantic models in `config.py` files. Key patterns:
- Model files: URL + SHA256 hash for verification
- Logging: structlog with IP anonymization
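The URL-plus-SHA256 pattern for model files amounts to hashing the downloaded file and comparing against the configured digest; a minimal sketch (`verify_file` is an illustrative helper, not the project's actual function):

```
import hashlib


def verify_file(path: str, expected_sha256: str, chunk_size: int = 1 << 20) -> bool:
    # Hash in chunks so multi-gigabyte GGUF files don't need to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256
```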

## Code Style

- "-> None" is not needed on `__init__` methods
- When filtering lists inside a loop, prefer the `guardian` pattern (early `continue`) to avoid deep indentation.

Good:

```
for x in mylist:
if not condition(x):
continue
... do the happy path work ...
```

Bad:

```
for x in mylist:
if condition(x):
... do the happy path work ...
```

- Do not add superfluous comments to code that is already there. When commenting new
code you generate, skip comments that merely restate what the code already reads like
or is obviously doing; stick to higher-order "why" comments.

bad examples:

```
# do the foo bar thing
do_foo_bar()

# Get per-process VRAM usage from --showpids
await self._update_per_model_vram_rocm()

# Get total VRAM from --showmeminfo
proc = await asyncio.create_subprocess_exec(
"rocm-smi",
"--json",
"--showmeminfo",
"all",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
```

- if you log an exception, use the log.exception() function to ensure we see a proper traceback

- basedpyright strict mode
- black + isort (line length 80)
- ruff (ignoring E501, F401)
41 changes: 41 additions & 0 deletions config-inference-1.toml
@@ -0,0 +1,41 @@
# Sample config used by the development environment
models_dir = "models"

[server]
host = "127.0.0.1"
port = 8001

[logging]
log_level = "DEBUG"
access_log_path = ".access_log-inference1"

[[openai.models]]
type = "llama-server"
id = "gemma"
context_size = 4096
port = 8100
cmd_args = []
max_requests = 21

[[openai.models.files]]
url = "https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-UD-Q4_K_XL.gguf?download=true"
hash = "e5420636e0cbfee24051ff22e9719380a3a93207a472edb18dd0c89a95f6ef80"

[[openai.models]]
type = "llama-server"
id = "embeddinggemma"
cmd_args = ["--embeddings"]
port = 8101
context_size = 4096

[[openai.models.files]]
url = "https://huggingface.co/unsloth/embeddinggemma-300m-GGUF/resolve/main/embeddinggemma-300M-F32.gguf"
hash = "a3125072128fc76d1c1d8d19f7b095c7e3bfbf00594dcf8a8bd3bcb334935d57"

# It would be useful to have a reasoning model, but the 12G are unwieldy
# for local development and CI/CD caching.
#
# [openai.models."unsloth/gpt-oss-20b-GGUF/gpt-oss-20b-UD-Q4_K_XL.gguf"]
# id = "gpt-oss"
# url = "https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/d449b42d93e1c2c7bda5312f5c25c8fb91dfa9b4/gpt-oss-20b-UD-Q4_K_XL.gguf"
# hash = "10fe673de12c20b74b8d670a9fdf0fd36b43b0a86ffc04daeb175c0a2b98c4f9"
42 changes: 42 additions & 0 deletions config-inference-2.toml
@@ -0,0 +1,42 @@
# Sample config used by the development environment
models_dir = "models-2"

[server]
host = "127.0.0.1"
port = 8002

[logging]
log_level = "DEBUG"
access_log_path = ".access_log-inference2"

[[openai.models]]
type = "llama-server"
id = "gemma"
context_size = 4096
cmd_args = []
max_requests = 42
port = 8200

[[openai.models.files]]
url = "https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-UD-Q4_K_XL.gguf"
hash = "e5420636e0cbfee24051ff22e9719380a3a93207a472edb18dd0c89a95f6ef80"

[[openai.models]]
type = "llama-server"
id = "embeddinggemma"
cmd_args = ["--embeddings"]
port = 8201
context_size = 4096

[[openai.models.files]]
url = "https://huggingface.co/unsloth/embeddinggemma-300m-GGUF/resolve/main/embeddinggemma-300M-F32.gguf"
hash = "a3125072128fc76d1c1d8d19f7b095c7e3bfbf00594dcf8a8bd3bcb334935d57"


# It would be useful to have a reasoning model, but the 12G are unwieldy
# for local development and CI/CD caching.
#
# [openai.models."unsloth/gpt-oss-20b-GGUF/gpt-oss-20b-UD-Q4_K_XL.gguf"]
# id = "gpt-oss"
# url = "https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/d449b42d93e1c2c7bda5312f5c25c8fb91dfa9b4/gpt-oss-20b-UD-Q4_K_XL.gguf"
# hash = "10fe673de12c20b74b8d670a9fdf0fd36b43b0a86ffc04daeb175c0a2b98c4f9"