Draft
Changes from all 68 commits
3e7ed29
ci: implement build matrix for CUDA/CPU containers with dynamic tagging
Feb 17, 2026
e536907
fix: Updated Docker images/build-container.yml
Feb 17, 2026
b195492
fix: Updated the documentation about Docker
Feb 17, 2026
d91ca1a
fix: Set Arch for 3090s
Feb 17, 2026
860ff29
Merge branch 'ikawrakow:main' into main
yadirhb Feb 17, 2026
06e6b6a
fix: Updated build step name.
Feb 17, 2026
95bb917
fix: Set target ARCH as a variable
Feb 17, 2026
c891df3
feat: Added cleanup step
Feb 17, 2026
5c9061f
feat: Added docker-bake and updated workflow
Feb 17, 2026
d919c9d
fix: Issue with REPO_OWNER variable
Feb 17, 2026
118fa91
fix: Updated workflow to solve errors
Feb 17, 2026
a403e99
fix: Updated branch format
Feb 17, 2026
be24cc7
fix: Wrong naming
Feb 17, 2026
ce574cb
Merge branch 'ikawrakow:main' into main
yadirhb Feb 18, 2026
90da4f4
Merge branch 'ikawrakow:main' into main
yadirhb Feb 20, 2026
910c8b7
Merge branch 'ikawrakow:main' into main
yadirhb Feb 22, 2026
a4f5e99
Merge branch 'ikawrakow:main' into main
yadirhb Mar 11, 2026
6b969fa
Update docker-bake.hcl
yadirhb Mar 11, 2026
7701845
Update build-container.yml
yadirhb Mar 12, 2026
eff7935
Update ik_llama-cuda.Containerfile
yadirhb Mar 12, 2026
041854a
Update ik_llama-cpu.Containerfile
yadirhb Mar 12, 2026
0aa63f8
Update docker-bake.hcl
yadirhb Mar 12, 2026
4172322
Update build-container.yml
yadirhb Mar 12, 2026
5d8a2ab
Added support for ccache
yadirhb Mar 12, 2026
9503572
Removed action/cache
yadirhb Mar 12, 2026
00ca93c
added -sSL for reliability and fixed the URL path
yadirhb Mar 12, 2026
3acad42
added -sSL for reliability and fixed the URL path CUDA containerfile
yadirhb Mar 12, 2026
b000c48
Merge branch 'ikawrakow:main' into main
yadirhb Mar 12, 2026
a9d7702
Merge branch 'main' into main
yadirhb Mar 27, 2026
ce34ce5
fix: correct Dockerfile RUN command syntax errors
yadirhb Mar 27, 2026
adf2480
fix: correct llama-swap download URL in Containerfiles
yadirhb Mar 27, 2026
eb15276
perf: improve ccache configuration in Containerfiles
yadirhb Mar 27, 2026
f172813
fix: remove problematic ccache initialization from Containerfiles
yadirhb Mar 27, 2026
810e8f6
fix: add git to CPU Containerfile build dependencies
yadirhb Mar 27, 2026
04df4fa
chore: optimize Containerfile with smaller images and better healthch…
yadirhb Mar 27, 2026
fd67fc8
chore: fix CUDA Containerfile healthchecks and swap version
yadirhb Mar 27, 2026
d77d86f
chore: fix indentation in Containerfiles and add LD_LIBRARY_PATH for …
yadirhb Mar 27, 2026
41fd6f1
fix: add --break-system-packages flag for pip in CPU Containerfile
yadirhb Mar 27, 2026
a054b29
Merge branch 'ikawrakow:main' into main
yadirhb Mar 27, 2026
cb1ae40
feat: add git bind mount for build info and NCCL support for CUDA
yadirhb Mar 27, 2026
5434147
fix: remove libnccl-dev from CUDA build (already included in base image)
yadirhb Mar 28, 2026
3bc90df
fix: added Markdown files to ignore files
yadirhb Mar 28, 2026
19a25ae
feat: use BUILD_NUMBER-COMMIT pattern for docker image tags
yadirhb Mar 28, 2026
634d1f0
fix: fetch full git history for accurate BUILD_NUMBER
yadirhb Mar 28, 2026
18f10c6
fix: fetch full git history in Dockerfile for accurate BUILD_NUMBER
yadirhb Mar 28, 2026
1060ba6
chore: update GitHub Actions to latest versions for Node.js 24 compat…
yadirhb Mar 28, 2026
4df6ff0
chore: update all GitHub Actions to Node.js 24 compatible versions
yadirhb Mar 28, 2026
f8abf34
fix: use CI-passed BUILD_NUMBER and LLAMA_COMMIT in Dockerfile
yadirhb Mar 28, 2026
3ee1a14
fix: pass BUILD_NUMBER and LLAMA_COMMIT as Docker build args
yadirhb Mar 28, 2026
b8ad58a
fix: revert docker actions to v4 (latest available versions)
yadirhb Mar 28, 2026
24836c5
fix: calculate BUILD_NUMBER and LLAMA_COMMIT directly in Containerfile
yadirhb Mar 28, 2026
0089016
feat: calculate BUILD_NUMBER and LLAMA_COMMIT in Containerfiles
yadirhb Mar 28, 2026
885d967
feat: calculate BUILD_NUMBER and LLAMA_COMMIT in Containerfiles
yadirhb Mar 28, 2026
3e4e6db
fix: cache improvements for CUDA and CPU builds
yadirhb Mar 28, 2026
47ea4e4
fix: "/.git": not found
yadirhb Mar 28, 2026
4ca3b37
fix: Unnecessary mv llama-swap
yadirhb Mar 28, 2026
73dd7c5
fix: Remove BUILD_NUMBER and LLAMA_COMMIT from docker file, calculate…
yadirhb Mar 28, 2026
985aa8c
fix: remove .git from dockerignore for local and CI builds
yadirhb Mar 29, 2026
c8e75e5
fix: Remove mounts key from Build and Push step in gh workflow
yadirhb Mar 29, 2026
8a2d388
ci: add .git verification step before build
yadirhb Mar 29, 2026
33914bf
refactor: standardize Containerfile structure and remove .git mount d…
yadirhb Mar 29, 2026
eee8ba4
ci: remove broken cache pruning step
yadirhb Mar 29, 2026
11e0a65
ci: remove broken prune-cache job
yadirhb Mar 29, 2026
39442a5
chore: Removed step for Verifying .git existance in GH workflow
yadirhb Mar 29, 2026
0462a5b
Merge branch 'ikawrakow:main' into main
yadirhb Mar 29, 2026
98484c3
fix: ensure build always proceeds even if git switch fails
yadirhb Mar 29, 2026
f589b59
Merge branch 'ikawrakow:main' into main
yadirhb Mar 31, 2026
cc80658
Merge branch 'ikawrakow:main' into main
yadirhb Apr 2, 2026
5 changes: 4 additions & 1 deletion .dockerignore
@@ -1,7 +1,8 @@
*.o
*.a
*.md
.cache/
.git/
.git
.github/
.gitignore
.vs/
@@ -18,3 +19,5 @@ models/*
arm_neon.h
compile_commands.json
Dockerfile

**/*.md
103 changes: 103 additions & 0 deletions .github/workflows/build-container.yml
@@ -0,0 +1,103 @@
name: Build and Push Docker Image

on:
  push:
    branches:
      - main

permissions:
  contents: read
  packages: write
  actions: read

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - variant: "cu12"
            cuda_version: "12.6.2"
            containerfile: "ik_llama-cuda.Containerfile"
          - variant: "cu13"
            cuda_version: "13.1.1"
            containerfile: "ik_llama-cuda.Containerfile"
          - variant: "cpu"
            cuda_version: "none"
            containerfile: "ik_llama-cpu.Containerfile"

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to GHCR
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Prepare Environment
        id: prep
        run: |
          echo "BUILD_NUMBER=$(git rev-list --count HEAD)" >> $GITHUB_ENV
          echo "LLAMA_COMMIT=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
          echo "REPO_LOWER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV

      # 5.1 Restore the cache from GitHub's storage to a host folder
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: .buildkit-cache
          key: ccache-${{ matrix.variant }}-${{ github.run_id }}
          restore-keys: |
            ccache-${{ matrix.variant }}-

      # 5.2 "Inject" that host folder into BuildKit's internal mount system
      - name: Inject ccache into BuildKit
        uses: reproducible-containers/buildkit-cache-dance@v3
        with:
          cache-map: |
            {
              ".buildkit-cache": "/ccache"
            }
          skip-extraction: ${{ github.event_name == 'pull_request' }}

      # 5.3 Build and push using the cache
      - name: Build and Push
        uses: docker/bake-action@v7
        env:
          REPO_OWNER: ${{ env.REPO_LOWER }}
          VARIANT: ${{ matrix.variant }}
          BUILD_NUMBER: ${{ env.BUILD_NUMBER }}
          LLAMA_COMMIT: ${{ env.LLAMA_COMMIT }}
          CUDA_VERSION: ${{ matrix.cuda_version }}
          GGML_NATIVE: "OFF" # Force OFF for CI portability
          USE_CCACHE: "true"
        with:
          push: true
          files: ./docker-bake.hcl
          set: |
            *.dockerfile=./docker/${{ matrix.containerfile }}
            *.cache-from=type=gha,scope=ccache-${{ matrix.variant }}
            *.cache-to=type=gha,mode=max,scope=ccache-${{ matrix.variant }}

  cleanup:
    runs-on: ubuntu-latest
    needs: build-and-push
    if: success()
    steps:
      - name: Delete untagged images
        uses: vlaurin/action-ghcr-prune@v0.6.0
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          organization: ${{ github.repository_owner }}
          container: ik-llama-cpp
          keep-younger-than: 0
          untagged-only: true
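The `Prepare Environment` step derives the build metadata with plain git commands, which is why the checkout uses `fetch-depth: 0` (a shallow clone would make `rev-list --count` wrong, per the "fetch full git history for accurate BUILD_NUMBER" commit). A self-contained sketch of what it computes, using a throwaway repo and an example owner name instead of the real checkout and `github.repository_owner`:

```shell
# Sketch of the "Prepare Environment" step, runnable anywhere.
# A throwaway git repo stands in for the CI checkout.
set -e
tmp=$(mktemp -d)
cd "$tmp"
git init -q
git -c user.email=ci@example.com -c user.name=ci commit -q --allow-empty -m "first"
git -c user.email=ci@example.com -c user.name=ci commit -q --allow-empty -m "second"
BUILD_NUMBER=$(git rev-list --count HEAD)   # total commits: a monotonic build id
LLAMA_COMMIT=$(git rev-parse --short HEAD)  # short hash baked into image tags
REPO_LOWER=$(echo "IkAwRaKoW" | tr '[:upper:]' '[:lower:]')  # GHCR requires lowercase
echo "$BUILD_NUMBER $REPO_LOWER"
# → 2 ikawrakow
```

In CI these values land in `$GITHUB_ENV` so later steps can pass them to the bake file as build args.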
59 changes: 59 additions & 0 deletions docker-bake.hcl
@@ -0,0 +1,59 @@
variable "REPO_OWNER" {}
variable "VARIANT" {}
variable "SHA_SHORT" {}
variable "BUILD_NUMBER" {}
variable "LLAMA_COMMIT" {}
variable "CUDA_VERSION" {}
variable "CUDA_DOCKER_ARCH" { default = "86;90" }
variable "USE_CCACHE" { default = "true" }
variable "GGML_NATIVE" { default = "ON" }

# Common cache configuration for GitHub Actions
target "cache_settings" {
  cache-from = ["type=gha,scope=ccache-${VARIANT}"]
  cache-to   = ["type=gha,mode=max,scope=ccache-${VARIANT}"]
}

group "default" {
  targets = ["server", "full", "swap"]
}

target "settings" {
  context  = "."
  inherits = ["cache_settings"]
  args = {
    BUILD_NUMBER     = "${BUILD_NUMBER}"
    LLAMA_COMMIT     = "${LLAMA_COMMIT}"
    CUDA_VERSION     = "${CUDA_VERSION}"
    CUDA_DOCKER_ARCH = "${CUDA_DOCKER_ARCH}"
    GGML_NATIVE      = "${GGML_NATIVE}"
    USE_CCACHE       = "${USE_CCACHE}"
  }
}

target "server" {
  inherits = ["settings"]
  target   = "server"
  tags = [
    "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-server-${BUILD_NUMBER}",
    "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-server"
  ]
}

target "full" {
  inherits = ["settings"]
  target   = "full"
  tags = [
    "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-full-${BUILD_NUMBER}-${LLAMA_COMMIT}",
    "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-full"
  ]
}

target "swap" {
  inherits = ["settings"]
  target   = "swap"
  tags = [
    "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-swap-${BUILD_NUMBER}-${LLAMA_COMMIT}",
    "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-swap"
  ]
}
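The three targets share the same build args via `settings` and differ only in the build stage and tag scheme. A quick way to see how the variables combine into a tag (the values below are examples, not taken from a real build):

```shell
# Example values only; a real run gets these from the CI environment.
REPO_OWNER=ikawrakow
VARIANT=cu12
BUILD_NUMBER=4242
LLAMA_COMMIT=3e7ed29
echo "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-full-${BUILD_NUMBER}-${LLAMA_COMMIT}"
# → ghcr.io/ikawrakow/ik-llama-cpp:cu12-full-4242-3e7ed29

# To resolve the whole bake file without building (requires Docker Buildx):
#   docker buildx bake -f docker-bake.hcl --print
```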
39 changes: 17 additions & 22 deletions docker/README.md
@@ -14,51 +14,46 @@ CPU or CUDA sections under [Build](#Build) and [Run](#Run) are enough to get up
- [Extra Features](#Extra)
- [Credits](#Credits)

# Build
## Build

Builds two image tags:

- `swap`: Includes only `llama-swap` and `llama-server`.
- `full`: Includes `llama-server`, `llama-quantize`, and other utilities.

Start: download the 4 files to a new directory (e.g. `~/ik_llama/`) then follow the next steps.
### Start:

```
└── ik_llama
├── ik_llama-cpu.Containerfile
├── ik_llama-cpu-swap.config.yaml
├── ik_llama-cuda.Containerfile
└── ik_llama-cuda-swap.config.yaml
```
1. Clone the repository as `git clone https://github.com/ikawrakow/ik_llama.cpp`
2. Enter the repo: `cd ik_llama.cpp`, then follow the next steps.

## CPU
### CPU

```
podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full && podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap
podman image build --format Dockerfile --file ./docker/ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full . && podman image build --format Dockerfile --file ./docker/ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap .
```

```
docker image build --file ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full . && docker image build --file ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap .
docker image build --file ./docker/ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full . && docker image build --file ./docker/ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap .
```

## CUDA
### CUDA

```
podman image build --format Dockerfile --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full && podman image build --format Dockerfile --file ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap
podman image build --format Dockerfile --file ./docker/ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && podman image build --format Dockerfile --file ./docker/ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap .
```

```
docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap .
docker image build --file ./docker/ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ./docker/ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap .
```

# Run
## Run

- Download `.gguf` model files to your favorite directory (e.g. `/my_local_files/gguf`).
- Map it to `/models` inside the container.
- Open browser `http://localhost:9292` and enjoy the features.
- API endpoints are available at `http://localhost:9292/v1` for use in other applications.

## CPU
### CPU

```
podman run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro localhost/ik_llama-cpu:swap
@@ -68,7 +63,7 @@ podman run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models
docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro ik_llama-cpu:swap
```

## CUDA
### CUDA

- Install Nvidia Drivers and CUDA on the host.
- For Docker, install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
@@ -85,13 +80,13 @@ podman run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models
docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro --runtime nvidia ik_llama-cuda:swap
```

# Troubleshooting
## Troubleshooting

- If CUDA is not available, use `ik_llama-cpu` instead.
- If models are not found, ensure you mount the correct directory: `-v /my_local_files/gguf:/models:ro`
- If you need to install `podman` or `docker` follow the [Podman Installation](https://podman.io/docs/installation) or [Install Docker Engine](https://docs.docker.com/engine/install) for your OS.

# Extra
## Extra

- `CUSTOM_COMMIT` can be used to build a specific `ik_llama.cpp` commit (e.g. `1ec12b8`).
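A sketch of a pinned build; note that wiring `CUSTOM_COMMIT` in via `--build-arg` is an assumption about how the Containerfile consumes it, and the tag suffix is just a convention, so adapt as needed:

```shell
# Build the CPU image at a pinned upstream commit (sketch; assumes the
# Containerfile reads CUSTOM_COMMIT as a build arg). The command is echoed
# rather than executed so it can be inspected first.
CUSTOM_COMMIT=1ec12b8
cmd="docker image build --file ./docker/ik_llama-cpu.Containerfile --build-arg CUSTOM_COMMIT=${CUSTOM_COMMIT} --target full --tag ik_llama-cpu:full-${CUSTOM_COMMIT} ."
echo "$cmd"
```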

@@ -121,7 +116,7 @@ docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --r
# ./llama-sweep-bench ...
```

- Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` localy (e.g. under `/my_local_files/`) then map it to `/app/config.yaml` inside the container appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your`podman run ...` or `docker run ...`.
- Customize `llama-swap` config: save the `./docker/ik_llama-cpu-swap.config.yaml` or `./docker/ik_llama-cuda-swap.config.yaml` locally (e.g. under `/my_local_files/`), then map it to `/app/config.yaml` inside the container by appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your `podman run ...` or `docker run ...`.
- To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`.
- If you build the image on a different machine, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`.
- If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`.
@@ -133,7 +128,7 @@ docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --r
- Download from [ik_llama.cpp's Thireus fork with release builds for macOS/Windows/Ubuntu CPU and Windows CUDA](https://github.com/Thireus/ik_llama.cpp) if you cannot build.
- For a KoboldCPP experience: [Croco.Cpp is a fork of KoboldCPP inferring GGML/GGUF models on CPU/CUDA with KoboldAI's UI. It is powered partly by ik_llama.cpp and is compatible with most of Ikawrakow's quants except Bitnet.](https://github.com/Nexesenex/croco.cpp)

# Credits
## Credits

All credits to the awesome community:
