diff --git a/.dockerignore b/.dockerignore index 8916e2a660..9f507cadac 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,13 +1,15 @@ *.o *.a +*.md .cache/ -.git/ + +# Ensure .git is NOT ignored so it can be mounted/copied +!.git .github/ .gitignore .vs/ .vscode/ .DS_Store - build*/ models/* @@ -18,3 +20,5 @@ models/* arm_neon.h compile_commands.json Dockerfile + +**/*.md \ No newline at end of file diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml new file mode 100644 index 0000000000..295063c9fd --- /dev/null +++ b/.github/workflows/build-container.yml @@ -0,0 +1,118 @@ +name: Build and Push Docker Image + +on: + push: + branches: + - main + +permissions: + contents: read + packages: write + actions: read + +jobs: + build-and-push: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - variant: "cu12" + cuda_version: "12.6.2" + containerfile: "ik_llama-cuda.Containerfile" + - variant: "cu13" + cuda_version: "13.1.1" + containerfile: "ik_llama-cuda.Containerfile" + - variant: "cpu" + cuda_version: "none" + containerfile: "ik_llama-cpu.Containerfile" + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history required: 'git rev-list --count HEAD' is used for BUILD_NUMBER and undercounts on a shallow clone + + - name: Free Disk Space (Ubuntu) + run: | + echo "Listing initial disk usage..." + df -h + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf /usr/lib/jvm + sudo docker image prune -af + echo "Listing disk usage after cleanup..." 
+ df -h + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v4 + + - name: Log in to GHCR + uses: docker/login-action@v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Prepare Environment + id: prep + run: | + echo "BUILD_NUMBER=$(git rev-list --count HEAD)" >> $GITHUB_ENV + echo "LLAMA_COMMIT=$(git rev-parse --short HEAD)" >> $GITHUB_ENV + echo "REPO_LOWER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV + + # 5.1 Restore the cache from GitHub's storage to a host folder + - name: Cache ccache + uses: actions/cache@v4 + with: + path: .buildkit-cache + key: ccache-${{ matrix.variant }}-${{ github.run_id }} + restore-keys: | + ccache-${{ matrix.variant }}- + + # 5.2. "Inject" that host folder into BuildKit's internal mount system + - name: Inject ccache into BuildKit + uses: reproducible-containers/buildkit-cache-dance@v3 + with: + cache-map: | + { + ".buildkit-cache": "/ccache" + } + skip-extraction: ${{ github.event_name == 'pull_request' }} + + # 5.3 Build and push using the cache + - name: Build and Push + uses: docker/bake-action@v7 + env: + REPO_OWNER: ${{ env.REPO_LOWER }} + VARIANT: ${{ matrix.variant }} + BUILD_NUMBER: ${{ env.BUILD_NUMBER }} + LLAMA_COMMIT: ${{ env.LLAMA_COMMIT }} + CUDA_VERSION: ${{ matrix.cuda_version }} + GGML_NATIVE: "OFF" # Force OFF for CI portability + USE_CCACHE: "true" + with: + push: true + files: ./docker-bake.hcl + set: | + *.context=. + *.dockerfile=./docker/${{ matrix.containerfile }} + *.cache-from=type=gha,scope=ccache-${{ matrix.variant }} + *.cache-to=type=gha,mode=max,scope=ccache-${{ matrix.variant }} + source: . 
+ + cleanup: + runs-on: ubuntu-latest + needs: build-and-push + if: success() + steps: + - name: Delete untagged images + uses: vlaurin/action-ghcr-prune@v0.6.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + organization: ${{ github.repository_owner }} + container: ik-llama-cpp + keep-younger-than: 0 + untagged-only: true \ No newline at end of file diff --git a/docker-bake.hcl b/docker-bake.hcl new file mode 100644 index 0000000000..568503663e --- /dev/null +++ b/docker-bake.hcl @@ -0,0 +1,58 @@ +variable "REPO_OWNER" { default = "local" } +variable "VARIANT" { default = "cpu" } +variable "BUILD_NUMBER" { default = "0" } +variable "LLAMA_COMMIT" { default = "local-dev" } +variable "CUDA_VERSION" {} +variable "CUDA_DOCKER_ARCH" { default = "86;90" } +variable "USE_CCACHE" { default = "true" } +variable "GGML_NATIVE" { default = "ON" } + +# Common cache configuration for GitHub Actions +target "cache_settings" { + cache-from = ["type=gha,scope=ccache-${VARIANT}"] + cache-to = ["type=gha,mode=max,scope=ccache-${VARIANT}"] +} + +group "default" { + targets = ["server", "full", "swap"] +} + +target "settings" { + context = "." 
+ inherits = ["cache_settings"] + args = { + BUILD_NUMBER = "${BUILD_NUMBER}" + BUILD_COMMIT = "${LLAMA_COMMIT}" + CUDA_VERSION = "${CUDA_VERSION}" + CUDA_DOCKER_ARCH = "${CUDA_DOCKER_ARCH}" + GGML_NATIVE = "${GGML_NATIVE}" + USE_CCACHE = "${USE_CCACHE}" + } +} + +target "server" { + inherits = ["settings"] + target = "server" + tags = [ + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-server-${BUILD_NUMBER}", + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-server" + ] +} + +target "full" { + inherits = ["settings"] + target = "full" + tags = [ + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-full-${BUILD_NUMBER}-${LLAMA_COMMIT}", + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-full" + ] +} + +target "swap" { + inherits = ["settings"] + target = "swap" + tags = [ + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-swap-${BUILD_NUMBER}-${LLAMA_COMMIT}", + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-swap" + ] +} \ No newline at end of file diff --git a/docker-bake.override.hcl b/docker-bake.override.hcl new file mode 100644 index 0000000000..cf10286995 --- /dev/null +++ b/docker-bake.override.hcl @@ -0,0 +1,15 @@ +# Local development override - automatically sets BUILD_NUMBER and BUILD_COMMIT +variable "BUILD_NUMBER" { default = "0" } +variable "BUILD_COMMIT" { default = "local-dev" } + +target "server" { + dockerfile = "./docker/ik_llama-cpu.Containerfile" +} + +target "swap" { + dockerfile = "./docker/ik_llama-cpu.Containerfile" +} + +target "full" { + dockerfile = "./docker/ik_llama-cpu.Containerfile" +} diff --git a/docker/README.md b/docker/README.md index d6501d0154..0618e76a16 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,7 +4,7 @@ Built on top of [ikawrakow/ik_llama.cpp](https://github.com/ikawrakow/ik_llama.c All commands are provided for Podman and Docker. -CPU or CUDA sections under [Build](#Build) and [Run]($Run) are enough to get up and running. +CPU or CUDA sections under [Build](#Build) and [Run](#Run) are enough to get up and running. 
## Overview @@ -14,98 +14,100 @@ CPU or CUDA sections under [Build](#Build) and [Run]($Run) are enough to get up - [Extra Features](#Extra) - [Credits](#Credits) -# Build +## Build -Builds two image tags: +### Using docker-bake (Recommended) -- `swap`: Includes only `llama-swap` and `llama-server`. -- `full`: Includes `llama-server`, `llama-quantize`, and other utilities. +The project uses Docker Bake for building multiple targets efficiently. -Start: download the 4 files to a new directory (e.g. `~/ik_llama/`) then follow the next steps. +#### CPU Variant -``` -└── ik_llama - ├── ik_llama-cpu.Containerfile - ├── ik_llama-cpu-swap.config.yaml - ├── ik_llama-cuda.Containerfile - └── ik_llama-cuda-swap.config.yaml +```bash +docker buildx bake --builder ik-llama-builder full swap ``` -## CPU +Or with custom tags: -``` -podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full && podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap +```bash +REPO_OWNER=yourname docker buildx bake --builder ik-llama-builder \ + -f ./docker-bake.hcl \ + full swap ``` -``` -docker image build --file ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full . && docker image build --file ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap . 
-``` +#### CUDA Variant -## CUDA +First, set the CUDA version and GPU architecture in `ik_llama-cuda.Containerfile`: +- `CUDA_DOCKER_ARCH`: Your GPU's compute capability (e.g., `86` for RTX 30*, `89` for RTX 40*, `12.0` for RTX 50*) +- `CUDA_VERSION`: CUDA Toolkit version (e.g., `12.6.2`, `13.1.1`) -``` -podman image build --format Dockerfile --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full && podman image build --format Dockerfile --file ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap +```bash +VARIANT=cu12 docker buildx bake --builder ik-llama-builder full swap ``` -``` -docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap . -``` +### Build Targets + +Builds two image tags per variant: -# Run +- **`full`**: Includes `llama-server`, `llama-quantize`, and other utilities. +- **`swap`**: Includes only `llama-swap` and `llama-server`. -- Download `.gguf` model files to your favorite directory (e.g. `/my_local_files/gguf`). +### Local Development + +1. Clone the repository: `git clone https://github.com/ikawrakow/ik_llama.cpp` +2. Enter the repo: `cd ik_llama.cpp` +3. Use either docker-bake or build-local.sh as shown above. + +## Run + +- Download `.gguf` model files to your favorite directory (e.g., `/my_local_files/gguf`). - Map it to `/models` inside the container. - Open browser `http://localhost:9292` and enjoy the features. - API endpoints are available at `http://localhost:9292/v1` for use in other applications. 
-## CPU +### CPU -``` +```bash podman run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro localhost/ik_llama-cpu:swap ``` -``` -docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro ik_llama-cpu:swap +```bash +docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro localhost/ik_llama-cpu:swap ``` -## CUDA +### CUDA - Install Nvidia Drivers and CUDA on the host. - For Docker, install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - For Podman, install [CDI Container Device Interface](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) -- Identify for your GPU: - - [CUDA GPU Compute Capability](https://developer.nvidia.com/cuda/gpus) (e.g. `8.6` for `RTX30*0`, `8.9` for `RTX40*0`, `12.0` for `RTX50*0`) then change `CUDA_DOCKER_ARCH` in `ik_llama-cuda.Containerfile` to your GPU architecture (e.g. `CUDA_DOCKER_ARCH=86` for `RTX30*0`, `CUDA_DOCKER_ARCH=89` for `RTX40*0`, `CUDA_DOCKER_ARCH=120` for `RTX50*0`). If you have a mix of different GPUs add them like `CUDA_DOCKER_ARCH=86;89;120`). - - [CUDA Toolkit supported version](https://developer.nvidia.com/cuda-toolkit-archive) then adjust `CUDA_VERSION` in `ik_llama-cuda.Containerfile` to your GPU (e.g. `CUDA_VERSION=13.1` for `RTX50*0`). 
+- Identify your GPU: + - [CUDA GPU Compute Capability](https://developer.nvidia.com/cuda/gpus) (e.g., `8.6` for RTX30*, `8.9` for RTX40*, `12.0` for RTX50*) + - [CUDA Toolkit supported version](https://developer.nvidia.com/cuda-toolkit-archive) -``` +```bash podman run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro --device nvidia.com/gpu=all --security-opt=label=disable localhost/ik_llama-cuda:swap ``` -``` -docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro --runtime nvidia ik_llama-cuda:swap +```bash +docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro --runtime nvidia localhost/ik_llama-cuda:swap ``` -# Troubleshooting +## Troubleshooting - If CUDA is not available, use `ik_llama-cpu` instead. - If models are not found, ensure you mount the correct directory: `-v /my_local_files/gguf:/models:ro` - If you need to install `podman` or `docker` follow the [Podman Installation](https://podman.io/docs/installation) or [Install Docker Engine](https://docs.docker.com/engine/install) for your OS. -# Extra - -- `CUSTOM_COMMIT` can be used to build a specific `ik_llama.cpp` commit (e.g. `1ec12b8`). +## Extra -``` -podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target full --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cpu-1ec12b8:full && podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target swap --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cpu-1ec12b8:swap -``` +- **Custom commit**: Build a specific `ik_llama.cpp` commit by modifying the Containerfile or using build args. -``` -docker image build --file ik_llama-cuda.Containerfile --target full --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cuda-1ec12b8:full . && docker image build --file ik_llama-cuda.Containerfile --target swap --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cuda-1ec12b8:swap . 
+```bash +docker buildx bake --builder ik-llama-builder --set full.args.BUILD_COMMIT=1ec12b8 full ``` -- Using the tools in the `full` image: +- **Using the tools in the `full` image**: -``` +```bash $ podman run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --entrypoint bash localhost/ik_llama-cpu:full # ./llama-quantize ... # python3 gguf-py/scripts/gguf_dump.py ... @@ -113,28 +115,40 @@ $ podman run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro -- # ./llama-sweep-bench ... ``` -``` -docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --runtime nvidia --entrypoint bash ik_llama-cuda:full +```bash +docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --runtime nvidia --entrypoint bash localhost/ik_llama-cuda:full # ./llama-quantize ... # python3 gguf-py/scripts/gguf_dump.py ... # ./llama-perplexity ... # ./llama-sweep-bench ... ``` -- Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` localy (e.g. under `/my_local_files/`) then map it to `/app/config.yaml` inside the container appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your`podman run ...` or `docker run ...`. -- To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`. -- If you build the image on a diferent machine, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`. -- If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`. 
-- If you experiment with several `CUDA_VERSION`, remember to identify with `podman image ls` or `docker image ls` then delete (e.g.`podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` or `docker image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && docker image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` the unused images as they have several GB. -- If you want to build without `llama-swap`, change `--target swap` to `--target server` in `ik_llama Containerfiles`, e.g. `docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target server --tag ik_llama-cuda:server .` -- Look for premade quants (and imatrix files) that work well on most standard systems and are designed around ik_llama.cpp (with helpful metrics in the model card) from [ubergarm](https://huggingface.co/ubergarm/models). -- Usefull graphs and numbers on @magikRUKKOLA [Perplexity vs Size Graphs for the recent quants (GLM-4.7, Kimi-K2-Thinking, Deepseek-V3.1-Terminus, Deepseek-R1, Qwen3-Coder, Kimi-K2, Chimera etc.)](https://github.com/ikawrakow/ik_llama.cpp/discussions/715) topic. -- Build custom quants with [Thireus](https://github.com/Thireus/GGUF-Tool-Suite)'s tools. -- Download from [ik_llama.cpp's Thireus fork with release builds for macOS/Windows/Ubuntu CPU and Windows CUDA](https://github.com/Thireus/ik_llama.cpp) if you cannot build. -- For a KoboldCPP experience [Croco.Cpp is fork of KoboldCPP infering GGML/GGUF models on CPU/Cuda with KoboldAI's UI. It's powered partly by IK_LLama.cpp, and compatible with most of Ikawrakow's quants except Bitnet. 
](https://github.com/Nexesenex/croco.cpp) - -# Credits - -All credits to the awesome community: +- **Customize `llama-swap` config**: Save the `./docker/ik_llama-cpu-swap.config.yaml` or `./docker/ik_llama-cuda-swap.config.yaml` locally (e.g., under `/my_local_files/`) then map it to `/app/config.yaml` inside the container appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your `podman run ...` or `docker run ...`. + +- **Run in background**: Replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`. + +- **GGML_NATIVE**: If you build the image on a different machine, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`. + +- **KV quantization types**: To use more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`. + +- **Cleanup unused CUDA images**: If you experiment with several `CUDA_VERSION`, delete unused images (they are several GB): + ```bash + podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && \ + podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04 + ``` + +- **Build without `llama-swap`**: Change `--target swap` to `--target server` in docker-bake or Containerfiles. + +- **Pre-made quants**: Look for premade quants from [ubergarm](https://huggingface.co/ubergarm/models). + +- **GGUF tools**: Build custom quants with [Thireus](https://github.com/Thireus/GGUF-Tool-Suite)'s tools. + +- **Download prebuilt binaries**: Download from [ik_llama.cpp's Thireus fork with release builds for macOS/Windows/Ubuntu CPU and Windows CUDA](https://github.com/Thireus/ik_llama.cpp). + +- **KoboldCPP experience**: [Croco.Cpp is a fork of KoboldCPP inferring GGUF/GGML models on CPU/Cuda with KoboldAI's UI. 
It's powered partly by IK_LLama.cpp, and compatible with most of Ikawrakow's quants except Bitnet.](https://github.com/Nexesenex/croco.cpp) + +## Credits + +All credits to the awesome community: [llama-swap](https://github.com/mostlygeek/llama-swap) diff --git a/docker/ik_llama-cpu.Containerfile b/docker/ik_llama-cpu.Containerfile index 3aa23f3d81..87d90f1282 100644 --- a/docker/ik_llama-cpu.Containerfile +++ b/docker/ik_llama-cpu.Containerfile @@ -1,73 +1,94 @@ -ARG UBUNTU_VERSION=22.04 +ARG UBUNTU_VERSION=24.04 # Stage 1: Build FROM docker.io/ubuntu:$UBUNTU_VERSION AS build + +# Build arguments +ARG GGML_NATIVE=ON +ARG GGML_AVX2=ON +ARG USE_CCACHE=true + +# Environment variables for portability and GitHub Actions ENV LLAMA_CURL=1 ENV LC_ALL=C.utf8 -ARG CUSTOM_COMMIT -RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-dev curl libgomp1 cmake -RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app +# ccache configuration +ENV CCACHE_DIR=/ccache +ENV CCACHE_MAXSIZE=1G +ENV CCACHE_COMPRESS=1 +ENV CCACHE_COMPRESSLEVEL=6 +# This is CRITICAL for GitHub Actions: it ignores the absolute path of the runner +ENV CCACHE_BASEDIR=/app + +RUN apt-get update && \ + apt-get install -yq --no-install-recommends ca-certificates build-essential libcurl4-openssl-dev curl libgomp1 cmake ccache git && \ + rm -rf /var/lib/apt/lists/* + +# Copy source code (excluding hidden files/dirs via .dockerignore) +COPY . 
/app + WORKDIR /app -RUN if [ -n "$CUSTOM_COMMIT" ]; then git switch --detach "$CUSTOM_COMMIT"; fi -RUN cmake -B build -DGGML_NATIVE=ON -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \ - cmake --build build --config Release -j$(nproc) -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; -RUN mkdir -p /app/build/src && \ - find build -name "*.so" -exec cp {} /app/build/src \; -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh -# Stage 2: Base +# Build using ccache and optional custom commit +RUN --mount=type=cache,target=/ccache \ + --mount=type=bind,source=.git,target=.git \ + if [ "${USE_CCACHE}" = "true" ]; then \ + export PATH="/usr/lib/ccache:$PATH"; \ + ccache -z; \ + fi && \ + cmake -B build \ + -DGGML_NATIVE=${GGML_NATIVE} \ + -DLLAMA_CURL=ON && \ + cmake --build build --config Release -j$(nproc) && \ + if [ "${USE_CCACHE}" = "true" ]; then \ + ccache -s; \ + fi + +# Collect build artifacts +RUN mkdir -p /app/dist/lib /app/dist/full /app/dist/bin && \ + find build -name "*.so" -exec cp {} /app/dist/lib \; && \ + cp build/bin/* /app/dist/bin/ && \ + cp build/bin/* /app/dist/full/ && \ + cp *.py /app/dist/full/ && \ + cp -r gguf-py /app/dist/full/ && \ + cp -r requirements /app/dist/full/ && \ + cp requirements.txt /app/dist/full/ && \ + cp .devops/tools.sh /app/dist/full/ + +# Stage 2: Base (Shared Runtime) FROM docker.io/ubuntu:$UBUNTU_VERSION AS base -RUN apt-get update && apt-get install -yq libgomp1 curl \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete -COPY --from=build /app/lib/ /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends libgomp1 curl ca-certificates && \ + rm 
-rf /var/lib/apt/lists/* +WORKDIR /app +ENV LD_LIBRARY_PATH=/app/lib +COPY --from=build /app/dist/lib /app/lib -# Stage 3: Full +# Stage 3: Full (Python/Dev Tools) FROM base AS full -COPY --from=build /app/full /app -RUN mkdir -p /app/build/src -COPY --from=build /app/build/src /app/build/src -WORKDIR /app -RUN apt-get update && apt-get install -yq \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete -ENTRYPOINT ["/app/full/tools.sh"] +COPY --from=build /app/dist/full /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends git python3 python3-pip && \ + pip install --break-system-packages -r requirements.txt && \ + rm -rf /var/lib/apt/lists/* +ENTRYPOINT ["/app/tools.sh"] # Stage 4: Server FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app/llama-server -WORKDIR /app -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] +COPY --from=build /app/dist/bin/llama-server /app/llama-server +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080/health" ] ENTRYPOINT [ "/app/llama-server" ] # Stage 5: Swap FROM server AS swap ARG LS_REPO=mostlygeek/llama-swap -ARG LS_VER=198 -RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz" -COPY ./ik_llama-cpu-swap.config.yaml /app/config.yaml -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080"] -ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] +ARG LS_VER=199 +RUN curl -sSL 
"https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ + | tar -xz + +COPY --from=build /app/docker/ik_llama-cpu-swap.config.yaml /app/config.yaml +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080"] +ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] \ No newline at end of file diff --git a/docker/ik_llama-cuda.Containerfile b/docker/ik_llama-cuda.Containerfile index 0ac0423620..7a382a0548 100644 --- a/docker/ik_llama-cuda.Containerfile +++ b/docker/ik_llama-cuda.Containerfile @@ -5,72 +5,92 @@ ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu # Stage 1: Build FROM ${BASE_CUDA_DEV_CONTAINER} AS build -ARG CUDA_DOCKER_ARCH=86 # CUDA architecture to build for -RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-dev curl libgomp1 cmake -RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app +# Build arguments +ARG CUDA_DOCKER_ARCH="86;90" +ARG GGML_NATIVE=ON +ARG USE_CCACHE=true + +# Environment variables for portability and GitHub Actions +ENV CCACHE_DIR=/ccache +ENV CCACHE_UMASK=000 +ENV CCACHE_MAXSIZE=5G +ENV CCACHE_COMPRESS=1 +ENV CCACHE_BASEDIR=/app + +RUN apt-get update && \ + apt-get install -yq --no-install-recommends \ + ca-certificates build-essential libcurl4-openssl-dev curl libgomp1 cmake ccache git && \ + rm -rf /var/lib/apt/lists/* + +# Copy non-hidden files first +COPY . 
/app + WORKDIR /app -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + +# Build using ccache and optional custom commit +RUN --mount=type=cache,target=/ccache \ + --mount=type=bind,source=.git,target=.git \ + if [ "${USE_CCACHE}" = "true" ]; then \ + export PATH="/usr/lib/ccache:$PATH"; \ + ccache -z; \ fi && \ - cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release -j$(nproc) -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; -RUN mkdir -p /app/build/src && \ - find build -name "*.so" -exec cp {} /app/build/src \; -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh + cmake -B build \ + -DGGML_NATIVE=${GGML_NATIVE} \ + -DGGML_CUDA=ON \ + -DCMAKE_CUDA_ARCHITECTURES="${CUDA_DOCKER_ARCH}" \ + -DLLAMA_CURL=ON \ + -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined && \ + cmake --build build --config Release -j$(nproc) && \ + if [ "${USE_CCACHE}" = "true" ]; then \ + ccache -s; \ + fi -# Stage 2: base +# Collect build artifacts +RUN mkdir -p /app/dist/lib /app/dist/full /app/dist/bin && \ + find build -name "*.so" -exec cp {} /app/dist/lib \; && \ + cp build/bin/* /app/dist/bin/ && \ + cp build/bin/* /app/dist/full/ && \ + cp *.py /app/dist/full/ && \ + cp -r gguf-py /app/dist/full/ && \ + cp -r requirements /app/dist/full/ && \ + cp requirements.txt /app/dist/full/ && \ + cp .devops/tools.sh /app/dist/full/ + +# Stage 2: Base (Shared Runtime) FROM ${BASE_CUDA_RUN_CONTAINER} AS base -RUN apt-get update && apt-get install -yq libgomp1 curl \ - && update-ca-certificates \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find 
/var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete -COPY --from=build /app/lib/ /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends libgomp1 curl ca-certificates && \ + rm -rf /var/lib/apt/lists/* +WORKDIR /app +ENV LD_LIBRARY_PATH=/app/lib +COPY --from=build /app/dist/lib /app/lib -# Stage 3: full +# Stage 3: Full (Python/Dev Tools) FROM base AS full -COPY --from=build /app/full /app -RUN mkdir -p /app/build/src -COPY --from=build /app/build/src /app/build/src -WORKDIR /app -RUN apt-get update && apt-get install -yq \ - git \ - python3 \ - python3-pip \ - && pip3 install --break-system-packages -r requirements.txt \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete +COPY --from=build /app/dist/full /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends git python3 python3-pip && \ + pip install --break-system-packages -r requirements.txt && \ + rm -rf /var/lib/apt/lists/* ENTRYPOINT ["/app/tools.sh"] # Stage 4: Server FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app/llama-server -WORKDIR /app -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] +COPY --from=build /app/dist/bin/llama-server /app/llama-server +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080/health" ] ENTRYPOINT [ "/app/llama-server" ] # Stage 5: Swap FROM server AS swap ARG LS_REPO=mostlygeek/llama-swap -ARG LS_VER=198 -RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz" -COPY ./ik_llama-cuda-swap.config.yaml /app/config.yaml -HEALTHCHECK CMD 
[ "curl", "-f", "http://localhost:8080"] -ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] +ARG LS_VER=199 +RUN curl -sSL "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ + | tar -xz + +COPY --from=build /app/docker/ik_llama-cuda-swap.config.yaml /app/config.yaml +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080"] +ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] \ No newline at end of file