diff --git a/.dockerignore b/.dockerignore index 8916e2a660..9f507cadac 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,13 +1,15 @@ *.o *.a +*.md .cache/ -.git/ + +# Ensure .git is NOT ignored so it can be mounted/copied +!.git .github/ .gitignore .vs/ .vscode/ .DS_Store - build*/ models/* @@ -18,3 +20,5 @@ models/* arm_neon.h compile_commands.json Dockerfile + +**/*.md \ No newline at end of file diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml new file mode 100644 index 0000000000..295063c9fd --- /dev/null +++ b/.github/workflows/build-container.yml @@ -0,0 +1,118 @@ +name: Build and Push Docker Image + +on: + push: + branches: + - main + +permissions: + contents: read + packages: write + actions: read + +jobs: + build-and-push: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - variant: "cu12" + cuda_version: "12.6.2" + containerfile: "ik_llama-cuda.Containerfile" + - variant: "cu13" + cuda_version: "13.1.1" + containerfile: "ik_llama-cuda.Containerfile" + - variant: "cpu" + cuda_version: "none" + containerfile: "ik_llama-cpu.Containerfile" + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history required: 'git rev-list --count HEAD' is used for BUILD_NUMBER and undercounts on a shallow clone + + - name: Free Disk Space (Ubuntu) + run: | + echo "Listing initial disk usage..." + df -h + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf /usr/lib/jvm + sudo docker image prune -af + echo "Listing disk usage after cleanup..." 
+ df -h + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v4 + + - name: Log in to GHCR + uses: docker/login-action@v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Prepare Environment + id: prep + run: | + echo "BUILD_NUMBER=$(git rev-list --count HEAD)" >> $GITHUB_ENV + echo "LLAMA_COMMIT=$(git rev-parse --short HEAD)" >> $GITHUB_ENV + echo "REPO_LOWER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV + + # 5.1 Restore the cache from GitHub's storage to a host folder + - name: Cache ccache + uses: actions/cache@v4 + with: + path: .buildkit-cache + key: ccache-${{ matrix.variant }}-${{ github.run_id }} + restore-keys: | + ccache-${{ matrix.variant }}- + + # 5.2. "Inject" that host folder into BuildKit's internal mount system + - name: Inject ccache into BuildKit + uses: reproducible-containers/buildkit-cache-dance@v3 + with: + cache-map: | + { + ".buildkit-cache": "/ccache" + } + skip-extraction: ${{ github.event_name == 'pull_request' }} + + # 5.3 Build and push using the cache + - name: Build and Push + uses: docker/bake-action@v7 + env: + REPO_OWNER: ${{ env.REPO_LOWER }} + VARIANT: ${{ matrix.variant }} + BUILD_NUMBER: ${{ env.BUILD_NUMBER }} + LLAMA_COMMIT: ${{ env.LLAMA_COMMIT }} + CUDA_VERSION: ${{ matrix.cuda_version }} + GGML_NATIVE: "OFF" # Force OFF for CI portability + USE_CCACHE: "true" + with: + push: true + files: ./docker-bake.hcl + set: | + *.context=. + *.dockerfile=./docker/${{ matrix.containerfile }} + *.cache-from=type=gha,scope=ccache-${{ matrix.variant }} + *.cache-to=type=gha,mode=max,scope=ccache-${{ matrix.variant }} + source: . 
+ + cleanup: + runs-on: ubuntu-latest + needs: build-and-push + if: success() + steps: + - name: Delete untagged images + uses: vlaurin/action-ghcr-prune@v0.6.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + organization: ${{ github.repository_owner }} + container: ik-llama-cpp + keep-younger-than: 0 + untagged-only: true \ No newline at end of file diff --git a/docker-bake.hcl b/docker-bake.hcl new file mode 100644 index 0000000000..568503663e --- /dev/null +++ b/docker-bake.hcl @@ -0,0 +1,58 @@ +variable "REPO_OWNER" { default = "local" } +variable "VARIANT" { default = "cpu" } +variable "BUILD_NUMBER" { default = "0" } +variable "LLAMA_COMMIT" { default = "local-dev" } +variable "CUDA_VERSION" {} +variable "CUDA_DOCKER_ARCH" { default = "86;90" } +variable "USE_CCACHE" { default = "true" } +variable "GGML_NATIVE" { default = "ON" } + +# Common cache configuration for GitHub Actions +target "cache_settings" { + cache-from = ["type=gha,scope=ccache-${VARIANT}"] + cache-to = ["type=gha,mode=max,scope=ccache-${VARIANT}"] +} + +group "default" { + targets = ["server", "full", "swap"] +} + +target "settings" { + context = "." 
+ inherits = ["cache_settings"] + args = { + BUILD_NUMBER = "${BUILD_NUMBER}" + BUILD_COMMIT = "${LLAMA_COMMIT}" + CUDA_VERSION = "${CUDA_VERSION}" + CUDA_DOCKER_ARCH = "${CUDA_DOCKER_ARCH}" + GGML_NATIVE = "${GGML_NATIVE}" + USE_CCACHE = "${USE_CCACHE}" + } +} + +target "server" { + inherits = ["settings"] + target = "server" + tags = [ + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-server-${BUILD_NUMBER}", + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-server" + ] +} + +target "full" { + inherits = ["settings"] + target = "full" + tags = [ + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-full-${BUILD_NUMBER}-${LLAMA_COMMIT}", + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-full" + ] +} + +target "swap" { + inherits = ["settings"] + target = "swap" + tags = [ + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-swap-${BUILD_NUMBER}-${LLAMA_COMMIT}", + "ghcr.io/${REPO_OWNER}/ik-llama-cpp:${VARIANT}-swap" + ] +} \ No newline at end of file diff --git a/docker-bake.override.hcl b/docker-bake.override.hcl new file mode 100644 index 0000000000..cf10286995 --- /dev/null +++ b/docker-bake.override.hcl @@ -0,0 +1,15 @@ +# Local development override - automatically sets BUILD_NUMBER and BUILD_COMMIT +variable "BUILD_NUMBER" { default = "0" } +variable "BUILD_COMMIT" { default = "local-dev" } + +target "server" { + dockerfile = "./docker/ik_llama-cpu.Containerfile" +} + +target "swap" { + dockerfile = "./docker/ik_llama-cpu.Containerfile" +} + +target "full" { + dockerfile = "./docker/ik_llama-cpu.Containerfile" +} diff --git a/docker/README.md b/docker/README.md index d6501d0154..0618e76a16 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,7 +4,7 @@ Built on top of [ikawrakow/ik_llama.cpp](https://github.com/ikawrakow/ik_llama.c All commands are provided for Podman and Docker. -CPU or CUDA sections under [Build](#Build) and [Run]($Run) are enough to get up and running. +CPU or CUDA sections under [Build](#Build) and [Run](#Run) are enough to get up and running. 
## Overview @@ -14,98 +14,100 @@ CPU or CUDA sections under [Build](#Build) and [Run]($Run) are enough to get up - [Extra Features](#Extra) - [Credits](#Credits) -# Build +## Build -Builds two image tags: +### Using docker-bake (Recommended) -- `swap`: Includes only `llama-swap` and `llama-server`. -- `full`: Includes `llama-server`, `llama-quantize`, and other utilities. +The project uses Docker Bake for building multiple targets efficiently. -Start: download the 4 files to a new directory (e.g. `~/ik_llama/`) then follow the next steps. +#### CPU Variant -``` -└── ik_llama - ├── ik_llama-cpu.Containerfile - ├── ik_llama-cpu-swap.config.yaml - ├── ik_llama-cuda.Containerfile - └── ik_llama-cuda-swap.config.yaml +```bash +docker buildx bake --builder ik-llama-builder full swap ``` -## CPU +Or with custom tags: -``` -podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full && podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap +```bash +REPO_OWNER=yourname docker buildx bake --builder ik-llama-builder \ + -f ./docker-bake.hcl \ + full swap ``` -``` -docker image build --file ik_llama-cpu.Containerfile --target full --tag ik_llama-cpu:full . && docker image build --file ik_llama-cpu.Containerfile --target swap --tag ik_llama-cpu:swap . 
-``` +#### CUDA Variant -## CUDA +First, set the CUDA version and GPU architecture in `ik_llama-cuda.Containerfile`: +- `CUDA_DOCKER_ARCH`: Your GPU's compute capability (e.g., `86` for RTX 30*, `89` for RTX 40*, `12.0` for RTX 50*) +- `CUDA_VERSION`: CUDA Toolkit version (e.g., `12.6.2`, `13.1.1`) -``` -podman image build --format Dockerfile --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full && podman image build --format Dockerfile --file ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap +```bash +VARIANT=cu12 docker buildx bake --builder ik-llama-builder full swap ``` -``` -docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target swap --tag ik_llama-cuda:swap . -``` +### Build Targets + +Builds two image tags per variant: -# Run +- **`full`**: Includes `llama-server`, `llama-quantize`, and other utilities. +- **`swap`**: Includes only `llama-swap` and `llama-server`. -- Download `.gguf` model files to your favorite directory (e.g. `/my_local_files/gguf`). +### Local Development + +1. Clone the repository: `git clone https://github.com/ikawrakow/ik_llama.cpp` +2. Enter the repo: `cd ik_llama.cpp` +3. Use either docker-bake or build-local.sh as shown above. + +## Run + +- Download `.gguf` model files to your favorite directory (e.g., `/my_local_files/gguf`). - Map it to `/models` inside the container. - Open browser `http://localhost:9292` and enjoy the features. - API endpoints are available at `http://localhost:9292/v1` for use in other applications. 
-## CPU +### CPU -``` +```bash podman run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro localhost/ik_llama-cpu:swap ``` -``` -docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro ik_llama-cpu:swap +```bash +docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro localhost/ik_llama-cpu:swap ``` -## CUDA +### CUDA - Install Nvidia Drivers and CUDA on the host. - For Docker, install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) - For Podman, install [CDI Container Device Interface](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) -- Identify for your GPU: - - [CUDA GPU Compute Capability](https://developer.nvidia.com/cuda/gpus) (e.g. `8.6` for `RTX30*0`, `8.9` for `RTX40*0`, `12.0` for `RTX50*0`) then change `CUDA_DOCKER_ARCH` in `ik_llama-cuda.Containerfile` to your GPU architecture (e.g. `CUDA_DOCKER_ARCH=86` for `RTX30*0`, `CUDA_DOCKER_ARCH=89` for `RTX40*0`, `CUDA_DOCKER_ARCH=120` for `RTX50*0`). If you have a mix of different GPUs add them like `CUDA_DOCKER_ARCH=86;89;120`). - - [CUDA Toolkit supported version](https://developer.nvidia.com/cuda-toolkit-archive) then adjust `CUDA_VERSION` in `ik_llama-cuda.Containerfile` to your GPU (e.g. `CUDA_VERSION=13.1` for `RTX50*0`). 
+- Identify your GPU: + - [CUDA GPU Compute Capability](https://developer.nvidia.com/cuda/gpus) (e.g., `8.6` for RTX30*, `8.9` for RTX40*, `12.0` for RTX50*) + - [CUDA Toolkit supported version](https://developer.nvidia.com/cuda-toolkit-archive) -``` +```bash podman run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro --device nvidia.com/gpu=all --security-opt=label=disable localhost/ik_llama-cuda:swap ``` -``` -docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro --runtime nvidia ik_llama-cuda:swap +```bash +docker run -it --name ik_llama --rm -p 9292:8080 -v /my_local_files/gguf:/models:ro --runtime nvidia localhost/ik_llama-cuda:swap ``` -# Troubleshooting +## Troubleshooting - If CUDA is not available, use `ik_llama-cpu` instead. - If models are not found, ensure you mount the correct directory: `-v /my_local_files/gguf:/models:ro` - If you need to install `podman` or `docker` follow the [Podman Installation](https://podman.io/docs/installation) or [Install Docker Engine](https://docs.docker.com/engine/install) for your OS. -# Extra - -- `CUSTOM_COMMIT` can be used to build a specific `ik_llama.cpp` commit (e.g. `1ec12b8`). +## Extra -``` -podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target full --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cpu-1ec12b8:full && podman image build --format Dockerfile --file ik_llama-cpu.Containerfile --target swap --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cpu-1ec12b8:swap -``` +- **Custom commit**: Build a specific `ik_llama.cpp` commit by modifying the Containerfile or using build args. -``` -docker image build --file ik_llama-cuda.Containerfile --target full --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cuda-1ec12b8:full . && docker image build --file ik_llama-cuda.Containerfile --target swap --build-arg CUSTOM_COMMIT="1ec12b8" --tag ik_llama-cuda-1ec12b8:swap . 
+```bash +docker buildx bake --builder ik-llama-builder --set full.args.BUILD_COMMIT=1ec12b8 full ``` -- Using the tools in the `full` image: +- **Using the tools in the `full` image**: -``` +```bash $ podman run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --entrypoint bash localhost/ik_llama-cpu:full # ./llama-quantize ... # python3 gguf-py/scripts/gguf_dump.py ... @@ -113,28 +115,40 @@ $ podman run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro -- # ./llama-sweep-bench ... ``` -``` -docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --runtime nvidia --entrypoint bash ik_llama-cuda:full +```bash +docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --runtime nvidia --entrypoint bash localhost/ik_llama-cuda:full # ./llama-quantize ... # python3 gguf-py/scripts/gguf_dump.py ... # ./llama-perplexity ... # ./llama-sweep-bench ... ``` -- Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` localy (e.g. under `/my_local_files/`) then map it to `/app/config.yaml` inside the container appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your`podman run ...` or `docker run ...`. -- To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`. -- If you build the image on a diferent machine, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`. -- If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`. 
-- If you experiment with several `CUDA_VERSION`, remember to identify with `podman image ls` or `docker image ls` then delete (e.g.`podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` or `docker image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && docker image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` the unused images as they have several GB. -- If you want to build without `llama-swap`, change `--target swap` to `--target server` in `ik_llama Containerfiles`, e.g. `docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target server --tag ik_llama-cuda:server .` -- Look for premade quants (and imatrix files) that work well on most standard systems and are designed around ik_llama.cpp (with helpful metrics in the model card) from [ubergarm](https://huggingface.co/ubergarm/models). -- Usefull graphs and numbers on @magikRUKKOLA [Perplexity vs Size Graphs for the recent quants (GLM-4.7, Kimi-K2-Thinking, Deepseek-V3.1-Terminus, Deepseek-R1, Qwen3-Coder, Kimi-K2, Chimera etc.)](https://github.com/ikawrakow/ik_llama.cpp/discussions/715) topic. -- Build custom quants with [Thireus](https://github.com/Thireus/GGUF-Tool-Suite)'s tools. -- Download from [ik_llama.cpp's Thireus fork with release builds for macOS/Windows/Ubuntu CPU and Windows CUDA](https://github.com/Thireus/ik_llama.cpp) if you cannot build. -- For a KoboldCPP experience [Croco.Cpp is fork of KoboldCPP infering GGML/GGUF models on CPU/Cuda with KoboldAI's UI. It's powered partly by IK_LLama.cpp, and compatible with most of Ikawrakow's quants except Bitnet. 
](https://github.com/Nexesenex/croco.cpp) - -# Credits - -All credits to the awesome community: +- **Customize `llama-swap` config**: Save the `./docker/ik_llama-cpu-swap.config.yaml` or `./docker/ik_llama-cuda-swap.config.yaml` locally (e.g., under `/my_local_files/`) then map it to `/app/config.yaml` inside the container appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your `podman run ...` or `docker run ...`. + +- **Run in background**: Replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`. + +- **GGML_NATIVE**: If you build the image on a different machine, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`. + +- **KV quantization types**: To use more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`. + +- **Cleanup unused CUDA images**: If you experiment with several `CUDA_VERSION`, delete unused images (they are several GB): + ```bash + podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && \ + podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04 + ``` + +- **Build without `llama-swap`**: Change `--target swap` to `--target server` in docker-bake or Containerfiles. + +- **Pre-made quants**: Look for premade quants from [ubergarm](https://huggingface.co/ubergarm/models). + +- **GGUF tools**: Build custom quants with [Thireus](https://github.com/Thireus/GGUF-Tool-Suite)'s tools. + +- **Download prebuilt binaries**: Download from [ik_llama.cpp's Thireus fork with release builds for macOS/Windows/Ubuntu CPU and Windows CUDA](https://github.com/Thireus/ik_llama.cpp). + +- **KoboldCPP experience**: [Croco.Cpp is a fork of KoboldCPP inferring GGUF/GGML models on CPU/Cuda with KoboldAI's UI. 
It's powered partly by IK_LLama.cpp, and compatible with most of Ikawrakow's quants except Bitnet.](https://github.com/Nexesenex/croco.cpp) + +## Credits + +All credits to the awesome community: [llama-swap](https://github.com/mostlygeek/llama-swap) diff --git a/docker/ik_llama-cpu.Containerfile b/docker/ik_llama-cpu.Containerfile index 3aa23f3d81..87d90f1282 100644 --- a/docker/ik_llama-cpu.Containerfile +++ b/docker/ik_llama-cpu.Containerfile @@ -1,73 +1,94 @@ -ARG UBUNTU_VERSION=22.04 +ARG UBUNTU_VERSION=24.04 # Stage 1: Build FROM docker.io/ubuntu:$UBUNTU_VERSION AS build + +# Build arguments +ARG GGML_NATIVE=ON +ARG GGML_AVX2=ON +ARG USE_CCACHE=true + +# Environment variables for portability and GitHub Actions ENV LLAMA_CURL=1 ENV LC_ALL=C.utf8 -ARG CUSTOM_COMMIT -RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-dev curl libgomp1 cmake -RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app +# ccache configuration +ENV CCACHE_DIR=/ccache +ENV CCACHE_MAXSIZE=1G +ENV CCACHE_COMPRESS=1 +ENV CCACHE_COMPRESSLEVEL=6 +# This is CRITICAL for GitHub Actions: it ignores the absolute path of the runner +ENV CCACHE_BASEDIR=/app + +RUN apt-get update && \ + apt-get install -yq --no-install-recommends ca-certificates build-essential libcurl4-openssl-dev curl libgomp1 cmake ccache git && \ + rm -rf /var/lib/apt/lists/* + +# Copy source code (excluding hidden files/dirs via .dockerignore) +COPY . 
/app + WORKDIR /app -RUN if [ -n "$CUSTOM_COMMIT" ]; then git switch --detach "$CUSTOM_COMMIT"; fi -RUN cmake -B build -DGGML_NATIVE=ON -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \ - cmake --build build --config Release -j$(nproc) -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; -RUN mkdir -p /app/build/src && \ - find build -name "*.so" -exec cp {} /app/build/src \; -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh -# Stage 2: Base +# Build using ccache and optional custom commit +RUN --mount=type=cache,target=/ccache \ + --mount=type=bind,source=.git,target=.git \ + if [ "${USE_CCACHE}" = "true" ]; then \ + export PATH="/usr/lib/ccache:$PATH"; \ + ccache -z; \ + fi && \ + cmake -B build \ + -DGGML_NATIVE=${GGML_NATIVE} \ + -DLLAMA_CURL=ON && \ + cmake --build build --config Release -j$(nproc) && \ + if [ "${USE_CCACHE}" = "true" ]; then \ + ccache -s; \ + fi + +# Collect build artifacts +RUN mkdir -p /app/dist/lib /app/dist/full /app/dist/bin && \ + find build -name "*.so" -exec cp {} /app/dist/lib \; && \ + cp build/bin/* /app/dist/bin/ && \ + cp build/bin/* /app/dist/full/ && \ + cp *.py /app/dist/full/ && \ + cp -r gguf-py /app/dist/full/ && \ + cp -r requirements /app/dist/full/ && \ + cp requirements.txt /app/dist/full/ && \ + cp .devops/tools.sh /app/dist/full/ + +# Stage 2: Base (Shared Runtime) FROM docker.io/ubuntu:$UBUNTU_VERSION AS base -RUN apt-get update && apt-get install -yq libgomp1 curl \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete -COPY --from=build /app/lib/ /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends libgomp1 curl ca-certificates && \ + rm 
-rf /var/lib/apt/lists/* +WORKDIR /app +ENV LD_LIBRARY_PATH=/app/lib +COPY --from=build /app/dist/lib /app/lib -# Stage 3: Full +# Stage 3: Full (Python/Dev Tools) FROM base AS full -COPY --from=build /app/full /app -RUN mkdir -p /app/build/src -COPY --from=build /app/build/src /app/build/src -WORKDIR /app -RUN apt-get update && apt-get install -yq \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete -ENTRYPOINT ["/app/full/tools.sh"] +COPY --from=build /app/dist/full /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends git python3 python3-pip && \ + pip install --break-system-packages -r requirements.txt && \ + rm -rf /var/lib/apt/lists/* +ENTRYPOINT ["/app/tools.sh"] # Stage 4: Server FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app/llama-server -WORKDIR /app -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] +COPY --from=build /app/dist/bin/llama-server /app/llama-server +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080/health" ] ENTRYPOINT [ "/app/llama-server" ] # Stage 5: Swap FROM server AS swap ARG LS_REPO=mostlygeek/llama-swap -ARG LS_VER=198 -RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz" -COPY ./ik_llama-cpu-swap.config.yaml /app/config.yaml -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080"] -ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] +ARG LS_VER=199 +RUN curl -sSL 
"https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ + | tar -xz + +COPY --from=build /app/docker/ik_llama-cpu-swap.config.yaml /app/config.yaml +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080"] +ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] \ No newline at end of file diff --git a/docker/ik_llama-cuda.Containerfile b/docker/ik_llama-cuda.Containerfile index 0ac0423620..7a382a0548 100644 --- a/docker/ik_llama-cuda.Containerfile +++ b/docker/ik_llama-cuda.Containerfile @@ -5,72 +5,92 @@ ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu # Stage 1: Build FROM ${BASE_CUDA_DEV_CONTAINER} AS build -ARG CUDA_DOCKER_ARCH=86 # CUDA architecture to build for -RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-dev curl libgomp1 cmake -RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app +# Build arguments +ARG CUDA_DOCKER_ARCH="86;90" +ARG GGML_NATIVE=ON +ARG USE_CCACHE=true + +# Environment variables for portability and GitHub Actions +ENV CCACHE_DIR=/ccache +ENV CCACHE_UMASK=000 +ENV CCACHE_MAXSIZE=5G +ENV CCACHE_COMPRESS=1 +ENV CCACHE_BASEDIR=/app + +RUN apt-get update && \ + apt-get install -yq --no-install-recommends \ + ca-certificates build-essential libcurl4-openssl-dev curl libgomp1 cmake ccache git && \ + rm -rf /var/lib/apt/lists/* + +# Copy non-hidden files first +COPY . 
/app + WORKDIR /app -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + +# Build using ccache and optional custom commit +RUN --mount=type=cache,target=/ccache \ + --mount=type=bind,source=.git,target=.git \ + if [ "${USE_CCACHE}" = "true" ]; then \ + export PATH="/usr/lib/ccache:$PATH"; \ + ccache -z; \ fi && \ - cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release -j$(nproc) -RUN mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; -RUN mkdir -p /app/build/src && \ - find build -name "*.so" -exec cp {} /app/build/src \; -RUN mkdir -p /app/full \ - && cp build/bin/* /app/full \ - && cp *.py /app/full \ - && cp -r gguf-py /app/full \ - && cp -r requirements /app/full \ - && cp requirements.txt /app/full \ - && cp .devops/tools.sh /app/full/tools.sh + cmake -B build \ + -DGGML_NATIVE=${GGML_NATIVE} \ + -DGGML_CUDA=ON \ + -DCMAKE_CUDA_ARCHITECTURES="${CUDA_DOCKER_ARCH}" \ + -DLLAMA_CURL=ON \ + -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined && \ + cmake --build build --config Release -j$(nproc) && \ + if [ "${USE_CCACHE}" = "true" ]; then \ + ccache -s; \ + fi -# Stage 2: base +# Collect build artifacts +RUN mkdir -p /app/dist/lib /app/dist/full /app/dist/bin && \ + find build -name "*.so" -exec cp {} /app/dist/lib \; && \ + cp build/bin/* /app/dist/bin/ && \ + cp build/bin/* /app/dist/full/ && \ + cp *.py /app/dist/full/ && \ + cp -r gguf-py /app/dist/full/ && \ + cp -r requirements /app/dist/full/ && \ + cp requirements.txt /app/dist/full/ && \ + cp .devops/tools.sh /app/dist/full/ + +# Stage 2: Base (Shared Runtime) FROM ${BASE_CUDA_RUN_CONTAINER} AS base -RUN apt-get update && apt-get install -yq libgomp1 curl \ - && update-ca-certificates \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find 
/var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete -COPY --from=build /app/lib/ /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends libgomp1 curl ca-certificates && \ + rm -rf /var/lib/apt/lists/* +WORKDIR /app +ENV LD_LIBRARY_PATH=/app/lib +COPY --from=build /app/dist/lib /app/lib -# Stage 3: full +# Stage 3: Full (Python/Dev Tools) FROM base AS full -COPY --from=build /app/full /app -RUN mkdir -p /app/build/src -COPY --from=build /app/build/src /app/build/src -WORKDIR /app -RUN apt-get update && apt-get install -yq \ - git \ - python3 \ - python3-pip \ - && pip3 install --break-system-packages -r requirements.txt \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete +COPY --from=build /app/dist/full /app +RUN apt-get update && \ + apt-get install -yq --no-install-recommends git python3 python3-pip && \ + pip install --break-system-packages -r requirements.txt && \ + rm -rf /var/lib/apt/lists/* ENTRYPOINT ["/app/tools.sh"] # Stage 4: Server FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app/llama-server -WORKDIR /app -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] +COPY --from=build /app/dist/bin/llama-server /app/llama-server +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080/health" ] ENTRYPOINT [ "/app/llama-server" ] # Stage 5: Swap FROM server AS swap ARG LS_REPO=mostlygeek/llama-swap -ARG LS_VER=198 -RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \ - && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz" -COPY ./ik_llama-cuda-swap.config.yaml /app/config.yaml -HEALTHCHECK CMD 
[ "curl", "-f", "http://localhost:8080"] -ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] +ARG LS_VER=199 +RUN curl -sSL "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ + | tar -xz + +COPY --from=build /app/docker/ik_llama-cuda-swap.config.yaml /app/config.yaml +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD [ "curl", "-f", "http://localhost:8080"] +ENTRYPOINT [ "/app/llama-swap", "-config", "/app/config.yaml" ] \ No newline at end of file