
Commit f5943eb

Update documents for supporting other platforms
1 parent c753840 commit f5943eb

File tree: 9 files changed (+226 −149 lines)


.github/workflows/build-publish.yml

Lines changed: 2 additions & 2 deletions
```diff
@@ -37,7 +37,7 @@ jobs:
           IMAGE_ID=ghcr.io/${{ github.repository_owner }}/homl-vllm-cpu-base:latest
           docker buildx build \
             -t ${IMAGE_ID} \
-            -f server/Dockerfile.cpu \
+            -f server/Dockerfile.cpu.base \
             ./vllm-src \
             --push
 
@@ -67,7 +67,7 @@ jobs:
           docker buildx build \
             --build-arg HOML_SERVER_VERSION=$VERSION \
             -t ghcr.io/${{ github.repository_owner }}/homl/server:latest-cpu \
-            -f Dockerfile.cpu.app \
+            -f Dockerfile.cpu \
             . \
             --push
 
```
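After this rename, `Dockerfile.cpu.base` builds the vLLM CPU base image (published as `ghcr.io/.../homl-vllm-cpu-base`) and `Dockerfile.cpu` layers the HoML server on top of it. A minimal local sketch of the second stage, assuming the published base image is pullable; the `dev-cpu` tag is illustrative, not one the workflow produces:

```shell
# Build only the server layer; Dockerfile.cpu pulls the published base image.
cd server
docker buildx build \
  --build-arg HOML_SERVER_VERSION=dev \
  -t homl/server:dev-cpu \
  -f Dockerfile.cpu \
  . \
  --load
```

Using `--load` instead of the workflow's `--push` keeps the image in the local Docker daemon for testing.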

README.md

Lines changed: 22 additions & 6 deletions
```diff
@@ -52,13 +52,14 @@ For detailed information on how to use the HoML CLI, please refer to our officia
 
 [**HoML Documentation**](https://homl.dev/docs/cli.html)
 
+
 ## TODO / Roadmap
-* [v] Improve vLLM startup time to support faster switching between models.
-* MultiGPU support: Enable multiple models running at the same time on different GPUs.
-* Enable multiple models running at the same time on the same GPU, this means we need to be able to estimate the vRAM usage of each model and manage the memory accordingly.
-* Add support for ROCm, Apple Silicon, and other architectures.
-* Add support for loading adapter layers.
-* Add support for endpoints other than chat/completion, such as embeddings and text generation.
+- [x] Improve vLLM startup time to support faster switching between models.
+- [ ] MultiGPU support: Enable multiple models running at the same time on different GPUs.
+- [ ] Enable multiple models running at the same time on the same GPU; this means we need to be able to estimate the vRAM usage of each model and manage the memory accordingly.
+- [ ] Add support for ROCm, Apple Silicon, and other architectures.
+- [ ] Add support for loading adapter layers.
+- [ ] Add support for endpoints other than chat/completion, such as embeddings and text generation.
 
 ## Contributing
 
@@ -69,6 +70,21 @@ We are particularly looking for help with:
 * Testing and verifying models for the curated list.
 * Improving the CLI experience.
 
+## Contribute / Build from Source
+Currently only the CUDA version is officially supported, but other platforms that vLLM can run on are possible if you build from source.
+
+### CLI
+See [cli/README.md](cli/README.md)
+
+### Server
+See [server/README.md](server/README.md)
+
+### If you want to add support for a new platform
+
+1. [Create the server for the new platform](server/README.md#other-platforms)
+2. [Update the CLI](cli/README.md#adding-support-for-other-platforms)
+3. Follow the guide there to start the new server.
+
 ## Community
 
 Join our community to stay updated, ask questions, and contribute to the project:
```

cli/README.md

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@ (new file; content shown without `+` markers)

# BUILD CLI from source

Build the CLI from source by following these steps:

1. Clone the repository:

   ```bash
   git clone https://github.com/homl-dev/homl.git
   cd homl/cli
   ```
2. Create a venv:

   ```bash
   python -m venv venv
   source venv/bin/activate
   ```
3. Run the build command:

   ```bash
   bash build.sh
   ```
4. The CLI binary will be at `dist/homl`.

# Adding support for other platforms

1. Modify the following functions inside [install_utils](homl_cli/utils/install_utils.py) to add support for other platforms:
   1. `detect_platform`: detect the platform correctly.
   2. `get_platform_config`: return the correct image, and add the correct hardware resource assignments for Docker.
   3. `install`: add platform-specific installation steps.
2. When running with a locally built image, use the `HOML_DOCKER_IMAGE_OVERRIDE` environment variable to specify the image when running the `homl server install` command.
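The override in step 2 can be sketched as follows; the image tag is illustrative, use whatever tag you gave your local build:

```shell
# Point the installer at a locally built server image instead of the
# published ghcr.io image, then install as usual.
export HOML_DOCKER_IMAGE_OVERRIDE=homl/server:dev-myplatform
homl server install
```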

cli/homl_cli/utils/install_utils.py

Lines changed: 9 additions & 5 deletions
```diff
@@ -85,11 +85,9 @@ def check_and_install_docker():
 
 
 def get_platform_config(accelerator: str, gptoss: bool) -> Dict[str, Any]:
-    """Returns the docker image and other config for a given platform."""
-    # In the future, these images would be hosted on a public registry.
-    # For now, they are conceptual names.
+    """Returns the docker image and other config for a given platform."""
     if accelerator == "cuda":
-        return {
+        cfg = {
             "image": "ghcr.io/wsmlby/homl/server:latest-cuda" if not gptoss else "ghcr.io/wsmlby/homl/server:latest-cuda-gptoss",
             "deploy_resources": """
       resources:
@@ -102,10 +100,15 @@ def get_platform_config(accelerator: str, gptoss: bool) -> Dict[str, Any]:
         }
     # TODO: Add support for ROCm and XPU in the future
     else:  # cpu
-        return {
+        cfg = {
             "image": "ghcr.io/wsmlby/homl/server:latest-cpu",
             "deploy_resources": "",
         }
+
+    if os.environ.get("HOML_DOCKER_IMAGE_OVERRIDE"):
+        cfg["image"] = os.environ["HOML_DOCKER_IMAGE_OVERRIDE"]
+
+    return cfg
 
 
 def check_and_install_nvidia_runtime():
@@ -155,6 +158,7 @@ def install(insecure_socket: bool, upgrade: bool, gptoss: bool, install_webui: b
     if accelerator == "cuda":
         if not check_and_install_nvidia_runtime():
             return
+        # add other platform checks here
     else:
         click.secho("No NVIDIA runtime found. Currently only support NVIDIA GPU. Abort.", fg="red")
         return
```
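The diff switches from early `return` statements to a `cfg`-then-`return` shape so the `HOML_DOCKER_IMAGE_OVERRIDE` check applies to every platform branch. A self-contained sketch of that pattern (the `rocm` branch and its image tag are hypothetical, shown only to illustrate where a new platform would slot in):

```python
import os
from typing import Any, Dict


def get_platform_config(accelerator: str, gptoss: bool = False) -> Dict[str, Any]:
    """Pick a per-platform config, then apply the image override last."""
    if accelerator == "cuda":
        cfg = {
            "image": "ghcr.io/wsmlby/homl/server:latest-cuda-gptoss"
            if gptoss else "ghcr.io/wsmlby/homl/server:latest-cuda",
            "deploy_resources": "...gpu reservations...",
        }
    elif accelerator == "rocm":  # hypothetical branch a new platform would add
        cfg = {"image": "ghcr.io/wsmlby/homl/server:latest-rocm", "deploy_resources": ""}
    else:  # cpu fallback
        cfg = {"image": "ghcr.io/wsmlby/homl/server:latest-cpu", "deploy_resources": ""}

    # The override always wins, so a locally built image can be tested on
    # any platform before it is published to a registry.
    if os.environ.get("HOML_DOCKER_IMAGE_OVERRIDE"):
        cfg["image"] = os.environ["HOML_DOCKER_IMAGE_OVERRIDE"]
    return cfg
```

Because the override is applied after the branching, adding a platform only requires a new `elif`; the local-image workflow comes for free.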

server/Dockerfile.cpu

Lines changed: 18 additions & 101 deletions
```diff
@@ -1,111 +1,28 @@
-# This Dockerfile is sourced from the official vLLM project:
-# https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.cpu
-#
-# To build the CPU base image for HoML, you should:
-# 1. Clone the vLLM repository: git clone https://github.com/vllm-project/vllm.git
-# 2. Navigate to the vLLM repository root.
-# 3. Place this file at the root of the vLLM project checkout.
-# 4. Run the build command, e.g.:
-#    docker buildx build -t homl/vllm-cpu:latest -f Dockerfile.cpu .
-#
-# The resulting `homl/vllm-cpu:latest` image can then be used as a base
-# for the main HoML server image.
+# This Dockerfile builds the final HoML Server image for CPU.
+# It layers the HoML server code on top of a pre-built vLLM CPU base image.
+FROM ghcr.io/wsmlby/homl-vllm-cpu-base:latest
 
-######################### COMMON BASE IMAGE #########################
-FROM ubuntu:22.04 AS base-common
 
-WORKDIR /workspace/
+# Set the working directory to homl_server
+WORKDIR /app/homl_server
 
-ARG PYTHON_VERSION=3.12
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 
-# Install minimal dependencies and uv
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get update -y \
-    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
-    gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
+# Copy requirements.txt and install dependencies
+COPY requirements.txt ./
+RUN pip install -r requirements.txt
 
-ENV CCACHE_DIR=/root/.cache/ccache
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+# ENV ACCELERATOR=CPU
 
-ENV PATH="/root/.local/bin:$PATH"
-ENV VIRTUAL_ENV="/opt/venv"
-ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
-RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+# Copy our application source code
+COPY ./homl_server ./
+COPY ./homl_server ./homl_server
+COPY ./vllm_patches ./patches
 
-ENV UV_HTTP_TIMEOUT=500
+RUN cd /usr/local/lib/python3.12/dist-packages/vllm && patch -p1 < /app/patches/registry.patch
 
-# Install Python dependencies
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-ENV UV_LINK_MODE="copy"
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-    uv pip install --upgrade pip && \
-    uv pip install -r requirements/cpu.txt
 
-ARG TARGETARCH
-ENV TARGETARCH=${TARGETARCH}
+ARG HOML_SERVER_VERSION=dev
+ENV HOML_SERVER_VERSION=$HOML_SERVER_VERSION
 
-######################### x86_64 BASE IMAGE #########################
-FROM base-common AS base-amd64
-
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"
-
-######################### arm64 BASE IMAGE #########################
-FROM base-common AS base-arm64
-
-ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
-
-######################### BASE IMAGE #########################
-FROM base-${TARGETARCH} AS base
-
-RUN echo 'ulimit -c 0' >> ~/.bashrc
-
-######################### BUILD IMAGE #########################
-FROM base AS vllm-build
-
-ARG GIT_REPO_CHECK=0
-# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
-ARG VLLM_CPU_DISABLE_AVX512=true
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
-ARG VLLM_CPU_AVX512BF16=false
-ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
-# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
-ARG VLLM_CPU_AVX512VNNI=false
-ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
-    uv pip install -r requirements/build.txt
-
-COPY . .
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
-    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
-
-######################### RELEASE IMAGE #########################
-FROM base AS vllm-openai
-
-WORKDIR /workspace/
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
-    uv pip install dist/*.whl
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+# Start the server directly from main.py
+ENTRYPOINT ["python3", "-u", "main.py"]
```
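Since the vLLM registry patch is now baked into the image at build time, a quick smoke test is to confirm vLLM imports inside the built image; the `dev-cpu` tag below is illustrative:

```shell
# Bypass the HoML entrypoint and check that the patched vLLM install loads.
docker run --rm --entrypoint python3 homl/server:dev-cpu \
  -c "import vllm; print(vllm.__version__)"
```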

server/Dockerfile.cpu.app

Lines changed: 0 additions & 26 deletions
This file was deleted.
