Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/scripts/runner_setup.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/bin/bash
# Prepares a CI runner: makes sure the `uv` tool is installed and reports the Docker version.
# Abort on the first command that exits non-zero.
set -e

# NOTE(review): this span looks like a rendered diff with no +/- markers; the two
# unconditional lines below appear to be the pre-change form of the guarded
# install that follows — confirm against the real file before relying on them.
curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR="/usr/local/bin" sh
uv self update
# Install uv into /usr/local/bin and self-update it only when no `uv` command is found on PATH.
if ! command -v uv &> /dev/null; then
curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR="/usr/local/bin" sh
uv self update
fi
# Print the Docker version; because of `set -e`, a missing or broken docker makes the script fail here.
docker --version
16 changes: 15 additions & 1 deletion .github/workflows/pr-example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,21 @@ jobs:
- uses: actions/checkout@v5
- run: .github/scripts/runner_setup.sh
- run: .github/scripts/buildkitd.sh

- name: build vllm-rayserve-ec2 image
shell: bash
run: |
aws ecr get-login-password --region ${{ secrets.AWS_REGION }} | docker login --username AWS --password-stdin ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com
IMAGE_TAG=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/ci:vllm-0.10.2-gpu-py312-cu128-ubuntu22.04-rayserve-ec2-pr-${{ github.event.pull_request.number }}
docker buildx build --progress plain \
--build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
--cache-to=type=inline \
--cache-from=type=registry,ref="$IMAGE_TAG" \
--tag "$IMAGE_TAG" \
--target vllm-rayserve-ec2 \
-f docker/vllm/Dockerfile.rayserve .
docker push "$IMAGE_TAG"
docker rmi "$IMAGE_TAG"

example-on-g6xl-runner-1:
needs: [example-on-build-runner]
runs-on:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ __pycache__
.idea
*.pyc
.venv
.ruff_cache
28 changes: 15 additions & 13 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,14 @@ repos:
# optional: add additional arguments here
- --indent=2
- --write
stages: [manual] # run in CI
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
stages: [manual] # run in CI
stages: [pre-commit]
- repo: https://github.com/scop/pre-commit-shfmt
rev: v3.12.0-2 # Use the latest stable revision
hooks:
- id: shfmt
# Optional: Add arguments to shfmt if needed, e.g., to enable "simplify" mode
args: ["-s"]
- repo: https://github.com/crate-ci/typos
rev: v1.38.1
hooks:
- id: typos
args: [--force-exclude]
stages: [pre-commit]
- repo: https://github.com/hukkin/mdformat
rev: 1.0.0 # Use the ref you want to point at
hooks:
Expand All @@ -40,17 +31,28 @@ repos:
additional_dependencies:
- mdformat-gfm
- mdformat-black
stages: [pre-commit]
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
hooks:
- id: markdownlint
args: [--fix]
stages: [pre-commit]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.3
hooks:
- id: ruff-check
args: [ --fix, --output-format=github ]
- id: ruff-format
stages: [pre-commit]
- id: ruff-check
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
- repo: https://github.com/crate-ci/typos
rev: v1.38.1
hooks:
- id: typos
args: [--force-exclude]
- repo: local
hooks:
- id: signoff-commit
Expand Down
7 changes: 7 additions & 0 deletions DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ uv pip install pre-commit
pre-commit install
```

Install Go using [Homebrew](https://brew.sh/). The example below assumes macOS.

```bash
brew install go
go env -w GOPROXY=direct
```

To manually run all linters:

```bash
Expand Down
68 changes: 68 additions & 0 deletions docker/vllm/Dockerfile.rayserve
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Base stage: the upstream vLLM OpenAI image extended with the telemetry
# scripts, OSS compliance artifacts, the AWS EFA stack and a replacement
# libnvjpeg build.
FROM docker.io/vllm/vllm-openai:v0.10.2 AS base
# Interpreter name handed to setup_oss_compliance.sh.
ARG PYTHON="python3"
LABEL maintainer="Amazon AI"
# EFA installer version handed to install_efa.sh.
ARG EFA_VERSION="1.43.3"
LABEL dlc_major_version="1"
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    DLC_CONTAINER_TYPE=base \
    # Python won't try to write .pyc or .pyo files on the import of source modules
    # Force stdin, stdout and stderr to be totally unbuffered. Good for logging
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"

WORKDIR /

COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY ./scripts/telemetry/bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
COPY ./scripts/setup_oss_compliance.sh setup_oss_compliance.sh

# Make the telemetry scripts executable, hook telemetry into every bash
# session via /etc/bash.bashrc, generate the OSS compliance artifacts, then
# clean up temporary files.
RUN chmod +x /usr/local/bin/deep_learning_container.py \
    && chmod +x /usr/local/bin/bash_telemetry.sh \
    && echo 'source /usr/local/bin/bash_telemetry.sh' >>/etc/bash.bashrc \
    && bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh \
    # create symlink for python
    && ln -s /usr/bin/python3 /usr/bin/python \
    # clean up
    # HOME_DIR is only defined inside setup_oss_compliance.sh (as /root), so the
    # path is spelled out here instead of expanding an unset variable.
    && rm -rf /root/oss_compliance* \
    && rm -rf /tmp/tmp* \
    && rm -rf /tmp/uv* \
    && rm -rf /var/lib/apt/lists/* \
    # Best effort: only this last removal is allowed to fail. The braces keep
    # `|| true` from masking failures of the earlier commands in the chain.
    && { rm -rf /root/.cache || true; }

COPY ./scripts/install_efa.sh install_efa.sh
# Install EFA, then swap the CUDA toolkit's libnvjpeg for the 12.4.0.76
# redistributable and drop the cuobjdump / nvdisasm binaries.
RUN bash install_efa.sh ${EFA_VERSION} \
    && rm install_efa.sh \
    && mkdir -p /tmp/nvjpeg \
    && cd /tmp/nvjpeg \
    && wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
    && tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
    && rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
    && rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
    && cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/lib64/ \
    && cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/include/ \
    && rm -rf /tmp/nvjpeg \
    # remove cuobjdump and nvdisasm
    && rm -rf /usr/local/cuda/bin/cuobjdump* \
    && rm -rf /usr/local/cuda/bin/nvdisasm*

# ====================== ray serve =========================================
# Final stage: adds Ray Serve on top of the base stage and starts the vLLM
# OpenAI-compatible API server through the entrypoint wrapper.
FROM base AS vllm-rayserve-ec2

# The requirement is quoted so the shell cannot treat `[serve]` as a glob
# character class.
RUN uv pip install --system "ray[serve]==2.49.0" \
    && uv cache clean

# Changing this build argument invalidates the cache for the RUN below, which
# forces the OS package upgrade to be re-executed.
ARG CACHE_REFRESH=0
# Hold every installed package whose `dpkg -l` line mentions cuda, nvidia or
# libnv so the upgrade does not replace it, then upgrade everything else.
# `xargs -r` skips `apt-mark hold` entirely when nothing matches.
RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs -r apt-mark hold \
    && apt-get update \
    && apt-get upgrade -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

COPY ./scripts/dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh

ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
6 changes: 6 additions & 0 deletions scripts/dockerd_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Container entrypoint: run the telemetry hook, then start the vLLM
# OpenAI-compatible API server with whatever arguments the container received.

# Best-effort telemetry: all output is discarded and any failure (including a
# missing script) is ignored, so telemetry can never block server start-up.
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# `exec` replaces this wrapper shell with the server process, so signals sent
# to the container's main process (e.g. SIGTERM on stop) reach the server
# directly and its exit status becomes the container's exit status.
exec python3 -m vllm.entrypoints.openai.api_server "$@"
102 changes: 102 additions & 0 deletions scripts/install_efa.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/bin/bash

set -ex

# Derive the per-architecture library directory name from the machine type.
# Only x86_64 and aarch64 are supported; anything else aborts the script.
ARCH=$(uname -m)
if [ "$ARCH" = "x86_64" ]; then
  ARCH_DIR="x86_64-linux-gnu"
elif [ "$ARCH" = "aarch64" ]; then
  ARCH_DIR="aarch64-linux-gnu"
else
  echo "Unsupported architecture: $ARCH"
  exit 1
fi

#######################################
# Verify that libnccl-net.so exists under the ofi-nccl library directory for
# the current architecture.
# Globals:   ARCH_DIR (read)
# Arguments: none
# Outputs:   an error message on stderr when the file is missing
# Returns:   0 if the file exists as a regular file, 1 otherwise
#######################################
function check_libnccl_net_so {
  local ofi_lib_dir="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}"
  local nccl_net_so="${ofi_lib_dir}/libnccl-net.so"

  # Check if file exists
  if [ ! -f "${nccl_net_so}" ]; then
    echo "ERROR: ${nccl_net_so} does not exist" >&2
    return 1
  fi
}

#######################################
# Install the AWS EFA installer bundle and configure Open MPI, NCCL and
# OpenSSH for use inside the container. Relies on the caller's `set -e` to
# abort on the first failing step.
# Globals:   ARCH_DIR (read indirectly through check_libnccl_net_so)
# Arguments: $1 - EFA installer version, e.g. 1.43.3
# Returns:   status of check_libnccl_net_so (0 when libnccl-net.so is present)
#######################################
function install_efa {
  local efa_version=$1
  local open_mpi_path="/opt/amazon/openmpi"
  local efa_workdir="/tmp/efa"
  local efa_tarball="aws-efa-installer-${efa_version}.tar.gz"

  # Install build time tools
  apt-get update
  apt-get install -y --allow-change-held-packages --no-install-recommends \
    curl \
    build-essential \
    cmake \
    git

  # Install EFA. The download and install run in a subshell so the directory
  # changes stay local and the rest of the function does not keep running
  # from a working directory that has just been deleted.
  mkdir -p "${efa_workdir}"
  (
    cd "${efa_workdir}"
    # --fail makes an HTTP error abort here instead of saving an error page
    # that tar would then choke on.
    curl -f -O "https://s3-us-west-2.amazonaws.com/aws-efa-installer/${efa_tarball}"
    tar -xf "${efa_tarball}"
    cd aws-efa-installer
    ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify
  )
  rm -rf "${efa_workdir}"

  # Configure Open MPI and configure NCCL parameters.
  # mpirun is replaced by a wrapper that always passes --allow-run-as-root to
  # the real binary.
  mv "${open_mpi_path}/bin/mpirun" "${open_mpi_path}/bin/mpirun.real"
  echo '#!/bin/bash' >"${open_mpi_path}/bin/mpirun"
  echo "${open_mpi_path}/bin/mpirun.real --allow-run-as-root \"\$@\"" >>"${open_mpi_path}/bin/mpirun"
  chmod a+x "${open_mpi_path}/bin/mpirun"
  echo "hwloc_base_binding_policy = none" >>"${open_mpi_path}/etc/openmpi-mca-params.conf"
  echo "rmaps_base_mapping_policy = slot" >>"${open_mpi_path}/etc/openmpi-mca-params.conf"
  echo NCCL_DEBUG=INFO >>/etc/nccl.conf
  echo NCCL_SOCKET_IFNAME=^docker0,lo >>/etc/nccl.conf

  # Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation
  apt-get install -y --no-install-recommends \
    openssh-client \
    openssh-server
  mkdir -p /var/run/sshd
  # Drop any existing StrictHostKeyChecking lines, then append the desired one.
  # grep exits 1 when it selects no lines; that is not an error here, so only
  # other failures (exit status 2) are allowed to abort under `set -e`.
  grep -v StrictHostKeyChecking /etc/ssh/ssh_config >/etc/ssh/ssh_config.new || [ $? -eq 1 ]
  echo " StrictHostKeyChecking no" >>/etc/ssh/ssh_config.new
  mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
  # Configure OpenSSH so that nodes can communicate with each other
  sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
  # Start from a clean /root/.ssh and authorize a freshly generated,
  # passphrase-less RSA key.
  rm -rf /root/.ssh/
  mkdir -p /root/.ssh/
  ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa
  cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
  printf "Host *\n StrictHostKeyChecking no\n" >>/root/.ssh/config

  # Remove build time tools
  # apt-get remove -y
  # curl
  # build-essential
  # cmake
  # git

  # Cleanup
  apt-get clean
  apt-get autoremove -y
  rm -rf /var/lib/apt/lists/*
  ldconfig
  # Final sanity check; its status is the function's return value.
  check_libnccl_net_so
}

# Process each command-line argument in order: an argument matching the glob
# below (one digit, a dot, then two further dot-separated parts that each
# start with a digit) is treated as an EFA version and installed; anything
# else aborts with a non-zero status. With no arguments the script does nothing.
while [ $# -gt 0 ]; do
  case "$1" in
    [0-9].[0-9]*.[0-9]*)
      install_efa "$1"
      ;;
    *)
      # Diagnostics go to stderr.
      echo "bad argument $1" >&2
      exit 1
      ;;
  esac
  shift
done
34 changes: 34 additions & 0 deletions scripts/setup_oss_compliance.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

set -ex

#######################################
# Download the OSS compliance bundle into /root, install its
# testOSSCompliance helper into /usr/local/bin, run the bundle's
# generate_oss_compliance.sh, and remove the temporary files afterwards.
# Relies on the caller's `set -e` to abort on the first failing step.
# Arguments: $1 - Python interpreter to use; python3 is used when empty
#######################################
function install_oss_compliance {
  local home_dir="/root"
  local python=${1:-}

  if [ -z "${python}" ]; then
    echo "Python version not specified. Using default Python."
    python="python3"
  fi

  local archive="${home_dir}/oss_compliance.zip"
  # --fail makes an HTTP error abort here instead of saving an error page
  # that the unzip step would then reject.
  curl -f -o "${archive}" https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip
  # Unpack with Python's zipfile module and delete the archive. The paths are
  # passed as arguments so they always agree with home_dir.
  "${python}" -c 'import os, sys, zipfile; zipfile.ZipFile(sys.argv[1]).extractall(sys.argv[2]); os.remove(sys.argv[1])' \
    "${archive}" "${home_dir}/"
  cp "${home_dir}/oss_compliance/test/testOSSCompliance" /usr/local/bin/testOSSCompliance
  chmod +x /usr/local/bin/testOSSCompliance
  chmod +x "${home_dir}/oss_compliance/generate_oss_compliance.sh"
  "${home_dir}/oss_compliance/generate_oss_compliance.sh" "${home_dir}" "${python}"
  rm -rf "${home_dir}"/oss_compliance*
  rm -rf /tmp/tmp*
  # Remove the cache directory (the original note says this is needed for
  # security verification). Best effort: a failure here must not abort.
  rm -rf "${home_dir}/.cache" || true
}

# Process each command-line argument in order: an argument starting with
# "python" is taken as the interpreter name and triggers the install; anything
# else aborts with a non-zero status. With no arguments the script does nothing.
while [ $# -gt 0 ]; do
  case "$1" in
    python*)
      install_oss_compliance "$1"
      ;;
    *)
      # Diagnostics go to stderr.
      echo "bad argument $1" >&2
      exit 1
      ;;
  esac
  shift
done
11 changes: 11 additions & 0 deletions scripts/telemetry/bash_telemetry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# telemetry.sh
# Starts /usr/local/bin/deep_learning_container.py in the background when that
# file exists, unless OPT_OUT_TRACKING is set to "true" (any letter case).
# Every expansion uses a `:-` default so the file can also be sourced from a
# shell running with `set -u` without raising "unbound variable".
# NOTE(review): the image build sets DLC_CONTAINER_TYPE, while this script
# reads CONTAINER_TYPE — confirm which name the reporter is meant to receive.
if [ -f /usr/local/bin/deep_learning_container.py ] && [[ "${OPT_OUT_TRACKING:-}" != [Tt][Rr][Uu][Ee] ]]; then
  # The reporter is backgrounded inside a subshell with all output discarded,
  # so the calling shell neither waits for it nor tracks it as a job.
  (
    python /usr/local/bin/deep_learning_container.py \
      --framework "${FRAMEWORK:-}" \
      --framework-version "${FRAMEWORK_VERSION:-}" \
      --container-type "${CONTAINER_TYPE:-}" \
      &>/dev/null &
  )
fi
Loading