Commit 19ae8e7: Merge pull request #352 from dipatidar/main ("Adding Mistral AI deployment example")

2 parents 7f37b00 + 26969c1

35 files changed: +2337 −0 lines changed
Makefile (55 additions & 0 deletions)
```makefile
TENANCY:=${TENANCY_NAME}
CONTAINER_REGISTRY:=${REGION_KEY}.ocir.io

TGI_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/text-generation-interface-odsc:0.9.3
TGI_CONTAINER_NAME:=tgi-odsc

VLLM_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/vllm-odsc:0.2.0
VLLM_CONTAINER_NAME:=vllm-odsc

# Local Hugging Face data directory mounted into the container
MODEL_DIR:=${PWD}/hfdata
TARGET_DIR:=/home/datascience
HF_DIR=/home/datascience/.cache

# Hugging Face access token file and its mount path inside the container
token:=${PWD}/token
target_token:=/opt/ds/model/deployed_model/token
model:=meta-llama/Llama-2-7b-chat-hf
port:=8080
params:="--max-batch-prefill-tokens 1024"
local_model:=/opt/ds/model/deployed_model
tensor_parallelism:=1

# Fail fast if the registry-related environment variables are missing
check-env:
	@if [[ -z "$${TENANCY_NAME}" ]]; then \
		echo "TENANCY_NAME is not set or is empty"; \
		exit 1; \
	fi
	@if [[ -z "$${REGION_KEY}" ]]; then \
		echo "REGION_KEY is not set or is empty"; \
		exit 1; \
	fi
	@echo "Both TENANCY_NAME and REGION_KEY are set and have values."

build.tgi: check-env
	cd tgi/ && docker build --network host -t ${TGI_INFERENCE_IMAGE} -f Dockerfile.tgi .

build.vllm: check-env
	cd vllm/ && docker build --network host -t ${VLLM_INFERENCE_IMAGE} -f Dockerfile.vllm .

# *.hf targets download weights from Hugging Face (token required);
# *.oci targets expect weights at the Model Catalog mount path
run.tgi.hf:
	docker run --rm -it --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e TOKEN_FILE=${target_token} -e PARAMS=${params} -e MODEL=${model} -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}

run.tgi.oci:
	docker run --rm -it --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e PARAMS=${params} -e MODEL=${local_model} -v ${MODEL_DIR}:${TARGET_DIR} --name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}

run.vllm.hf:
	docker run --rm -it --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e UVICORN_NO_USE_COLORS=1 -e TOKEN_FILE=${target_token} -e MODEL=${model} -e TENSOR_PARALLELISM=${tensor_parallelism} -e HUGGINGFACE_HUB_CACHE=${HF_DIR} -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}

run.vllm.oci:
	docker run --rm -d --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e UVICORN_NO_USE_COLORS=1 -e MODEL=${local_model} -e TENSOR_PARALLELISM=${tensor_parallelism} -v ${MODEL_DIR}:${TARGET_DIR} --name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}

# Drop into a shell inside the container for debugging
shell.tgi:
	docker run --rm -it --shm-size=1g --net host -p ${port}:${port} -e TOKEN_FILE=${target_token} -e PARAMS="--model-id ${model}" --entrypoint bash -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}

shell.vllm:
	docker run --rm -it --shm-size=15g --net host -p ${port}:${port} -e TOKEN_FILE=${target_token} -e PARAMS="--model ${model}" -e HUGGINGFACE_HUB_CACHE=${HF_DIR} --entrypoint bash -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}

stop.tgi:
	docker stop ${TGI_CONTAINER_NAME}

stop.vllm:
	docker stop ${VLLM_CONTAINER_NAME}

push.tgi:
	docker push ${TGI_INFERENCE_IMAGE}

push.vllm:
	docker push ${VLLM_INFERENCE_IMAGE}
```
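Taken together, a typical local flow with these targets might look like the sketch below. It assumes the OCIR repositories described in the README that follows already exist, and that a Hugging Face token sits at `./token`:

```bash
export TENANCY_NAME=<your-tenancy-name>
export REGION_KEY=<region-key>   # e.g. IAD or FRA

make check-env     # verify both variables are set
make build.tgi     # build the TGI image
make push.tgi      # push to OCIR (requires a prior docker login, see README)
make run.tgi.hf    # run locally, downloading weights from Hugging Face
```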
README.md (140 additions & 0 deletions)
# Overview

This repo provides two approaches to managing the inference server for LLM deployments in OCI Data Science:

* [Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) from Hugging Face
* [vLLM](https://github.com/vllm-project/vllm), developed at UC Berkeley

## Prerequisites

* This is a Limited Availability (LA) feature. Please reach out to us by email at `[email protected]` to be allowlisted for it.
* Configure your [API Auth Token](https://docs.oracle.com/en-us/iaas/Content/Registry/Tasks/registrygettingauthtoken.htm) to be able to run and test your code locally.
* Install [Docker](https://docs.docker.com/get-docker) or [Rancher Desktop](https://rancherdesktop.io/) as a Docker alternative.

## Required IAM Policies

See the public [documentation](https://docs.oracle.com/en-us/iaas/data-science/using/policies.htm).

### Bring your own container [policies](https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__access-logging-service#model_dep_policies_auth__access-custom-container)
`ALL { resource.type = 'datasciencemodeldeployment' }`

`allow dynamic-group <dynamic-group-name> to read repos in compartment <compartment-name> where ANY {request.operation='ReadDockerRepositoryMetadata',request.operation='ReadDockerRepositoryManifest',request.operation='PullDockerLayer' }`

#### If the repository is in the root compartment, allow read for the tenancy

`allow dynamic-group <dynamic-group-name> to read repos in tenancy where ANY { request.operation='ReadDockerRepositoryMetadata', request.operation='ReadDockerRepositoryManifest', request.operation='PullDockerLayer' }`

#### For user level policies

`allow any-user to read repos in tenancy where ALL { request.principal.type = 'datasciencemodeldeployment' }`

`allow any-user to read repos in compartment <compartment-name> where ALL { request.principal.type = 'datasciencemodeldeployment' }`

For all other Data Science policies, please refer to these [details](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/distributed_training/README.md#3-oci-policies).
## Build TGI Container

To build the containers required for this deployment, complete the following steps:

* Check out this repository
* Enter the path `model-deployment/containers/llm/inference-images`

```bash
cd model-deployment/containers/llm/inference-images
```

* This example uses the [OCI Container Registry](https://docs.oracle.com/en-us/iaas/Content/Registry/Concepts/registryoverview.htm) (OCIR) to store the container image required for the deployment. For the `Makefile` to execute the container build and push process to OCIR, you have to set the `TENANCY_NAME` and `REGION_KEY` environment variables in your local terminal. `TENANCY_NAME` is the name of your tenancy, which you can find under your [account settings](https://cloud.oracle.com/tenancy). `REGION_KEY` is the three-letter key of the tenancy region you intend to use for this example, for example IAD for Ashburn or FRA for Frankfurt. You can find the region keys in the public documentation for [Regions and Availability Domains](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm).

```bash
export TENANCY_NAME=<your-tenancy-name>
export REGION_KEY=<region-key>
```

You can find the official documentation about [OCI Data Science Model Deployment](https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_create.htm).

* Build the TGI container image; this step takes a while

```bash
make build.tgi
```

* Before you can push the newly built container, make sure that you've created the `text-generation-interface-odsc` repository in your tenancy:
  * Go to your tenancy [Container Registry](https://cloud.oracle.com/compute/registry/containers)
  * Click on the `Create repository` button
  * Select `Private` under Access types
  * Set `text-generation-interface-odsc` as the `Repository name`
  * Click on the `Create` button

* You may need to `docker login` to the Oracle Cloud Container Registry (OCIR) first, if you haven't done so before, to be able to push the image. To log in, use the [API Auth Token](https://docs.oracle.com/en-us/iaas/Content/Registry/Tasks/registrygettingauthtoken.htm) that can be created under `Oracle Cloud Account -> Auth Token`. You need to log in only once.

```bash
docker login -u '<tenancy-namespace>/<username>' <region>.ocir.io
```

If your tenancy is **federated** with Oracle Identity Cloud Service, use the format `<tenancy-namespace>/oracleidentitycloudservice/<username>`.
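For example, a federated user in a hypothetical tenancy with namespace `mytenancy`, pushing to Frankfurt, would log in like this (illustrative values only):

```bash
docker login -u 'mytenancy/oracleidentitycloudservice/jane.doe@example.com' fra.ocir.io
```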
* Push the container image to OCIR

```bash
make push.tgi
```

## Build vLLM Container

You can find the official documentation about [OCI Data Science Model Deployment](https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_create.htm).

* Build the vLLM container image; this step takes a while

```bash
make build.vllm
```

* Before you can push the newly built container, make sure that you've created the `vllm-odsc` repository in your tenancy:
  * Go to your tenancy [Container Registry](https://cloud.oracle.com/compute/registry/containers)
  * Click on the `Create repository` button
  * Select `Private` under Access types
  * Set `vllm-odsc` as the `Repository name`
  * Click on the `Create` button

* You may need to `docker login` to the Oracle Cloud Container Registry (OCIR) first, if you haven't done so before, to be able to push the image. To log in, use the [API Auth Token](https://docs.oracle.com/en-us/iaas/Content/Registry/Tasks/registrygettingauthtoken.htm) that can be created under `Oracle Cloud Account -> Auth Token`. You need to log in only once.

```bash
docker login -u '<tenancy-namespace>/<username>' <region>.ocir.io
```

If your tenancy is **federated** with Oracle Identity Cloud Service, use the format `<tenancy-namespace>/oracleidentitycloudservice/<username>`.

* Push the container image to OCIR

```bash
make push.vllm
```

### Advanced debugging options: code debugging inside the container using a job

For a more detailed level of debugging, refer to [README-DEBUG.md](./README-DEBUG.md).
## Additional Make Commands

### TGI containers

`make build.tgi` to build the container

`make run.tgi.hf` or `make run.tgi.oci` to run the container (weights downloaded from Hugging Face or taken from the local Model Catalog path, respectively)

`make shell.tgi` to launch the container with a shell prompt

`make stop.tgi` to stop the running container
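Once a TGI container is running locally, you can sanity-check it against TGI's standard `/generate` endpoint; a minimal sketch, assuming the default port 8080 from the Makefile:

```bash
curl http://127.0.0.1:8080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 64}}'
```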
### vLLM containers

`make build.vllm` to build the container

`make run.vllm.hf` or `make run.vllm.oci` to run the container (weights downloaded from Hugging Face or taken from the local Model Catalog path, respectively)

`make shell.vllm` to launch the container with a shell prompt

`make stop.vllm` to stop the running container
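If the vLLM container was started with `API_SPEC=openai` (see `start-vllm.sh` later in this commit), it runs vLLM's OpenAI-compatible server, which you can probe with a completion request. A hedged sketch: without an explicit `--port`, that server listens on vLLM's default port 8000 unless the bundled nginx config proxies it to `$PORT`:

```bash
curl http://127.0.0.1:8000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-2-7b-chat-hf", "prompt": "What is Deep Learning?", "max_tokens": 64}'
```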
tgi/Dockerfile.tgi (28 additions & 0 deletions)
```dockerfile
FROM ghcr.io/huggingface/text-generation-inference:0.9.3
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get -y install tzdata && apt-get install -y curl
RUN pip install flask

WORKDIR /home/datascience
RUN ln -s /home/datascience /data

COPY start-tgi.sh /etc/
RUN chmod a+x /etc/start-tgi.sh

ENV PORT 8080
ENV MODEL /opt/ds/model/deployed_model

# llama2-7b-hf with A10 shape
ENV PARAMS "--max-batch-prefill-tokens 1024"

# llama2-13b-hf with A10 shape
# ENV PARAMS "--max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096"

# llama2-70b-hf with A100 shape
# ENV PARAMS "--max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096 --num-shard <NUMBER_OF_ALLOWED_GPUS or WORLD_SIZE>"

EXPOSE ${PORT}

ENTRYPOINT [ "/bin/bash", "--login", "-c" ]
CMD ["/etc/start-tgi.sh"]
```
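The commented-out `PARAMS` variants above can be supplied at run time instead of rebuilding the image, for example through the Makefile's `params` and `model` variables. A sketch using the 13B settings from the comments (the outer quoting mirrors the Makefile's default `params` value):

```bash
make run.tgi.hf \
  model=meta-llama/Llama-2-13b-chat-hf \
  params='"--max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096"'
```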
tgi/start-tgi.sh (19 additions & 0 deletions)
```bash
#!/bin/bash
if [ -z "$TOKEN_FILE" ] ; then
    echo "No authentication token is provided. Weights are assumed to be downloaded from OCI Model Catalog."
else
    export HUGGING_FACE_HUB_TOKEN=$(cat $TOKEN_FILE)
    echo "Checking internet connection: "
    curl -sI -v https://www.wikipedia.org
    echo "Downloading weights:"
    text-generation-server download-weights $MODEL
    echo "Download weights complete"
    echo $(du -sh /home/datascience/*)
fi

echo "Starting TGI..."
text-generation-launcher --json-output --hostname 0.0.0.0 --port $PORT --model-id $MODEL $PARAMS
exit_code=$?  # exit status of text-generation-launcher

echo "Exiting TGI. Here is the disk utilization of /home/datascience - "
echo $(du -sh /home/datascience)
exit $exit_code
```
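The `TOKEN_FILE` branch above expects a plain-text Hugging Face access token mounted into the container; the Makefile's `run.tgi.hf` target takes it from `./token`. A minimal sketch of preparing it (the token value is a placeholder):

```bash
# Write your Hugging Face token (no trailing newline) and restrict permissions
printf '%s' 'hf_xxxxxxxxxxxxxxxxxxxx' > token
chmod 600 token

# The Makefile mounts ./token to /opt/ds/model/deployed_model/token in the container
make run.tgi.hf
```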
vllm/Dockerfile.vllm (36 additions & 0 deletions)
```dockerfile
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get -y install tzdata && apt-get install -y curl && apt-get install -y git
RUN curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh
RUN bash ./miniconda.sh -b -p /miniconda; rm ./miniconda.sh;
ENV PATH="/miniconda/bin:$PATH"
RUN mkdir -p /opt/vllm

ARG INSTALL_DIR=/opt/vllm
COPY vllm-env.yaml /opt/vllm/environment.yaml
RUN conda env create --name vllm -f ${INSTALL_DIR}/environment.yaml

ENV TMPDIR=/home/datascience
WORKDIR /home/datascience

COPY start-vllm.sh ${INSTALL_DIR}/start.sh
RUN chmod a+x ${INSTALL_DIR}/start.sh
COPY vllm-log-config.yaml ${INSTALL_DIR}/vllm-log-config.yaml
ENV UVICORN_LOG_CONFIG=${INSTALL_DIR}/vllm-log-config.yaml

# Default location where downloaded models are mapped on the model container. No need to override if using the Model Catalog.
ENV MODEL /opt/ds/model/deployed_model

# Tensor parallelism required by the model
ENV TENSOR_PARALLELISM 1

# Custom port for the model container. No need to override.
ENV PORT 8080
EXPOSE ${PORT}
ENV VLLM_DIR=${INSTALL_DIR}
COPY vllm-api-server.py ${VLLM_DIR}/vllm-api-server.py

ENTRYPOINT [ "/bin/bash", "--login", "-c" ]
CMD ["$VLLM_DIR/start.sh"]
```
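`TENSOR_PARALLELISM` should match the number of GPUs the model needs to shard across; it can be overridden without rebuilding, for example through the Makefile variable when running locally on a multi-GPU shape (illustrative values):

```bash
# Shard the model across two GPUs instead of the default one
make run.vllm.hf tensor_parallelism=2 model=meta-llama/Llama-2-13b-chat-hf
```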
vllm/start-vllm.sh (37 additions & 0 deletions)
```bash
#!/bin/bash

if [ -z "$TOKEN_FILE" ] ; then
    echo "No authentication token is provided. Weights are assumed to be downloaded from OCI Model Catalog."
else
    export HUGGING_FACE_HUB_TOKEN=$(cat $TOKEN_FILE)
    echo "The md5 of token is $(md5sum $TOKEN_FILE)"
    mkdir -p /home/datascience/.cache/huggingface
    cp $TOKEN_FILE /home/datascience/.cache/huggingface/token
    echo "Copied token file to /home/datascience/.cache/huggingface, $(md5sum /home/datascience/.cache/huggingface/token)"
    echo "Set HuggingFace cache folder..."
    export HUGGINGFACE_HUB_CACHE=/home/datascience/.cache
    echo "The size of partitions"
    echo $(df -h /home/datascience)
    df -h
    echo "Checking internet connection: "
    curl -sI -v https://www.wikipedia.org
    echo $(du -sh /home/datascience/*)
fi

if [ "$API_SPEC" == "openai" ]
then
    echo "starting vllm engine with openai spec"
    nginx -p $PWD
    source activate vllm && \
    WEB_CONCURRENCY=1 python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --model ${MODEL} --tensor-parallel-size ${TENSOR_PARALLELISM} ${PARAMS}
else
    echo "starting vllm engine with default api spec"
    source activate vllm && \
    WEB_CONCURRENCY=1 python $VLLM_DIR/vllm-api-server.py --host 0.0.0.0 --log-config $VLLM_DIR/vllm-log-config.yaml --model ${MODEL} --tensor-parallel-size ${TENSOR_PARALLELISM} ${PARAMS}
fi

echo "Exiting vLLM. Here is the disk utilization of /home/datascience - "
echo $(du -sh /home/datascience)
echo "server logs: "
ls -lah /home/datascience
cat /home/datascience/server.log
```
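The `API_SPEC` switch above is not set by any Makefile target; to exercise the OpenAI-compatible branch locally you can pass it directly to `docker run`. A sketch mirroring the `run.vllm.oci` target, with illustrative values:

```bash
docker run --rm -d --gpus all --shm-size 1g -p 8080:8080 \
  -e PORT=8080 -e API_SPEC=openai \
  -e MODEL=/opt/ds/model/deployed_model -e TENSOR_PARALLELISM=1 \
  -v ${PWD}/hfdata:/home/datascience \
  --name vllm-odsc ${REGION_KEY}.ocir.io/${TENANCY_NAME}/vllm-odsc:0.2.0
```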
