Commit 19ae8e7: Merge pull request #352 from dipatidar/main ("Adding Mistral AI deployment example")

2 parents 7f37b00 + 26969c1

35 files changed: +2337 −0 lines changed
Makefile (55 additions & 0 deletions)
```makefile
TENANCY:=${TENANCY_NAME}
CONTAINER_REGISTRY:=${REGION_KEY}.ocir.io

TGI_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/text-generation-interface-odsc:0.9.3
TGI_CONTAINER_NAME:=tgi-odsc

VLLM_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/vllm-odsc:0.2.0
VLLM_CONTAINER_NAME:=vllm-odsc

# Local Hugging Face data directory mounted into the container
MODEL_DIR:=${PWD}/hfdata
TARGET_DIR:=/home/datascience
HF_DIR=/home/datascience/.cache

# Hugging Face access token file and its mount path inside the container
token:=${PWD}/token
target_token:=/opt/ds/model/deployed_model/token
model:=meta-llama/Llama-2-7b-chat-hf
port:=8080
params:="--max-batch-prefill-tokens 1024"
local_model:=/opt/ds/model/deployed_model
tensor_parallelism:=1

# Fail fast if the registry-related environment variables are missing
check-env:
	@if [[ -z "$${TENANCY_NAME}" ]]; then \
		echo "TENANCY_NAME is not set or is empty"; \
		exit 1; \
	fi
	@if [[ -z "$${REGION_KEY}" ]]; then \
		echo "REGION_KEY is not set or is empty"; \
		exit 1; \
	fi
	@echo "Both TENANCY_NAME and REGION_KEY are set and have values."

build.tgi: check-env
	cd tgi/ && docker build --network host -t ${TGI_INFERENCE_IMAGE} -f Dockerfile.tgi .

build.vllm: check-env
	cd vllm/ && docker build --network host -t ${VLLM_INFERENCE_IMAGE} -f Dockerfile.vllm .

# *.hf targets download weights from Hugging Face (token required);
# *.oci targets expect weights at the Model Catalog mount path
run.tgi.hf:
	docker run --rm -it --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e TOKEN_FILE=${target_token} -e PARAMS=${params} -e MODEL=${model} -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}

run.tgi.oci:
	docker run --rm -it --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e PARAMS=${params} -e MODEL=${local_model} -v ${MODEL_DIR}:${TARGET_DIR} --name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}

run.vllm.hf:
	docker run --rm -it --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e UVICORN_NO_USE_COLORS=1 -e TOKEN_FILE=${target_token} -e MODEL=${model} -e TENSOR_PARALLELISM=${tensor_parallelism} -e HUGGINGFACE_HUB_CACHE=${HF_DIR} -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}

run.vllm.oci:
	docker run --rm -d --gpus all --shm-size 1g -p ${port}:${port} -e PORT=${port} -e UVICORN_NO_USE_COLORS=1 -e MODEL=${local_model} -e TENSOR_PARALLELISM=${tensor_parallelism} -v ${MODEL_DIR}:${TARGET_DIR} --name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}

# Drop into a shell inside the container for debugging
shell.tgi:
	docker run --rm -it --shm-size=1g --net host -p ${port}:${port} -e TOKEN_FILE=${target_token} -e PARAMS="--model-id ${model}" --entrypoint bash -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}

shell.vllm:
	docker run --rm -it --shm-size=15g --net host -p ${port}:${port} -e TOKEN_FILE=${target_token} -e PARAMS="--model ${model}" -e HUGGINGFACE_HUB_CACHE=${HF_DIR} --entrypoint bash -v ${MODEL_DIR}:${TARGET_DIR} -v ${token}:${target_token} --name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}

stop.tgi:
	docker stop ${TGI_CONTAINER_NAME}

stop.vllm:
	docker stop ${VLLM_CONTAINER_NAME}

push.tgi:
	docker push ${TGI_INFERENCE_IMAGE}

push.vllm:
	docker push ${VLLM_INFERENCE_IMAGE}
```
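Taken together, a typical local flow with these targets might look like the sketch below. It assumes the OCIR repositories described in the README that follows already exist, and that a Hugging Face token sits at `./token`:

```bash
export TENANCY_NAME=<your-tenancy-name>
export REGION_KEY=<region-key>   # e.g. IAD or FRA

make check-env     # verify both variables are set
make build.tgi     # build the TGI image
make push.tgi      # push to OCIR (requires a prior docker login, see README)
make run.tgi.hf    # run locally, downloading weights from Hugging Face
```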
README.md (140 additions & 0 deletions)
# Overview

This repo provides two approaches to managing the inference server for LLM deployments in OCI Data Science:

* [Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) from Hugging Face
* [vLLM](https://github.com/vllm-project/vllm), developed at UC Berkeley

## Prerequisites

* This is a Limited Availability (LA) feature. Please reach out to us by email at `[email protected]` to be allowlisted for it.
* Configure your [API Auth Token](https://docs.oracle.com/en-us/iaas/Content/Registry/Tasks/registrygettingauthtoken.htm) to be able to run and test your code locally.
* Install [Docker](https://docs.docker.com/get-docker) or [Rancher Desktop](https://rancherdesktop.io/) as a Docker alternative.

## Required IAM Policies

See the public [documentation](https://docs.oracle.com/en-us/iaas/data-science/using/policies.htm).

### Bring your own container [policies](https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__access-logging-service#model_dep_policies_auth__access-custom-container)
`ALL { resource.type = 'datasciencemodeldeployment' }`

`allow dynamic-group <dynamic-group-name> to read repos in compartment <compartment-name> where ANY {request.operation='ReadDockerRepositoryMetadata',request.operation='ReadDockerRepositoryManifest',request.operation='PullDockerLayer' }`

#### If the repository is in the root compartment, allow read for the tenancy

`allow dynamic-group <dynamic-group-name> to read repos in tenancy where ANY { request.operation='ReadDockerRepositoryMetadata', request.operation='ReadDockerRepositoryManifest', request.operation='PullDockerLayer' }`

#### For user level policies

`allow any-user to read repos in tenancy where ALL { request.principal.type = 'datasciencemodeldeployment' }`

`allow any-user to read repos in compartment <compartment-name> where ALL { request.principal.type = 'datasciencemodeldeployment' }`

For all other Data Science policies, please refer to these [details](https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/distributed_training/README.md#3-oci-policies).
## Build TGI Container

To build the containers required for this deployment, complete the following steps:

* Check out this repository
* Enter the path `model-deployment/containers/llm/inference-images`

```bash
cd model-deployment/containers/llm/inference-images
```

* This example uses the [OCI Container Registry](https://docs.oracle.com/en-us/iaas/Content/Registry/Concepts/registryoverview.htm) (OCIR) to store the container image required for the deployment. For the `Makefile` to execute the container build and push process to OCIR, you have to set the `TENANCY_NAME` and `REGION_KEY` environment variables in your local terminal. `TENANCY_NAME` is the name of your tenancy, which you can find under your [account settings](https://cloud.oracle.com/tenancy). `REGION_KEY` is the three-letter key of the tenancy region you intend to use for this example, for example IAD for Ashburn or FRA for Frankfurt. You can find the region keys in the public documentation for [Regions and Availability Domains](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm).

```bash
export TENANCY_NAME=<your-tenancy-name>
export REGION_KEY=<region-key>
```

You can find the official documentation about [OCI Data Science Model Deployment](https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_create.htm).

* Build the TGI container image; this step takes a while

```bash
make build.tgi
```

* Before you can push the newly built container, make sure that you've created the `text-generation-interface-odsc` repository in your tenancy:
  * Go to your tenancy [Container Registry](https://cloud.oracle.com/compute/registry/containers)
  * Click on the `Create repository` button
  * Select `Private` under Access types
  * Set `text-generation-interface-odsc` as the `Repository name`
  * Click on the `Create` button

* You may need to `docker login` to the Oracle Cloud Container Registry (OCIR) first, if you haven't done so before, to be able to push the image. To log in, use the [API Auth Token](https://docs.oracle.com/en-us/iaas/Content/Registry/Tasks/registrygettingauthtoken.htm) that can be created under `Oracle Cloud Account -> Auth Token`. You need to log in only once.

```bash
docker login -u '<tenancy-namespace>/<username>' <region>.ocir.io
```

If your tenancy is **federated** with Oracle Identity Cloud Service, use the format `<tenancy-namespace>/oracleidentitycloudservice/<username>`.
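For example, a federated user in a hypothetical tenancy with namespace `mytenancy`, pushing to Frankfurt, would log in like this (illustrative values only):

```bash
docker login -u 'mytenancy/oracleidentitycloudservice/jane.doe@example.com' fra.ocir.io
```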
* Push the container image to OCIR

```bash
make push.tgi
```

## Build vLLM Container

You can find the official documentation about [OCI Data Science Model Deployment](https://docs.oracle.com/en-us/iaas/data-science/using/model_dep_create.htm).

* Build the vLLM container image; this step takes a while

```bash
make build.vllm
```

* Before you can push the newly built container, make sure that you've created the `vllm-odsc` repository in your tenancy:
  * Go to your tenancy [Container Registry](https://cloud.oracle.com/compute/registry/containers)
  * Click on the `Create repository` button
  * Select `Private` under Access types
  * Set `vllm-odsc` as the `Repository name`
  * Click on the `Create` button

* You may need to `docker login` to the Oracle Cloud Container Registry (OCIR) first, if you haven't done so before, to be able to push the image. To log in, use the [API Auth Token](https://docs.oracle.com/en-us/iaas/Content/Registry/Tasks/registrygettingauthtoken.htm) that can be created under `Oracle Cloud Account -> Auth Token`. You need to log in only once.

```bash
docker login -u '<tenancy-namespace>/<username>' <region>.ocir.io
```

If your tenancy is **federated** with Oracle Identity Cloud Service, use the format `<tenancy-namespace>/oracleidentitycloudservice/<username>`.

* Push the container image to OCIR

```bash
make push.vllm
```

### Advanced debugging options: code debugging inside the container using a job

For a more detailed level of debugging, refer to [README-DEBUG.md](./README-DEBUG.md).
## Additional Make Commands

### TGI containers

`make build.tgi` to build the container

`make run.tgi.hf` or `make run.tgi.oci` to run the container (weights downloaded from Hugging Face or taken from the local Model Catalog path, respectively)

`make shell.tgi` to launch the container with a shell prompt

`make stop.tgi` to stop the running container
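Once a TGI container is running locally, you can sanity-check it against TGI's standard `/generate` endpoint; a minimal sketch, assuming the default port 8080 from the Makefile:

```bash
curl http://127.0.0.1:8080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 64}}'
```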
### vLLM containers

`make build.vllm` to build the container

`make run.vllm.hf` or `make run.vllm.oci` to run the container (weights downloaded from Hugging Face or taken from the local Model Catalog path, respectively)

`make shell.vllm` to launch the container with a shell prompt

`make stop.vllm` to stop the running container
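If the vLLM container was started with `API_SPEC=openai` (see `start-vllm.sh` later in this commit), it runs vLLM's OpenAI-compatible server, which you can probe with a completion request. A hedged sketch: without an explicit `--port`, that server listens on vLLM's default port 8000 unless the bundled nginx config proxies it to `$PORT`:

```bash
curl http://127.0.0.1:8000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-2-7b-chat-hf", "prompt": "What is Deep Learning?", "max_tokens": 64}'
```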
tgi/Dockerfile.tgi (28 additions & 0 deletions)
```dockerfile
FROM ghcr.io/huggingface/text-generation-inference:0.9.3
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get -y install tzdata && apt-get install -y curl
RUN pip install flask

WORKDIR /home/datascience
RUN ln -s /home/datascience /data

COPY start-tgi.sh /etc/
RUN chmod a+x /etc/start-tgi.sh

ENV PORT 8080
ENV MODEL /opt/ds/model/deployed_model

# llama2-7b-hf with A10 shape
ENV PARAMS "--max-batch-prefill-tokens 1024"

# llama2-13b-hf with A10 shape
# ENV PARAMS "--max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096"

# llama2-70b-hf with A100 shape
# ENV PARAMS "--max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096 --num-shard <NUMBER_OF_ALLOWED_GPUS or WORLD_SIZE>"

EXPOSE ${PORT}

ENTRYPOINT [ "/bin/bash", "--login", "-c" ]
CMD ["/etc/start-tgi.sh"]
```
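The commented-out `PARAMS` variants above can be supplied at run time instead of rebuilding the image, for example through the Makefile's `params` and `model` variables. A sketch using the 13B settings from the comments (the outer quoting mirrors the Makefile's default `params` value):

```bash
make run.tgi.hf \
  model=meta-llama/Llama-2-13b-chat-hf \
  params='"--max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096"'
```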
tgi/start-tgi.sh (19 additions & 0 deletions)
```bash
#!/bin/bash
if [ -z "$TOKEN_FILE" ] ; then
    echo "No authentication token is provided. Weights are assumed to be downloaded from OCI Model Catalog."
else
    export HUGGING_FACE_HUB_TOKEN=$(cat $TOKEN_FILE)
    echo "Checking internet connection: "
    curl -sI -v https://www.wikipedia.org
    echo "Downloading weights:"
    text-generation-server download-weights $MODEL
    echo "Download weights complete"
    echo $(du -sh /home/datascience/*)
fi

echo "Starting TGI..."
text-generation-launcher --json-output --hostname 0.0.0.0 --port $PORT --model-id $MODEL $PARAMS
exit_code=$?  # exit status of text-generation-launcher

echo "Exiting TGI. Here is the disk utilization of /home/datascience - "
echo $(du -sh /home/datascience)
exit $exit_code
```
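The `TOKEN_FILE` branch above expects a plain-text Hugging Face access token mounted into the container; the Makefile's `run.tgi.hf` target takes it from `./token`. A minimal sketch of preparing it (the token value is a placeholder):

```bash
# Write your Hugging Face token (no trailing newline) and restrict permissions
printf '%s' 'hf_xxxxxxxxxxxxxxxxxxxx' > token
chmod 600 token

# The Makefile mounts ./token to /opt/ds/model/deployed_model/token in the container
make run.tgi.hf
```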
vllm/Dockerfile.vllm (36 additions & 0 deletions)
```dockerfile
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get -y install tzdata && apt-get install -y curl && apt-get install -y git
RUN curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh
RUN bash ./miniconda.sh -b -p /miniconda; rm ./miniconda.sh;
ENV PATH="/miniconda/bin:$PATH"
RUN mkdir -p /opt/vllm

ARG INSTALL_DIR=/opt/vllm
COPY vllm-env.yaml /opt/vllm/environment.yaml
RUN conda env create --name vllm -f ${INSTALL_DIR}/environment.yaml

ENV TMPDIR=/home/datascience
WORKDIR /home/datascience

COPY start-vllm.sh ${INSTALL_DIR}/start.sh
RUN chmod a+x ${INSTALL_DIR}/start.sh
COPY vllm-log-config.yaml ${INSTALL_DIR}/vllm-log-config.yaml
ENV UVICORN_LOG_CONFIG=${INSTALL_DIR}/vllm-log-config.yaml

# Default location where downloaded models are mapped on the model container. No need to override if using the Model Catalog.
ENV MODEL /opt/ds/model/deployed_model

# Tensor parallelism required by the model
ENV TENSOR_PARALLELISM 1

# Custom port for the model container. No need to override.
ENV PORT 8080
EXPOSE ${PORT}
ENV VLLM_DIR=${INSTALL_DIR}
COPY vllm-api-server.py ${VLLM_DIR}/vllm-api-server.py

ENTRYPOINT [ "/bin/bash", "--login", "-c" ]
CMD ["$VLLM_DIR/start.sh"]
```
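`TENSOR_PARALLELISM` should match the number of GPUs the model needs to shard across; it can be overridden without rebuilding, for example through the Makefile variable when running locally on a multi-GPU shape (illustrative values):

```bash
# Shard the model across two GPUs instead of the default one
make run.vllm.hf tensor_parallelism=2 model=meta-llama/Llama-2-13b-chat-hf
```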
vllm/start-vllm.sh (37 additions & 0 deletions)
```bash
#!/bin/bash

if [ -z "$TOKEN_FILE" ] ; then
    echo "No authentication token is provided. Weights are assumed to be downloaded from OCI Model Catalog."
else
    export HUGGING_FACE_HUB_TOKEN=$(cat $TOKEN_FILE)
    echo "The md5 of token is $(md5sum $TOKEN_FILE)"
    mkdir -p /home/datascience/.cache/huggingface
    cp $TOKEN_FILE /home/datascience/.cache/huggingface/token
    echo "Copied token file to /home/datascience/.cache/huggingface, $(md5sum /home/datascience/.cache/huggingface/token)"
    echo "Set HuggingFace cache folder..."
    export HUGGINGFACE_HUB_CACHE=/home/datascience/.cache
    echo "The size of partitions"
    echo $(df -h /home/datascience)
    df -h
    echo "Checking internet connection: "
    curl -sI -v https://www.wikipedia.org
    echo $(du -sh /home/datascience/*)
fi

if [ "$API_SPEC" == "openai" ]
then
    echo "starting vllm engine with openai spec"
    nginx -p $PWD
    source activate vllm && \
    WEB_CONCURRENCY=1 python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --model ${MODEL} --tensor-parallel-size ${TENSOR_PARALLELISM} ${PARAMS}
else
    echo "starting vllm engine with default api spec"
    source activate vllm && \
    WEB_CONCURRENCY=1 python $VLLM_DIR/vllm-api-server.py --host 0.0.0.0 --log-config $VLLM_DIR/vllm-log-config.yaml --model ${MODEL} --tensor-parallel-size ${TENSOR_PARALLELISM} ${PARAMS}
fi

echo "Exiting vLLM. Here is the disk utilization of /home/datascience - "
echo $(du -sh /home/datascience)
echo "server logs: "
ls -lah /home/datascience
cat /home/datascience/server.log
```
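The `API_SPEC` switch above is not set by any Makefile target; to exercise the OpenAI-compatible branch locally you can pass it directly to `docker run`. A sketch mirroring the `run.vllm.oci` target, with illustrative values:

```bash
docker run --rm -d --gpus all --shm-size 1g -p 8080:8080 \
  -e PORT=8080 -e API_SPEC=openai \
  -e MODEL=/opt/ds/model/deployed_model -e TENSOR_PARALLELISM=1 \
  -v ${PWD}/hfdata:/home/datascience \
  --name vllm-odsc ${REGION_KEY}.ocir.io/${TENANCY_NAME}/vllm-odsc:0.2.0
```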
