diff --git a/dockerfiles/pytorch/Dockerfile.inf2 b/dockerfiles/pytorch/Dockerfile.inf2
index d84c43ce..426e17cf 100644
--- a/dockerfiles/pytorch/Dockerfile.inf2
+++ b/dockerfiles/pytorch/Dockerfile.inf2
@@ -1,34 +1,23 @@
 # Build based on https://github.com/aws/deep-learning-containers/blob/master/huggingface/pytorch/inference/docker/2.1/py3/sdk2.18.0/Dockerfile.neuronx
-FROM ubuntu:20.04 as base
+FROM ubuntu:22.04 AS base
 LABEL maintainer="Hugging Face"
 
-ARG PYTHON=python3.10
-ARG PYTHON_VERSION=3.10.12
-ARG MAMBA_VERSION=23.1.0-4
-
-# Neuron SDK components version numbers
-# ARG NEURONX_FRAMEWORK_VERSION=2.1.2.2.1.0
-# ARG NEURONX_DISTRIBUTED_VERSION=0.7.0
-# ARG NEURONX_CC_VERSION=2.13.66.0
-ARG NEURONX_TRANSFORMERS_VERSION=0.12.313
 ARG NEURONX_COLLECTIVES_LIB_VERSION=2.22.33.0-d2128d1aa
 ARG NEURONX_RUNTIME_LIB_VERSION=2.22.19.0-5856c0b42
 ARG NEURONX_TOOLS_VERSION=2.19.0.0
-
 # HF ARGS
 ARG OPTIMUM_NEURON_VERSION=0.0.28
 
 # See http://bugs.python.org/issue19846
-ENV LANG C.UTF-8
-ENV LD_LIBRARY_PATH /opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH
-ENV PATH /opt/conda/bin:/opt/aws/neuron/bin:$PATH
+ENV LANG=C.UTF-8
+ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH
+ENV PATH=/opt/aws/neuron/bin:$PATH
 
 RUN apt-get update \
  && apt-get upgrade -y \
  && apt-get install -y --no-install-recommends software-properties-common \
- && add-apt-repository ppa:openjdk-r/ppa \
  && apt-get update \
  && apt-get install -y --no-install-recommends \
     build-essential \
@@ -36,25 +25,14 @@ RUN apt-get update \
     ca-certificates \
     cmake \
     curl \
-    emacs \
     git \
     jq \
-    libgl1-mesa-glx \
-    libsm6 \
-    libxext6 \
-    libxrender-dev \
-    openjdk-11-jdk \
-    vim \
     wget \
     unzip \
     zlib1g-dev \
-    libcap-dev \
-    gpg-agent \
-    && rm -rf /var/lib/apt/lists/* \
-    && rm -rf /tmp/tmp* \
-    && apt-get clean
+    gpg-agent
 
-RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
+RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
 RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
 
 # Install Neuronx tools
@@ -62,55 +40,30 @@ RUN apt-get update \
  && apt-get install -y \
     aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
     aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
-    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
-    && rm -rf /var/lib/apt/lists/* \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION
+
+RUN apt-get install -y \
+    python3 \
+    python3-pip \
+    python-is-python3
+
+RUN rm -rf /var/lib/apt/lists/* \
  && rm -rf /tmp/tmp* \
  && apt-get clean
 
-# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files
-RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \
-    mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \
-    /var/lib/dpkg/info/ca-certificates-java.postinst configure;
-
-RUN curl -L -o ~/mambaforge.sh https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-x86_64.sh \
-    && chmod +x ~/mambaforge.sh \
-    && ~/mambaforge.sh -b -p /opt/conda \
-    && rm ~/mambaforge.sh \
-    && /opt/conda/bin/conda update -y conda \
-    && /opt/conda/bin/conda install -c conda-forge -y \
-    python=$PYTHON_VERSION \
-    pyopenssl \
-    cython \
-    mkl-include \
-    mkl \
-    botocore \
-    parso \
-    scipy \
-    typing \
-    # Below 2 are included in miniconda base, but not mamba so need to install
-    conda-content-trust \
-    charset-normalizer \
-    && /opt/conda/bin/conda update -y conda \
-    && /opt/conda/bin/conda clean -ya
-
-RUN conda install -c conda-forge \
-    scikit-learn \
-    h5py \
-    requests \
-    && conda clean -ya \
-    && pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
-    && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
-    && pip install --no-cache-dir "protobuf>=3.18.3,<4" setuptools==69.5.1 packaging
-
+RUN pip install --no-cache-dir "protobuf>=3.18.3,<4" setuptools==69.5.1 packaging
+
 WORKDIR /
 
 # install Hugging Face libraries and its dependencies
-RUN pip install --extra-index-url https://pip.repos.neuron.amazonaws.com --no-cache-dir optimum-neuron[neuronx]==${OPTIMUM_NEURON_VERSION} \
+RUN pip install --extra-index-url https://pip.repos.neuron.amazonaws.com --no-cache-dir optimum-neuron[neuronx]==${OPTIMUM_NEURON_VERSION} \
  && pip install --no-deps --no-cache-dir -U torchvision==0.16.*
+# FIXME
+RUN pip install --extra-index-url https://pip.repos.neuron.amazonaws.com git+https://github.com/huggingface/optimum-neuron.git@5237fb0ada643ba471f60ed3a5d2eef3b66e8e59
 
 COPY . .
-# install wheel and setuptools
+
 RUN pip install --no-cache-dir -U pip ".[st]"
 
 # copy application
@@ -119,5 +72,7 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
 
 # copy entrypoint and change permissions
 COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh
+COPY --chmod=0755 scripts/inf2_env.py inf2_env.py
+COPY --chmod=0755 scripts/inf2_entrypoint.sh inf2_entrypoint.sh
 
-ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
+ENTRYPOINT ["bash", "-c", "./inf2_entrypoint.sh"]
diff --git a/scripts/inf2_entrypoint.sh b/scripts/inf2_entrypoint.sh
new file mode 100644
index 00000000..650633c6
--- /dev/null
+++ b/scripts/inf2_entrypoint.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -e -o pipefail -u
+
+export ENV_FILEPATH=$(mktemp)
+
+trap "rm -f ${ENV_FILEPATH}" EXIT
+
+touch $ENV_FILEPATH
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+${SCRIPT_DIR}/inf2_env.py $@
+
+source $ENV_FILEPATH
+
+rm -f $ENV_FILEPATH
+
+exec ${SCRIPT_DIR}/entrypoint.sh $@
\ No newline at end of file
diff --git a/scripts/inf2_env.py b/scripts/inf2_env.py
new file mode 100644
index 00000000..da69b5a2
--- /dev/null
+++ b/scripts/inf2_env.py
@@ -0,0 +1,235 @@
#!/usr/bin/env python

"""
This script is here to specify all missing environment variables that would be required to run some encoder models on
inferentia2.

It inspects the target model's neuron config (either embedded in the model config, or looked up in the Hugging Face
neuron cache) and writes the matching `export VAR=value` statements into the file named by $ENV_FILEPATH, so the
calling shell wrapper can source them before starting the real entrypoint.
"""

import argparse
import logging
import os
import sys
from typing import Any, Dict, List, Optional

from huggingface_hub import constants
from transformers import AutoConfig

from optimum.neuron.utils import get_hub_cached_entries
from optimum.neuron.utils.version_utils import get_neuronxcc_version

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', force=True)
logger = logging.getLogger(__name__)

# Pairs of (environment variable, neuron config key) that must agree with each other.
env_config_peering = [
    ("HF_BATCH_SIZE", "static_batch_size"),
    ("HF_OPTIMUM_SEQUENCE_LENGTH", "static_sequence_length"),
]

# By the end of this script all env vars should be specified properly
env_vars = [env_var for env_var, _ in env_config_peering]

# Currently not used for encoder models
# available_cores = get_available_cores()

neuronxcc_version = get_neuronxcc_version()


def parse_cmdline_and_set_env(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the TGI-style cmdline params intercepted here and mirror them into env vars.

    Unknown arguments are ignored (they belong to the wrapped entrypoint).

    Raises:
        Exception: if no model id is provided via cmdline or environment.
    """
    parser = argparse.ArgumentParser()
    if not argv:
        # Skip argv[0] (the program name): parse_known_args expects arguments only.
        argv = sys.argv[1:]
    # All these are params passed to tgi and intercepted here
    parser.add_argument(
        "--batch-size",
        type=int,
        default=os.getenv("HF_BATCH_SIZE", os.getenv("BATCH_SIZE", 0)),
    )
    parser.add_argument(
        "--sequence-length",
        type=int,
        default=os.getenv("HF_OPTIMUM_SEQUENCE_LENGTH", os.getenv("SEQUENCE_LENGTH", 0)),
    )
    parser.add_argument("--model-id", type=str, default=os.getenv("HF_MODEL_ID", os.getenv("HF_MODEL_DIR")))
    parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))

    args = parser.parse_known_args(argv)[0]

    if not args.model_id:
        raise Exception(
            "No model id provided ! Either specify it using the --model-id cmdline argument or the HF_MODEL_ID env var"
        )

    # Override env with cmdline params
    os.environ["MODEL_ID"] = args.model_id

    # Set all tgi router and tgi server values to consistent values as early as possible
    # from the order of the parser defaults, the tgi router value can override the tgi server ones
    if args.batch_size > 0:
        os.environ["HF_BATCH_SIZE"] = str(args.batch_size)

    if args.sequence_length > 0:
        os.environ["HF_OPTIMUM_SEQUENCE_LENGTH"] = str(args.sequence_length)

    if args.revision:
        os.environ["REVISION"] = str(args.revision)

    return args


def neuron_config_to_env(neuron_config):
    """Write `export VAR=value` lines for every peered env var into $ENV_FILEPATH."""
    with open(os.environ["ENV_FILEPATH"], "w") as f:
        for env_var, config_key in env_config_peering:
            f.write("export {}={}\n".format(env_var, neuron_config[config_key]))


def sort_neuron_configs(dictionary):
    """Sort key: prefer entries with the larger static batch size (descending order)."""
    return -dictionary["static_batch_size"]


def lookup_compatible_cached_model(
    model_id: str, revision: Optional[str]
) -> Optional[Dict[str, Any]]:
    """Return the best cached neuron config compatible with the local setup, or None.

    Reuses the same mechanic as the one in use to configure the tgi server part.
    The only difference here is that we stay as flexible as possible on the
    compatibility part.
    """
    entries = get_hub_cached_entries(model_id, "inference")

    logger.debug(
        "Found %d cached entries for model %s, revision %s",
        len(entries),
        model_id,
        revision,
    )

    all_compatible = [
        entry
        for entry in entries
        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True)
    ]

    if not all_compatible:
        logger.debug(
            "No compatible cached entry found for model %s, env %s, neuronxcc version %s",
            model_id,
            get_env_dict(),
            neuronxcc_version,
        )
        return None

    logger.info("%d compatible neuron cached models found", len(all_compatible))

    # Pick the entry with the largest static batch size.
    all_compatible = sorted(all_compatible, key=sort_neuron_configs)

    entry = all_compatible[0]

    logger.info("Selected entry %s", entry)

    return entry


def check_env_and_neuron_config_compatibility(
    neuron_config: Dict[str, Any], check_compiler_version: bool
) -> bool:
    """Check that the neuron config agrees with the local compiler and the current env vars.

    An env var that is unset counts as compatible (it will be derived from the config).
    """
    logger.debug(
        "Checking the provided neuron config %s is compatible with the local setup and provided environment",
        neuron_config,
    )

    # Local setup compat checks
    # if neuron_config["num_cores"] > available_cores:
    #     logger.debug(
    #         "Not enough neuron cores available to run the provided neuron config"
    #     )
    #     return False

    if (
        check_compiler_version
        and neuron_config["compiler_version"] != neuronxcc_version
    ):
        logger.debug(
            "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
            neuronxcc_version,
            neuron_config["compiler_version"],
        )
        return False

    for env_var, config_key in env_config_peering:
        try:
            neuron_config_value = str(neuron_config[config_key])
        except KeyError:
            logger.debug("No key %s found in neuron config %s", config_key, neuron_config)
            return False
        # Unset env vars default to the config value, i.e. they are always compatible.
        env_value = os.getenv(env_var, str(neuron_config_value))
        if env_value != neuron_config_value:
            logger.debug(
                "The provided env var '%s' and the neuron config '%s' param differ (%s != %s)",
                env_var,
                config_key,
                env_value,
                neuron_config_value,
            )
            return False

    return True


def get_env_dict() -> Dict[str, str]:
    """Snapshot of the peered env vars (values may be None when unset)."""
    return {k: os.getenv(k) for k in env_vars}


def main():
    """
    This script determines proper default TGI env variables for the neuron precompiled models to
    work properly
    :return:
    """
    args = parse_cmdline_and_set_env()

    for env_var in env_vars:
        if not os.getenv(env_var):
            break
    else:
        logger.info(
            "All env vars %s already set, skipping, user knows what they are doing",
            env_vars,
        )
        sys.exit(0)

    cache_dir = constants.HF_HUB_CACHE

    logger.info("Cache dir %s, model %s", cache_dir, args.model_id)

    config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
    neuron_config = getattr(config, "neuron", None)
    if neuron_config is not None:
        # The model embeds its own neuron config: it must match the current env.
        compatible = check_env_and_neuron_config_compatibility(
            neuron_config, check_compiler_version=False
        )
        if not compatible:
            env_dict = get_env_dict()
            msg = (
                "Invalid neuron config and env. Config {}, env {}, neuronxcc version {}"
            ).format(neuron_config, env_dict, neuronxcc_version)
            logger.error(msg)
            raise Exception(msg)
    else:
        neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)

    if not neuron_config:
        neuron_config = {"static_batch_size": 1, "static_sequence_length": 128}
        # Third placeholder shows the chosen default config in the log message.
        msg = (
            "No compatible neuron config found. Provided env {}, neuronxcc version {}. Falling back to default {}"
        ).format(get_env_dict(), neuronxcc_version, neuron_config)
        logger.info(msg)

    logger.info("Final neuron config %s", neuron_config)

    neuron_config_to_env(neuron_config)


if __name__ == "__main__":
    main()