diff --git a/containers/pytorch/training/README.md b/containers/pytorch/training/README.md index 1fd7e915..20c13904 100644 --- a/containers/pytorch/training/README.md +++ b/containers/pytorch/training/README.md @@ -29,7 +29,7 @@ The PyTorch Training containers will start a training job that will start on `do docker run --gpus all -ti \ -v $(pwd)/artifact:/artifact \ -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \ - us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.2-3.transformers.4-42.ubuntu2204.py310 \ + us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.2-3.transformers.4-48.ubuntu2204.py311 \ trl sft \ --model_name_or_path google/gemma-2b \ --attn_implementation "flash_attention_2" \ @@ -76,7 +76,7 @@ The PyTorch Training containers come with two different containers depending on - **GPU**: To build the PyTorch Training container for GPU, an instance with at least one NVIDIA GPU available is required to install `flash-attn` (used to speed up the attention layers during training and inference). ```bash - docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.2-3.transformers.4-42.ubuntu2204.py310 -f containers/pytorch/training/gpu/2.3.0/transformers/4.42.3/py310/Dockerfile . + docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.2-3.transformers.4-48.ubuntu2204.py311 -f containers/pytorch/training/gpu/2.3.1/transformers/4.48.0/py311/Dockerfile .
``` - **TPU**: You can build PyTorch Training container for Google Cloud TPUs on any machine with docker build, you do not need to build it on a TPU VM
diff --git a/containers/pytorch/training/gpu/2.3.1/transformers/4.48.0/py311/Dockerfile b/containers/pytorch/training/gpu/2.3.1/transformers/4.48.0/py311/Dockerfile
new file mode 100644
index 00000000..333aea82
--- /dev/null
+++ b/containers/pytorch/training/gpu/2.3.1/transformers/4.48.0/py311/Dockerfile
@@ -0,0 +1,113 @@
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
+# Use bash with pipefail so a failure on the left-hand side of a pipe
+# (e.g. `curl ... | apt-key ...`) fails the RUN step instead of being masked.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+LABEL maintainer="Hugging Face"
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Versions (build-time pins; each can be overridden with --build-arg)
+ARG CUDA="cu121"
+ARG PYTORCH="2.3.1"
+ARG FLASH_ATTN="2.6.3"
+ARG TRANSFORMERS="4.48.0"
+ARG HUGGINGFACE_HUB="0.27.0"
+ARG DIFFUSERS="0.32.1"
+ARG PEFT="0.14.0"
+ARG TRL="0.13.0"
+ARG BITSANDBYTES="0.45.0"
+ARG DATASETS="3.2.0"
+ARG ACCELERATE="1.2.1"
+ARG EVALUATE="0.4.3"
+ARG SENTENCE_TRANSFORMERS="3.3.1"
+ARG DEEPSPEED="0.16.1"
+ARG MAX_JOBS=4
+
+# System packages: adds the deadsnakes PPA, then installs the build toolchain,
+# Python 3.11 and audio/video libraries. `apt-get` accepts one command per
+# invocation, so `autoremove` and `clean` are issued separately.
+RUN apt-get update -y && \
+    apt-get install software-properties-common -y && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get -y upgrade --only-upgrade systemd openssl cryptsetup && \
+    apt-get install -y \
+    build-essential \
+    bzip2 \
+    curl \
+    git \
+    git-lfs \
+    tar \
+    gcc \
+    g++ \
+    cmake \
+    gnupg \
+    libprotobuf-dev \
+    libaio-dev \
+    protobuf-compiler \
+    python3.11 \
+    python3.11-dev \
+    libsndfile1-dev \
+    ffmpeg && \
+    apt-get autoremove --yes && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set Python 3.11 as the default python version
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
+    ln -sf /usr/bin/python3.11 /usr/bin/python
+
+# Install pip from source and upgrade it
+# (`curl -f` makes an HTTP error fail the step instead of saving an error page)
+RUN curl -fsSLO https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py && \
+    pip install --no-cache-dir --upgrade pip
+
+# Install latest release PyTorch (PyTorch must be installed before any DeepSpeed C++/CUDA ops.)
+RUN pip install --no-cache-dir --index-url https://download.pytorch.org/whl/${CUDA} "torch==${PYTORCH}" torchvision torchaudio
+
+# Install and upgrade Flash Attention 2
+# (--no-build-isolation: build against the packages already installed in the image)
+RUN pip install --no-cache-dir packaging ninja
+RUN MAX_JOBS=${MAX_JOBS} pip install --no-cache-dir --no-build-isolation flash-attn==${FLASH_ATTN}
+
+# Install Hugging Face Libraries
+RUN pip install --no-cache-dir \
+    "transformers[sklearn,sentencepiece,vision]==${TRANSFORMERS}" \
+    "huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB}" \
+    "diffusers==${DIFFUSERS}" \
+    "datasets==${DATASETS}" \
+    "accelerate==${ACCELERATE}" \
+    "evaluate==${EVALUATE}" \
+    "peft==${PEFT}" \
+    "trl==${TRL}" \
+    "sentence-transformers==${SENTENCE_TRANSFORMERS}" \
+    "deepspeed==${DEEPSPEED}" \
+    "bitsandbytes==${BITSANDBYTES}" \
+    tensorboard \
+    jupyter notebook
+
+# The hf_transfer extra is installed above; enable it for Hub transfers
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+
+# Install Google Cloud Dependencies
+RUN pip install --upgrade --no-cache-dir \
+    google-cloud-storage \
+    google-cloud-bigquery \
+    google-cloud-aiplatform \
+    google-cloud-pubsub \
+    google-cloud-logging
+
+# Install Google CLI single command
+# NOTE(review): the final `rm -rf` also deletes /var/lib/dpkg, so further
+# `apt-get install` in derived images will presumably not work; confirm intended.
+RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
+    | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
+    curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg \
+    | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
+    touch /var/lib/dpkg/status && \
+    apt-get update -y && \
+    apt-get install google-cloud-sdk -y && \
+    apt-get autoremove --yes && \
+    apt-get clean && \
+    rm -rf /var/lib/{apt,dpkg,cache,log}