diff --git a/.ci/docker/Dockerfile b/.ci/docker/Dockerfile
new file mode 100644
index 00000000000..8aefbfe8f47
--- /dev/null
+++ b/.ci/docker/Dockerfile
@@ -0,0 +1,30 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+# Suppress interactive debconf prompts during package installation.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies (so that this step can be cached separately)
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Setup user
+# TODO: figure out how to remove this part
+COPY ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install the documentation build requirements
+COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
+RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
+
+# Install the Python packages listed in requirements.txt
+COPY ./common/install_pip_requirements.sh install_pip_requirements.sh
+COPY ./requirements.txt requirements.txt
+RUN bash ./install_pip_requirements.sh && rm install_pip_requirements.sh
+
+# Make python3 reachable under the plain `python` name
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# Run as ci-user (created by install_user.sh) from here on
+USER ci-user
+CMD ["bash"]
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
new file mode 100755
index 00000000000..42fb88997dd
--- /dev/null
+++ b/.ci/docker/build.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+# The first argument is the image name (only used in the log line below);
+# all remaining arguments are forwarded verbatim to `docker build`.
+IMAGE_NAME="$1"
+shift
+
+export UBUNTU_VERSION="22.04"
+export CUDA_VERSION="12.4.1"
+
+export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
+echo "Building ${IMAGE_NAME} Docker image"
+
+# NOTE(review): `-f Dockerfile` and the `.` build context are relative paths, so
+# this presumably has to run with .ci/docker as the working directory -- confirm.
+docker build \
+  --no-cache \
+  --progress=plain \
+  -f Dockerfile \
+  --build-arg BASE_IMAGE="${BASE_IMAGE}" \
+  "$@" \
+  .
diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh
new file mode 100644
index 00000000000..3100b550a89
--- /dev/null
+++ b/.ci/docker/common/install_base.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker
+
+set -ex
+
+# Installs the base apt packages for the image, then clears apt caches and temp dirs.
+install_ubuntu() {
+  # Install common dependencies
+  apt-get update
+  # TODO: Some of these may not be necessary
+  # The cmake version glob is quoted so the shell passes it to apt-get untouched.
+  apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    "cmake=3.22*" \
+    curl \
+    git \
+    wget \
+    sudo \
+    vim \
+    jq \
+    unzip \
+    gdb \
+    rsync \
+    libssl-dev \
+    p7zip-full \
+    libglfw3 \
+    libglfw3-dev \
+    sox \
+    libsox-dev \
+    libsox-fmt-all \
+    python3-pip \
+    python3-dev
+
+  # Cleanup package manager
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
diff --git a/.ci/docker/common/install_docs_reqs.sh b/.ci/docker/common/install_docs_reqs.sh
new file mode 100644
index 00000000000..541c9976ad1
--- /dev/null
+++ b/.ci/docker/common/install_docs_reqs.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker
+# pipefail: a failed curl download must not be masked by the command it is piped into.
+set -exo pipefail
+
+apt-get update
+apt-get install -y gpg-agent
+
+# Node.js 20.x via the NodeSource setup script (-f makes HTTP errors fail the download)
+curl --retry 3 -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -
+sudo apt-get install -y nodejs
+
+# Yarn from its own apt repository, then KaTeX installed globally through yarn
+curl --retry 3 -fsS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
+echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
+
+apt-get update
+apt-get install -y --no-install-recommends yarn
+yarn global add katex --prefix /usr/local
+
+sudo apt-get -y install doxygen
+
+apt-get autoclean && apt-get clean
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/.ci/docker/common/install_pip_requirements.sh b/.ci/docker/common/install_pip_requirements.sh
new file mode 100644
index 00000000000..a548d200462
--- /dev/null
+++ b/.ci/docker/common/install_pip_requirements.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -ex
+
+# Install pip packages
+# --no-cache-dir keeps pip's download cache out of the image layer.
+pip install --no-cache-dir --upgrade pip
+pip install --no-cache-dir -r ./requirements.txt
diff --git a/.ci/docker/common/install_user.sh b/.ci/docker/common/install_user.sh
new file mode 100644
index 00000000000..6deb62086bc
--- /dev/null
+++ b/.ci/docker/common/install_user.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +# Copied from https://github.com/pytorch/executorch/blob/6e431355a554e5f84c3a05dfa2b981ead90c2b48/.ci/docker/common/install_user.sh#L1 + +set -ex + +# Same as ec2-user +echo "ci-user:x:1000:1000::/var/lib/ci-user:" >> /etc/passwd +echo "ci-user:x:1000:" >> /etc/group +# Needed on Focal or newer +echo "ci-user:*:19110:0:99999:7:::" >> /etc/shadow + +# Create $HOME +mkdir -p /var/lib/ci-user +chown ci-user:ci-user /var/lib/ci-user + +# Allow sudo +echo 'ci-user ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/ci-user + +# Test that sudo works +sudo -u ci-user sudo -v diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt new file mode 100644 index 00000000000..56df738f96c --- /dev/null +++ b/.ci/docker/requirements.txt @@ -0,0 +1,73 @@ +# --extra-index-url https://download.pytorch.org/whl/cu117/index.html # Use this to run/publish tutorials against the latest binaries during the RC stage. Comment out after the release. Each release verify the correct cuda version. +# Refer to .jenkins/build.sh for tutorial build instructions + +sphinx==5.0.0 +sphinx-gallery==0.11.1 +sphinx_design +docutils==0.16 +sphinx-copybutton +sphinx_sitemap==2.6.0 +pypandoc==1.12 +pandocfilters +markdown +tqdm==4.66.1 +numpy==1.24.4 +matplotlib +librosa +torch==2.5 +torchvision +torchdata +networkx +PyHamcrest +bs4 +awscliv2==2.1.1 +flask +spacy==3.4.1 +ray[tune]==2.7.2 +tensorboard +jinja2==3.1.3 +pytorch-lightning +torchx +torchrl==0.6.0 +tensordict==0.6.0 +ax-platform>=0.4.0 +nbformat>=5.9.2 +datasets +transformers +torchmultimodal-nightly # needs to be updated to stable as soon as it's available +onnx +onnxscript +onnxruntime +evaluate +accelerate>=0.20.1 + +importlib-metadata==6.8.0 + +# PyTorch Theme +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme + +ipython + +sphinxcontrib.katex +# to run examples +boto3 +pandas +requests +scikit-image +scipy==1.11.1 +numba==0.57.1 +pillow==10.2.0 +wget +gym==0.26.2 +gym-super-mario-bros==7.4.0 +pyopengl 
+gymnasium[mujoco]==0.27.0 +timm +iopath +pygame==2.6.0 +pycocotools +semilearn==0.3.2 +torchao==0.5.0 +segment_anything==1.0 +torchrec==1.0.0; platform_system == "Linux" +fbgemm-gpu==1.0.0; platform_system == "Linux" diff --git a/.circleci/README.md b/.circleci/README.md deleted file mode 100644 index f45e8feebbc..00000000000 --- a/.circleci/README.md +++ /dev/null @@ -1 +0,0 @@ -Do not edit `config.yml` directly, make all the changes to `config.yml.in` and then run `regenerate.py` diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 70b2c7fd5b0..00000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,320 +0,0 @@ -# run python regenerate.py to generate config.yml from config.yml.in - -version: 2.1 - -executors: - windows-with-nvidia-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -install_official_git_client: &install_official_git_client - name: Install Official Git Client - no_output_timeout: "1h" - command: | - set -e - sudo apt-get -qq update - sudo apt-get -qq install openssh-client git - -# This system setup script is meant to run before the CI-related scripts, e.g., -# installing Git client, checking out code, setting up CI env, and -# building/testing. -setup_linux_system_environment: &setup_linux_system_environment - name: Set Up System Environment - no_output_timeout: "1h" - command: | - set -ex - - # Stop background apt updates. Hypothetically, the kill should not - # be necessary, because stop is supposed to send a kill signal to - # the process, but we've added it for good luck. Also - # hypothetically, it's supposed to be unnecessary to wait for - # the process to block. We also have that line for good luck. - # If you like, try deleting them and seeing if it works. 
- sudo systemctl stop apt-daily.service || true - sudo systemctl kill --kill-who=all apt-daily.service || true - - sudo systemctl stop unattended-upgrades.service || true - sudo systemctl kill --kill-who=all unattended-upgrades.service || true - - # wait until `apt-get update` has been killed - while systemctl is-active --quiet apt-daily.service - do - sleep 1; - done - while systemctl is-active --quiet unattended-upgrades.service - do - sleep 1; - done - - # See if we actually were successful - systemctl list-units --all | cat - - sudo apt-get purge -y unattended-upgrades - - cat /etc/apt/sources.list - - ps auxfww | grep [a]pt - ps auxfww | grep dpkg - -pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - steps: - - checkout - - run: - <<: *setup_linux_system_environment - - run: - name: Set Up CI Environment - no_output_timeout: "1h" - command: | - set -e - - sudo apt-get -y update - sudo apt-get -y install expect-dev moreutils - - sudo pip3 -q install awscli==1.16.35 - - if [ -n "${CUDA_VERSION}" ]; then - nvidia-smi - fi - - # This IAM user only allows read-write access to ECR - export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_ONLY} - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} - eval $(aws ecr get-login --region us-east-1 --no-include-email) - - run: - name: Build - no_output_timeout: "20h" - command: | - set -e - - # for some reason, pip installs it in a different place than what is looked at in the py file - sudo pip3 install requests --target=/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages - export pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) - echo "PyTorchDockerImageTag: "${pyTorchDockerImageTag} - - cat >/home/circleci/project/ci_build_script.sh \</dev/null - if [ -n "${CUDA_VERSION}" ]; then - export id=$(docker run --gpus all -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - else - export id=$(docker run -t -d -w 
/var/lib/jenkins ${DOCKER_IMAGE}) - fi - - echo "declare -x JOB_BASE_NAME=${CIRCLE_JOB}" > /home/circleci/project/env - echo "declare -x COMMIT_ID=${CIRCLE_SHA1}" >> /home/circleci/project/env - echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env - # DANGER! DO NOT REMOVE THE `set +x` SETTING HERE! - set +x - if [[ "$CIRCLE_BRANCH" == master || "$CIRCLE_BRANCH" == main ]]; then - if [ -z "${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_USERNAME}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_TOKEN}" ]; then exit 1; fi - - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_USERNAME=${GITHUB_PYTORCHBOT_USERNAME}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_TOKEN=${GITHUB_PYTORCHBOT_TOKEN}" >> /home/circleci/project/env - else - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - fi - set -x - - echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash - docker cp /home/circleci/project/. 
"$id:/var/lib/jenkins/workspace" - - export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts - # Copy docs with plot to a docs dir - if docker exec -it "$id" sh -c "test -d ./workspace/docs_with_plot/docs/"; then - mkdir /home/circleci/project/docs - docker cp "$id:/var/lib/jenkins/workspace/docs_with_plot/docs/." /home/circleci/project/docs - echo "Directory copied successfully" - else - echo "No docs_with_plot directory. Skipping..." - fi - - - store_artifacts: - path: ./docs - destination: tutorials - -pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - CUDA_VERSION: "9" - resource_class: gpu.nvidia.small - <<: *pytorch_tutorial_build_defaults - -pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - resource_class: medium - - - <<: *pytorch_tutorial_build_defaults - -pytorch_windows_build_worker: &pytorch_windows_build_worker - executor: windows-with-nvidia-gpu - steps: - - checkout - - run: - name: Install Cuda - no_output_timeout: 30m - command: | - .circleci/scripts/windows_cuda_install.sh - - run: - name: Generate cache key - # This will refresh cache on Sundays, build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - keys: - - data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - - run: - name: test - no_output_timeout: "1h" - command: | - .circleci/scripts/build_for_windows.sh - - save_cache: - key: data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - paths: - - advanced_source/data - - beginner_source/data - - intermediate_source/data - - prototype_source/data - -jobs: - pytorch_tutorial_pr_build_manager: - <<: *pytorch_tutorial_build_manager_defaults - pytorch_tutorial_pr_build_worker_0: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.small.multi - pytorch_tutorial_pr_build_worker_1: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.large - pytorch_tutorial_pr_build_worker_10: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_11: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_12: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_13: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_14: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_15: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_16: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_17: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_18: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_19: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_2: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_3: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_4: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_5: - <<: *pytorch_tutorial_build_worker_defaults - 
pytorch_tutorial_pr_build_worker_6: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_7: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_8: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_9: - <<: *pytorch_tutorial_build_worker_defaults - - pytorch_tutorial_trunk_build_manager: - <<: *pytorch_tutorial_build_manager_defaults - pytorch_tutorial_trunk_build_worker_0: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.small.multi - pytorch_tutorial_trunk_build_worker_1: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.large - pytorch_tutorial_trunk_build_worker_10: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_11: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_12: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_13: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_14: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_15: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_16: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_17: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_18: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_19: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_2: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_3: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_4: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_5: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_6: - <<: *pytorch_tutorial_build_worker_defaults - 
pytorch_tutorial_trunk_build_worker_7: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_8: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_9: - <<: *pytorch_tutorial_build_worker_defaults - - pytorch_tutorial_windows_pr_build_worker_0: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_pr_build_worker_1: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_pr_build_worker_2: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_pr_build_worker_3: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_0: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_1: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_2: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_3: - <<: *pytorch_windows_build_worker - -workflows: - build: - when: - and: # All must be true to trigger - - equal: [ branch1, << pipeline.git.branch >> ] - - equal: [ branch2, << pipeline.git.branch >> ] - jobs: - # Build jobs that only run on PR - - pytorch_tutorial_pr_build_worker_0: - filters: - branches: - ignore: - - master - - main diff --git a/.circleci/scripts/build_for_windows.sh b/.circleci/scripts/build_for_windows.sh deleted file mode 100644 index 2d773b7259d..00000000000 --- a/.circleci/scripts/build_for_windows.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -set -eux -o pipefail - -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) -} - -SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" -PROJECT_DIR="${SOURCE_DIR}/../.." 
-pushd $SOURCE_DIR - -#install wget and make -curl --retry 3 -k https://ymu.dl.osdn.jp/mingw/68260/mingw-get-0.6.3-mingw32-pre-20170905-1-bin.zip -o mingw32.zip -unzip mingw32.zip -d mingw32 -mingw32/bin/mingw-get.exe install mingw32-make -mingw32/bin/mingw-get.exe install msys-findutils -mv mingw32/bin/mingw32-make.exe mingw32/bin/make.exe -curl --retry 3 -k https://eternallybored.org/misc/wget/1.20.3/64/wget.exe -o mingw32/bin/wget.exe -export PATH="${SOURCE_DIR}/mingw32/bin:${SOURCE_DIR}/mingw32/msys/1.0/bin:$PATH" - -#install anaconda3 -export CONDA_HOME="${SOURCE_DIR}/conda" -export tmp_conda="${SOURCE_DIR}/conda" -export miniconda_exe="${SOURCE_DIR}/miniconda.exe" -rm -rf conda miniconda.exe -curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o miniconda.exe -./install_conda.bat -export PATH="${tmp_conda}:${tmp_conda}/Library/usr/bin:${tmp_conda}/Library/bin:${tmp_conda}/Scripts:${tmp_conda}/bin:$PATH" - -eval "$(conda shell.bash hook)" -conda create -qyn testenv python=3.7 -conda activate testenv - -REQUIREMENTS="$(grep -v '^ *#\|^torch\|^torchaudio\|^torchvision|^torchtext' $PROJECT_DIR/requirements.txt | grep .)" -echo -e "${REQUIREMENTS}" > requirements.txt -pip install -r requirements.txt -pip install pySoundFile -# Force uninstall torch & related packages, we'll install them using conda later. 
-pip uninstall -y torch torchvision torchtext -conda install -yq -c pytorch "cudatoolkit=10.2" pytorch torchvision torchtext torchaudio -python -m spacy download de_core_news_sm -python -m spacy download en_core_web_sm -pushd ${PROJECT_DIR} -DIR=.jenkins -export NUM_WORKERS=4 - -if [[ "${CIRCLE_JOB}" == *worker_* ]]; then - python $DIR/remove_runnable_code.py intermediate_source/model_parallel_tutorial.py intermediate_source/model_parallel_tutorial.py || true - python $DIR/remove_runnable_code.py advanced_source/static_quantization_tutorial.py advanced_source/static_quantization_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/hyperparameter_tuning_tutorial.py beginner_source/hyperparameter_tuning_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/audio_io_tutorial.py beginner_source/audio_io_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/audio_resampling_tutorial.py beginner_source/audio_resampling_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/audio_data_augmentation_tutorial.py beginner_source/audio_data_augmentation_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/audio_feature_extractions_tutorial.py beginner_source/audio_feature_extractions_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/audio_feature_augmentation_tutorial.py beginner_source/audio_feature_augmentation_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/audio_datasets_tutorial.py beginner_source/audio_datasets_tutorial.py || true - python $DIR/remove_runnable_code.py beginner_source/dcgan_faces_tutorial.py beginner_source/dcgan_faces_tutorial.py || true - python $DIR/remove_runnable_code.py intermediate_source/tensorboard_profiler_tutorial.py intermediate_source/tensorboard_profiler_tutorial.py || true - # Temp remove for mnist download issue. 
(Re-enabled for 1.8.1) - # python $DIR/remove_runnable_code.py beginner_source/fgsm_tutorial.py beginner_source/fgsm_tutorial.py || true - - export WORKER_ID=$(echo "${CIRCLE_JOB}" | tr -dc '0-9') - count=0 - FILES_TO_RUN=() - for work_directory in beginner_source intermediate_source advanced_source recipes_source prototype_source; do - for filename in $(find $work_directory -name '\*.py' -not -path '\*/data/\*'); do - if [ $(($count % $NUM_WORKERS)) != $WORKER_ID ]; then - echo "Removing runnable code from "$filename - python $DIR/remove_runnable_code.py $filename $filename - else - echo "Keeping "$filename - FILES_TO_RUN+=($(basename $filename .py)) - fi - count=$((count+1)) - done - done - echo "FILES_TO_RUN: " ${FILES_TO_RUN[@]} -fi - -if [[ ! -d advanced_source/data || ! -d beginner_source/data || ! -d intermediate_source/data || ! -d prototype_source/data ]];then - make download -fi - -make html diff --git a/.circleci/scripts/install_conda.bat b/.circleci/scripts/install_conda.bat deleted file mode 100644 index 6052ad08b10..00000000000 --- a/.circleci/scripts/install_conda.bat +++ /dev/null @@ -1 +0,0 @@ -start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda% diff --git a/.circleci/scripts/windows_cuda_install.sh b/.circleci/scripts/windows_cuda_install.sh deleted file mode 100644 index 425f1859306..00000000000 --- a/.circleci/scripts/windows_cuda_install.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -eux -o pipefail - -curl --retry 3 -kLO https://ossci-windows.s3.amazonaws.com/cuda_10.1.243_426.00_win10.exe -7z x cuda_10.1.243_426.00_win10.exe -ocuda_10.1.243_426.00_win10 -cd cuda_10.1.243_426.00_win10 -mkdir cuda_install_logs - -set +e - -./setup.exe -s nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 
npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1 -loglevel:6 -log:"$(pwd -W)/cuda_install_logs" - -set -e - -curl --retry 3 -kLO https://ossci-windows.s3.amazonaws.com/NvToolsExt.7z -7z x NvToolsExt.7z -oNvToolsExt -mkdir -p "C:/Program Files/NVIDIA Corporation/NvToolsExt" -cp -r NvToolsExt/* "C:/Program Files/NVIDIA Corporation/NvToolsExt/" -export NVTOOLSEXT_PATH="C:\\Program Files\\NVIDIA Corporation\\NvToolsExt\\" - -if ! ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1/bin/nvcc.exe" -then - echo "CUDA installation failed" - mkdir -p /c/w/build-results - 7z a "c:\\w\\build-results\\cuda_install_logs.7z" cuda_install_logs - exit 1 -fi - -cd .. -rm -rf ./cuda_10.1.243_426.00_win10 -rm -f ./cuda_10.1.243_426.00_win10.exe diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index 477207b2928..2be1df895be 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -24,7 +24,7 @@ ipython # to run examples pandas scikit-image -pillow==10.0.1 +pillow==10.3.0 wget # for codespaces env diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index c4bd3bde3cf..c6e0885eaa9 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -8,7 +8,7 @@ labels: [ body: - type: textarea attributes: - label: 🚀 Descirbe the improvement or the new tutorial + label: 🚀 Describe the improvement or the new tutorial description: | **Describe the improvement** placeholder: | diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index 5da80f24f5b..a8e512a3204 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -25,11 +25,11 @@ def main(): issue_number = int(re.findall(r'#(\d{1,5})', pull_request_body)[0]) issue = repo.get_issue(issue_number) issue_labels = issue.labels - docathon_label_present = any(label.name == 'docathon-h1-2023' for label in 
issue_labels) + docathon_label_present = any(label.name == 'docathon-h1-2024' for label in issue_labels) # if the issue has a docathon label, add all labels from the issue to the PR. if not docathon_label_present: - print("The 'docathon-h1-2023' label is not present in the issue.") + print("The 'docathon-h1-2024' label is not present in the issue.") return pull_request_labels = pull_request.get_labels() issue_label_names = [label.name for label in issue_labels] @@ -37,7 +37,7 @@ def main(): if not labels_to_add: print("The pull request already has the same labels.") return - pull_request.set_labels(*labels_to_add) + pull_request.add_to_labels(*labels_to_add) print("Labels added to the pull request!") diff --git a/.github/workflows/StalePRs.yml b/.github/workflows/StalePRs.yml new file mode 100644 index 00000000000..4fbfc78550d --- /dev/null +++ b/.github/workflows/StalePRs.yml @@ -0,0 +1,157 @@ +# A workflow copied from the pytorch/pytorch repo stale PRs that implements similar logic to actions/stale. +# +# Compared to actions/stale, it is implemented to make API requests proportional +# to the number of stale PRs, not the total number of issues in the repo. This +# is because PyTorch has a lot of issues/PRs, so the actions/stale runs into +# rate limits way too quickly. +# +# The behavior is: +# - If a PR is not labeled stale, after 60 days inactivity label the PR as stale and comment about it. +# - If a PR is labeled stale, after 30 days inactivity close the PR. +# - `high priority` and `no-stale` PRs are exempt. + +name: Close stale pull requests + +on: + schedule: + # Run at midnight UTC. + - cron: '0 0 * * *' + workflow_dispatch: + +jobs: + stale: + if: ${{ github.repository == 'pytorch/tutorials' }} + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + + steps: + - uses: actions/github-script@v6 + with: + script: | + // Do some dumb retries on requests. 
+ const retries = 7; + const baseBackoff = 100; + const sleep = timeout => new Promise(resolve => setTimeout(resolve, timeout)); + github.hook.wrap('request', async (request, options) => { + for (let attempt = 1; attempt <= retries; attempt++) { + try { + return await request(options); + } catch (err) { + if (attempt < retries) { + core.warning(`Request getting retried. Attempt: ${attempt}`); + await sleep(baseBackoff * Math.pow(2, attempt)); + continue; + } + throw err; + } + } + }); + + const MAX_API_REQUESTS = 100; + + // If a PR is not labeled stale, label it stale after no update for 60 days. + const STALE_LABEL_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 60; + // For PRs already labeled stale, close after no update for 30 days. + const STALE_CLOSE_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 30; + + const STALE_MESSAGE = + "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `stale`.<br>" + + "Feel free to remove the `stale` label if you feel this was a mistake.<br>" + + "If you are unable to remove the `stale` label please contact a maintainer in order to do so.<br>" + + "If you want the bot to never mark this PR stale again, add the `no-stale` label.<br>" + + "`stale` pull requests will automatically be closed after 30 days of inactivity.<br>"; + + let numAPIRequests = 0; + let numProcessed = 0; + + async function processPull(pull) { + core.info(`[${pull.number}] URL: ${pull.html_url}`); + numProcessed += 1; + const labels = pull.labels.map((label) => label.name); + + // Skip if certain labels are present. + if (labels.includes("no-stale") || labels.includes("high priority")) { + core.info(`[${pull.number}] Skipping because PR has an exempting label.`); + return false; + } + + // Check if the PR is stale, according to our configured thresholds. + let staleThresholdMillis; + if (labels.includes("stale")) { + core.info(`[${pull.number}] PR is labeled stale, checking whether we should close it.`); + staleThresholdMillis = STALE_CLOSE_THRESHOLD_MS; + } else { + core.info(`[${pull.number}] Checking whether to label PR as stale.`); + staleThresholdMillis = STALE_LABEL_THRESHOLD_MS; + } + + const millisSinceLastUpdated = + new Date().getTime() - new Date(pull.updated_at).getTime(); + + if (millisSinceLastUpdated < staleThresholdMillis) { + core.info(`[${pull.number}] Skipping because PR was updated recently`); + return false; + } + + // At this point, we know we should do something. + // For PRs already labeled stale, close them. + if (labels.includes("stale")) { + core.info(`[${pull.number}] Closing PR.`); + numAPIRequests += 1; + await github.rest.issues.update({ + owner: "pytorch", + repo: "tutorials", + issue_number: pull.number, + state: "closed", + }); + } else { + // For PRs not labeled stale, label them stale. 
+ core.info(`[${pull.number}] Labeling PR as stale.`); + + numAPIRequests += 1; + await github.rest.issues.createComment({ + owner: "pytorch", + repo: "tutorials", + issue_number: pull.number, + body: STALE_MESSAGE, + }); + + numAPIRequests += 1; + await github.rest.issues.addLabels({ + owner: "pytorch", + repo: "tutorials", + issue_number: pull.number, + labels: ["stale"], + }); + } + } + + for await (const response of github.paginate.iterator( + github.rest.pulls.list, + { + owner: "pytorch", + repo: "tutorials", + state: "open", + sort: "created", + direction: "asc", + per_page: 100, + } + )) { + numAPIRequests += 1; + const pulls = response.data; + // Awaiting in a loop is intentional here. We want to serialize execution so + // that log groups are printed correctly + for (const pull of pulls) { + if (numAPIRequests > MAX_API_REQUESTS) { + core.warning("Max API requests exceeded, exiting."); + process.exit(0); + } + await core.group(`Processing PR #${pull.number}`, async () => { + await processPull(pull); + }); + } + } + core.info(`Processed ${numProcessed} PRs total.`); + diff --git a/.github/workflows/build-tutorials.yml b/.github/workflows/build-tutorials.yml index 809b9ad4bfc..58c515b325f 100644 --- a/.github/workflows/build-tutorials.yml +++ b/.github/workflows/build-tutorials.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: include: - - { shard: 1, num_shards: 15, runner: "linux.16xlarge.nvidia.gpu" } + - { shard: 1, num_shards: 15, runner: "linux.g5.12xlarge.nvidia.gpu" } - { shard: 2, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } - { shard: 3, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } - { shard: 4, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" } @@ -33,9 +33,6 @@ jobs: - { shard: 15, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" } fail-fast: false runs-on: ${{ matrix.runner }} - env: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9" - CUDA_VERSION: "9" 
steps: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main @@ -47,6 +44,8 @@ jobs: - name: Checkout Tutorials uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Setup Linux uses: pytorch/pytorch/.github/actions/setup-linux@main @@ -54,27 +53,21 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: pytorch/test-infra/.github/actions/setup-nvidia@main - - name: Calculate docker image - shell: bash - id: docker-image - run: | - set -ex - - # for some reason, pip installs it in a different place than what is looked at in the py file - pip3 install requests==2.26 - pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) - - echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + - name: Calculate/build docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: tutorials - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: ${{ steps.docker-image.outputs.docker-image }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Build shell: bash env: - DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} NUM_WORKERS: ${{ matrix.num_shards }} WORKER_ID: ${{ matrix.shard }} COMMIT_ID: ${{ github.sha }} @@ -95,17 +88,14 @@ jobs: --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --tty \ --detach \ - --user jenkins \ --shm-size=2gb \ --name="${container_name}" \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ + -v "${GITHUB_WORKSPACE}:/var/lib/workspace" \ + -w /var/lib/workspace \ "${DOCKER_IMAGE}" ) - echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash - - docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + docker exec -u ci-user -t 
"${container_name}" sh -c ".jenkins/build.sh" - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main @@ -116,9 +106,6 @@ jobs: needs: worker runs-on: [self-hosted, linux.2xlarge] environment: ${{ github.ref == 'refs/heads/main' && 'pytorchbot-env' || '' }} - env: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9" - CUDA_VERSION: "9" steps: - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main @@ -130,31 +117,27 @@ jobs: - name: Checkout Tutorials uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Setup Linux uses: pytorch/pytorch/.github/actions/setup-linux@main - - name: Calculate docker image - shell: bash - id: docker-image - run: | - set -ex - - # for some reason, pip installs it in a different place than what is looked at in the py file - pip3 install requests==2.26 - pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) - - echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + - name: Calculate/build docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: tutorials - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: ${{ steps.docker-image.outputs.docker-image }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Build shell: bash env: - DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} NUM_WORKERS: 15 WORKER_ID: ${{ matrix.shard }} COMMIT_ID: ${{ github.sha }} @@ -177,16 +160,13 @@ jobs: --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --tty \ --detach \ - --user jenkins \ --name="${container_name}" \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ + -v 
"${GITHUB_WORKSPACE}:/var/lib/workspace" \ + -w /var/lib/workspace \ "${DOCKER_IMAGE}" ) - echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash - - docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + docker exec -u ci-user -t "${container_name}" sh -c ".jenkins/build.sh" - name: Upload docs preview uses: seemethere/upload-artifact-s3@v5 diff --git a/.github/workflows/docathon-assign.yml b/.github/workflows/docathon-assign.yml index 1810872303b..31fa28289b0 100644 --- a/.github/workflows/docathon-assign.yml +++ b/.github/workflows/docathon-assign.yml @@ -8,16 +8,11 @@ on: jobs: assign: runs-on: ubuntu-latest + permissions: + issues: write steps: - - name: Install Dependencies - uses: actions/setup-node@v3 - with: - node-version: '18' - - name: Install @octokit/core - run: | - npm i @octokit/core @octokit/rest - name: Check for "/assigntome" in comment - uses: actions/github-script@v4 + uses: actions/github-script@v6 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: @@ -27,26 +22,23 @@ jobs: if (assignRegex.test(issueComment)) { const assignee = context.payload.comment.user.login; const issueNumber = context.payload.issue.number; - const { Octokit } = require("@octokit/rest"); - const octokit = new Octokit({ - auth: process.env.GITHUB_TOKEN, - }); - const { data: issue } = await octokit.issues.get({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: issueNumber - }); - const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2023'); + try { + const { data: issue } = await github.rest.issues.get({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber + }); + const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2024'); if (hasLabel) { if (issue.assignee !== null) { - await octokit.issues.createComment({ + await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, - body: "The issue is 
already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2023 label](https://github.com/pytorch/tutorials/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2023)." + body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." }); } else { - octokit.issues.addAssignees({ + await github.rest.issues.addAssignees({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, @@ -54,12 +46,15 @@ jobs: }); } } else { - const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2023 label](https://github.com/pytorch/tutorials/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2023)." - await octokit.issues.createComment({ + const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." 
+ await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, body: commmentMessage }); + } + } catch (error) { + console.error(error); } } diff --git a/.github/workflows/docathon-label-sync.yml b/.github/workflows/docathon-label-sync.yml index 2d855877417..1b33bebaac2 100644 --- a/.github/workflows/docathon-label-sync.yml +++ b/.github/workflows/docathon-label-sync.yml @@ -7,12 +7,14 @@ on: jobs: check-labels: runs-on: ubuntu-latest - + permissions: + issues: write + pull-requests: write steps: - name: Check if PR mentions an issue and get labels uses: actions/checkout@v2 with: - fetch-depth: 0 + fetch-depth: 1 - name: Set up Python uses: actions/setup-python@v2 with: diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml new file mode 100644 index 00000000000..6d75d1fc929 --- /dev/null +++ b/.github/workflows/docker-build.yml @@ -0,0 +1,59 @@ +name: Docker Build + +on: + workflow_dispatch: + pull_request: + paths: + - .ci/docker/** + - .github/workflows/docker-build.yml + push: + branches: + - main + paths: + - .ci/docker/** + - .github/workflows/docker-build.yml + +jobs: + docker-build: + runs-on: [self-hosted, linux.2xlarge] + timeout-minutes: 240 + strategy: + fail-fast: false + matrix: + include: + - docker-image-name: tutorials + env: + DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/tutorials/${{ matrix.docker-image-name }} + steps: + - name: Clean workspace + shell: bash + run: | + echo "${GITHUB_WORKSPACE}" + sudo rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Linux + uses: pytorch/test-infra/.github/actions/setup-linux@main + + - name: Build docker image + id: build-docker-image + uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ matrix.docker-image-name }} + push: true + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true diff --git a/.github/workflows/link_checkPR.yml b/.github/workflows/link_checkPR.yml new file mode 100644 index 00000000000..1fde2371590 --- /dev/null +++ b/.github/workflows/link_checkPR.yml @@ -0,0 +1,52 @@ +#Checks links in a PR to ensure they are valid. If link is valid but failing, it can be added to the .lycheeignore file +#Use the skip-link-check label on a PR to skip checking links on a PR + +name: link check on PR + +on: + pull_request: + branches: [main] + +jobs: + linkChecker: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Get Changed Files + id: changed-files + uses: tj-actions/changed-files@v41 + + - name: Check for Skip Label + id: skip-label + uses: actions/github-script@v6 + with: + script: | + const labels = await github.rest.issues.listLabelsOnIssue({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + return labels.data.some(label => label.name === 'skip-link-check'); + + - name: Check Links + if: steps.skip-label.outputs.result == 'false' + uses: lycheeverse/lychee-action@v1 + with: + args: --accept=200,403,429 --base . --verbose --no-progress ${{ steps.changed-files.outputs.all_changed_files }} + token: ${{ secrets.CUSTOM_TOKEN }} + fail: true + + - name: Skip Message + if: steps.skip-label.outputs.result == 'true' + run: echo "Link check was skipped due to the presence of the 'skip-link-check' label." + + - name: Suggestions + if: failure() + run: | + echo -e "\nPlease review the links reported in the Check links step above." 
+ echo -e "If a link is valid but fails due to a CAPTCHA challenge, IP blocking, login requirements, etc., consider adding such links to .lycheeignore file to bypass future checks.\n" + exit 1 diff --git a/.gitignore b/.gitignore index ef7a026d9e8..1d9d572e565 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ beginner intermediate advanced pytorch_basics -recipes +/recipes prototype #data things diff --git a/.gitmodules b/.gitmodules index 3a3c564c8fa..e69de29bb2d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "src/pytorch-sphinx-theme"] - path = src/pytorch-sphinx-theme - url = https://github.com/pytorch/pytorch_sphinx_theme diff --git a/.jenkins/build.sh b/.jenkins/build.sh index f8f3f35690d..8eca78ae346 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -11,23 +11,21 @@ export LANG=C.UTF-8 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" # Update root certificates by installing new libgnutls30 -sudo apt-get update || sudo apt-get install libgnutls30 + +# Install pandoc (does not install from pypi) sudo apt-get update -sudo apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync +sudo apt-get install -y pandoc # NS: Path to python runtime should already be part of docker container # export PATH=/opt/conda/bin:$PATH -rm -rf src -# NS: ghstack is not needed to build tutorials and right now it forces importlib to be downgraded to 3.X -pip uninstall -y ghstack -pip install --progress-bar off -r $DIR/../requirements.txt #Install PyTorch Nightly for test. 
# Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html -# Install 2.1 for testing -pip uninstall -y torch torchvision torchaudio torchtext torchdata -pip3 install torch torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu121 -pip3 install torchdata torchtext --index-url https://download.pytorch.org/whl/test/cpu +# Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed). +sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata +sudo pip3 install torch==2.6.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124 +sudo pip uninstall -y fbgemm-gpu torchrec +sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124 # Install two language tokenizers for Translation with TorchText tutorial python -m spacy download en_core_web_sm @@ -40,7 +38,6 @@ awsv2 configure set default.s3.multipart_threshold 5120MB if [[ "${JOB_TYPE}" == "worker" ]]; then # Step 1: Remove runnable code from tutorials that are not supposed to be run python $DIR/remove_runnable_code.py beginner_source/aws_distributed_training_tutorial.py beginner_source/aws_distributed_training_tutorial.py || true - # python $DIR/remove_runnable_code.py advanced_source/ddp_pipeline_tutorial.py advanced_source/ddp_pipeline_tutorial.py || true # Temp remove for mnist download issue. (Re-enabled for 1.8.1) # python $DIR/remove_runnable_code.py beginner_source/fgsm_tutorial.py beginner_source/fgsm_tutorial.py || true # python $DIR/remove_runnable_code.py intermediate_source/spatial_transformer_tutorial.py intermediate_source/spatial_transformer_tutorial.py || true @@ -57,10 +54,16 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then # IMPORTANT NOTE: We assume that each tutorial has a UNIQUE filename. 
FILES_TO_RUN=$(python .jenkins/get_files_to_run.py) echo "FILES_TO_RUN: " ${FILES_TO_RUN} + # Files to run must be accessible to subprocesses (at least to `download_data.py`) + export FILES_TO_RUN - # Step 3: Run `make docs` to generate HTML files and static files for these tutorials + # Step 3: Run `make docs` to generate HTML files and static files for these tutorials + pip3 install -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme make docs + # Step 3.1: Run the post-processing script: + python .jenkins/post_process_notebooks.py + # Step 4: If any of the generated files are not related the tutorial files we want to run, # then we remove them set +x @@ -117,6 +120,7 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 1: Generate no-plot HTML pages for all tutorials + pip3 install -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme make html-noplot cp -r _build/html docs @@ -138,6 +142,9 @@ elif [[ "${JOB_TYPE}" == "manager" ]]; then bash $DIR/remove_invisible_code_block_batch.sh docs python .jenkins/validate_tutorials_built.py + # Step 5.1: Run post-processing script on .ipynb files: + python .jenkins/post_process_notebooks.py + # Step 6: Copy generated HTML files and static files to S3 7z a manager.7z docs awsv2 s3 cp manager.7z s3://${BUCKET_NAME}/${COMMIT_ID}/manager.7z diff --git a/.jenkins/custom_pandoc_filter.py b/.jenkins/custom_pandoc_filter.py new file mode 100644 index 00000000000..f4ceb0df11e --- /dev/null +++ b/.jenkins/custom_pandoc_filter.py @@ -0,0 +1,139 @@ +from pandocfilters import toJSONFilter, Div, RawBlock, Para, Str, Space, Link, Code, CodeBlock +import markdown +import html + +def to_markdown(item, skip_octicon=False): + # A handler function to process strings, links, code, and code + # blocks + if item['t'] == 'Str': + return item['c'] + elif
item['t'] == 'Space': + return ' ' + elif item['t'] == 'Link': + link_text = ''.join(to_markdown(i, skip_octicon) for i in item['c'][1]) + return f'{link_text}' + elif item['t'] == 'Code': + # Need to remove octicons as they don't render in .ipynb + if any(value == 'octicon' for key, value in item['c'][0][2]): + return '' + else: + # Escape the code and wrap it in tags + return f'{html.escape(item["c"][1])}' + elif item['t'] == 'CodeBlock': + # Escape the code block and wrap it in
 tags
+        return f'
{html.escape(item["c"][1])}
' + else: + return '' + + +def process_admonitions(key, value, format, meta): + # Replace admonitions with proper HTML. + if key == 'Div': + [[ident, classes, keyvals], contents] = value + if 'note' in classes: + color = '#54c7ec' + label = 'NOTE:' + elif 'tip' in classes: + color = '#6bcebb' + label = 'TIP:' + elif 'warning' in classes: + color = '#e94f3b' + label = 'WARNING:' + else: + return + + note_content = [] + for block in contents: + if block.get('t') == 'Para': + for item in block['c']: + if item['t'] == 'Str': + note_content.append(Str(item['c'])) + elif item['t'] == 'Space': + note_content.append(Space()) + elif item['t'] == 'Link': + note_content.append(Link(*item['c'])) + elif item['t'] == 'Code': + note_content.append(Code(*item['c'])) + elif block.get('t') == 'CodeBlock': + note_content.append(CodeBlock(*block['c'])) + + note_content_md = ''.join(to_markdown(item) for item in note_content) + html_content = markdown.markdown(note_content_md) + + return [{'t': 'RawBlock', 'c': ['html', f'
{label}
']}, {'t': 'RawBlock', 'c': ['html', '
']}, {'t': 'RawBlock', 'c': ['html', html_content]}, {'t': 'RawBlock', 'c': ['html', '
']}] + elif key == 'RawBlock': + # this is needed for the cells that have embedded video. + # We add a special tag to those: ``` {python, .jupyter-code-cell} + # The post-processing script then finds those and generates separate + # code cells that can load video. + [format, content] = value + if format == 'html' and 'iframe' in content: + # Extract the video URL + video_url = content.split('src="')[1].split('"')[0] + # Create the Python code to display the video + python_code = f""" +from IPython.display import display, HTML +html_code = \""" +{content} +\""" +display(HTML(html_code)) +""" + + return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]} + + +def process_images(key, value, format, meta): + # Add https://pytorch.org/tutorials/ to images so that they + # load correctly in the notebook. + if key != 'Image': + return None + [ident, classes, keyvals], caption, [src, title] = value + if not src.startswith('http'): + while src.startswith('../'): + src = src[3:] + if src.startswith('/_static'): + src = src[1:] + src = 'https://pytorch.org/tutorials/' + src + + return {'t': 'Image', 'c': [[ident, classes, keyvals], caption, [src, title]]} + + +def process_grids(key, value, format, meta): + # Generate side by side grid cards. Only for the two-cards layout + # that we use in the tutorial template. + if key == 'Div': + [[ident, classes, keyvals], contents] = value + if 'grid' in classes: + columns = ['
', + '
'] + column_num = 0 + for block in contents: + if 't' in block and block['t'] == 'Div' and 'grid-item-card' in block['c'][0][1]: + item_html = '' + for item in block['c'][1]: + if item['t'] == 'Para': + item_html += '

' + ''.join(to_markdown(i) for i in item['c']) + '

' + elif item['t'] == 'BulletList': + item_html += '
    ' + for list_item in item['c']: + item_html += '
  • ' + ''.join(to_markdown(i) for i in list_item[0]['c']) + '
  • ' + item_html += '
' + columns[column_num] += item_html + column_num = (column_num + 1) % 2 + columns = [column + '
' for column in columns] + return {'t': 'RawBlock', 'c': ['html', ''.join(columns)]} + +def is_code_block(item): + return item['t'] == 'Code' and 'octicon' in item['c'][1] + + +def process_all(key, value, format, meta): + for transform in [process_admonitions, process_images, process_grids]: + new_value = transform(key, value, format, meta) + if new_value is not None: + break + return new_value + + +if __name__ == "__main__": + toJSONFilter(process_all) diff --git a/.jenkins/download_data.py b/.jenkins/download_data.py index d5440dc404f..cc07c72561b 100644 --- a/.jenkins/download_data.py +++ b/.jenkins/download_data.py @@ -105,6 +105,13 @@ def download_lenet_mnist() -> None: sha256="cb5f8e578aef96d5c1a2cc5695e1aa9bbf4d0fe00d25760eeebaaac6ebc2edcb", ) +def download_gpu_quantization_torchao() -> None: + # Download SAM model checkpoint for prototype_source/gpu_quantization_torchao_tutorial.py + download_url_to_file("https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth", + prefix=PROTOTYPE_DATA_DIR, + dst="sam_vit_h_4b8939.pth", + sha256="a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e", + ) def main() -> None: DATA_DIR.mkdir(exist_ok=True) @@ -122,7 +129,8 @@ def main() -> None: download_dcgan_data() if FILES_TO_RUN is None or "fgsm_tutorial" in FILES_TO_RUN: download_lenet_mnist() - + if FILES_TO_RUN is None or "gpu_quantization_torchao_tutorial" in FILES_TO_RUN: + download_gpu_quantization_torchao() if __name__ == "__main__": main() diff --git a/.jenkins/insert_last_verified.py b/.jenkins/insert_last_verified.py new file mode 100644 index 00000000000..b43ef8de8e8 --- /dev/null +++ b/.jenkins/insert_last_verified.py @@ -0,0 +1,160 @@ +import json +import os +import subprocess +import sys +from datetime import datetime + +from bs4 import BeautifulSoup + + +json_file_path = "tutorials-review-data.json" + +# paths to skip from the post-processing script +paths_to_skip = [ + "beginner/examples_autograd/two_layer_net_custom_function", # 
not present in the repo + "beginner/examples_nn/two_layer_net_module", # not present in the repo + "beginner/examples_tensor/two_layer_net_numpy", # not present in the repo + "beginner/examples_tensor/two_layer_net_tensor", # not present in the repo + "beginner/examples_autograd/two_layer_net_autograd", # not present in the repo + "beginner/examples_nn/two_layer_net_optim", # not present in the repo + "beginner/examples_nn/two_layer_net_nn", # not present in the repo + "intermediate/coding_ddpg", # not present in the repo - will delete the carryover +] +# Mapping of source directories to build directories +source_to_build_mapping = { + "beginner": "beginner_source", + "recipes": "recipes_source", + "distributed": "distributed", + "intermediate": "intermediate_source", + "prototype": "prototype_source", + "advanced": "advanced_source", + "": "", # root dir for index.rst +} + +def get_git_log_date(file_path, git_log_args): + try: + result = subprocess.run( + ["git", "log"] + git_log_args + ["--", file_path], + capture_output=True, + text=True, + check=True, + ) + if result.stdout: + date_str = result.stdout.splitlines()[0] + return datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z") + except subprocess.CalledProcessError: + pass + raise ValueError(f"Could not find date for {file_path}") + +def get_creation_date(file_path): + return get_git_log_date(file_path, ["--diff-filter=A", "--format=%aD"]).strftime("%b %d, %Y") + + +def get_last_updated_date(file_path): + return get_git_log_date(file_path, ["-1", "--format=%aD"]).strftime("%b %d, %Y") + +# Try to find the source file with the given base path and the extensions .rst and .py +def find_source_file(base_path): + for ext in [".rst", ".py"]: + source_file_path = base_path + ext + if os.path.exists(source_file_path): + return source_file_path + return None + + +# Function to process a JSON file and insert the "Last Verified" information into the HTML files +def process_json_file(build_dir , json_file_path): + with 
open(json_file_path, "r", encoding="utf-8") as json_file: + json_data = json.load(json_file) + + for entry in json_data: + path = entry["Path"] + last_verified = entry["Last Verified"] + status = entry.get("Status", "") + if path in paths_to_skip: + print(f"Skipping path: {path}") + continue + if status in ["needs update", "not verified"]: + formatted_last_verified = "Not Verified" + elif last_verified: + try: + last_verified_date = datetime.strptime(last_verified, "%Y-%m-%d") + formatted_last_verified = last_verified_date.strftime("%b %d, %Y") + except ValueError: + formatted_last_verified = "Unknown" + else: + formatted_last_verified = "Not Verified" + if status == "deprecated": + formatted_last_verified += "Deprecated" + + for build_subdir, source_subdir in source_to_build_mapping.items(): + if path.startswith(build_subdir): + html_file_path = os.path.join(build_dir, path + ".html") + base_source_path = os.path.join( + source_subdir, path[len(build_subdir) + 1 :] + ) + source_file_path = find_source_file(base_source_path) + break + else: + print(f"Warning: No mapping found for path {path}") + continue + + if not os.path.exists(html_file_path): + print( + f"Warning: HTML file not found for path {html_file_path}." + "If this is a new tutorial, please add it to the audit JSON file and set the Verified status and todays's date." + ) + continue + + if not source_file_path: + print(f"Warning: Source file not found for path {base_source_path}.") + continue + + created_on = get_creation_date(source_file_path) + last_updated = get_last_updated_date(source_file_path) + + with open(html_file_path, "r", encoding="utf-8") as file: + soup = BeautifulSoup(file, "html.parser") + # Check if the

tag with class "date-info-last-verified" already exists + existing_date_info = soup.find("p", {"class": "date-info-last-verified"}) + if existing_date_info: + print( + f"Warning:

tag with class 'date-info-last-verified' already exists in {html_file_path}" + ) + continue + + h1_tag = soup.find("h1") # Find the h1 tag to insert the dates + if h1_tag: + date_info_tag = soup.new_tag("p", **{"class": "date-info-last-verified"}) + date_info_tag["style"] = "color: #6c6c6d; font-size: small;" + # Add the "Created On", "Last Updated", and "Last Verified" information + date_info_tag.string = ( + f"Created On: {created_on} | " + f"Last Updated: {last_updated} | " + f"Last Verified: {formatted_last_verified}" + ) + # Insert the new tag after the

tag + h1_tag.insert_after(date_info_tag) + # Save back to the HTML. + with open(html_file_path, "w", encoding="utf-8") as file: + file.write(str(soup)) + else: + print(f"Warning:

tag not found in {html_file_path}") + + +def main(): + if len(sys.argv) < 2: + print("Error: Build directory not provided. Exiting.") + exit(1) + build_dir = sys.argv[1] + print(f"Build directory: {build_dir}") + process_json_file(build_dir , json_file_path) + print( + "Finished processing JSON file. Please check the output for any warnings. " + "Pages like `nlp/index.html` are generated only during the full `make docs` " + "or `make html` build. Warnings about these files when you run `make html-noplot` " + "can be ignored." + ) + +if __name__ == "__main__": + main() diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index a97a0219ebf..6e82d054b4e 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,10 +28,49 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, + "intermediate_source/torchrec_intro_tutorial.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "recipes_source/torch_export_aoti_python.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "advanced_source/pendulum.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu", + "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." + }, + "intermediate_source/torchvision_tutorial.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu", + "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." + }, + "advanced_source/coding_ddpg.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu", + "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." 
+ }, + "recipes_source/compiling_optimizer_lr_scheduler.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "intermediate_source/torch_compile_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, + "intermediate_source/torch_export_tutorial.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "intermediate_source/scaled_dot_product_attention_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "intermediate_source/transformer_building_blocks.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "recipes_source/regional_compilation.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "advanced_source/semi_structured_sparse.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, + "prototype_source/gpu_quantization_torchao_tutorial.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" } } diff --git a/.jenkins/post_process_notebooks.py b/.jenkins/post_process_notebooks.py new file mode 100644 index 00000000000..81f51766c3e --- /dev/null +++ b/.jenkins/post_process_notebooks.py @@ -0,0 +1,97 @@ +import nbformat as nbf +import os +import re + +""" +This post-processing script needs to run after the .ipynb files are +generated. The script removes extraneous ```{=html} syntax from the +admonitions and splits the cells that have video iframe into a +separate code cell that can be run to load the video directly +in the notebook. This script is included in build.sh. 
+""" + + +# Pattern to search ``` {.python .jupyter-code-cell} +pattern = re.compile(r'(.*?)``` {.python .jupyter-code-cell}\n\n(from IPython.display import display, HTML\nhtml_code = """\n.*?\n"""\ndisplay\(HTML\(html_code\)\))\n```(.*)', re.DOTALL) + + +def process_video_cell(notebook_path): + """ + This function finds the code blocks with the + "``` {.python .jupyter-code-cell}" code blocks and slices them + into a separate code cell (instead of markdown) which allows to + load the video in the notebook. The rest of the content is placed + in a new markdown cell. + """ + print(f'Processing file: {notebook_path}') + notebook = nbf.read(notebook_path, as_version=4) + + # Iterate over markdown cells + for i, cell in enumerate(notebook.cells): + if cell.cell_type == 'markdown': + match = pattern.search(cell.source) + if match: + print(f'Match found in cell {i}: {match.group(0)[:100]}...') + # Extract the parts before and after the video code block + before_html_block = match.group(1) + code_block = match.group(2) + + # Add a comment to run the cell to display the video + code_block = "# Run this cell to load the video\n" + code_block + # Create a new code cell + new_code_cell = nbf.v4.new_code_cell(source=code_block) + + # Replace the original markdown cell with the part before the code block + cell.source = before_html_block + + # Insert the new code cell after the current one + notebook.cells.insert(i+1, new_code_cell) + print(f'New code cell created with source: {new_code_cell.source}') + + # If there is content after the HTML code block, create a new markdown cell + if len(match.group(3).strip()) > 0: + after_html_block = match.group(3) + new_markdown_cell = nbf.v4.new_markdown_cell(source=after_html_block) + # Create a new markdown cell and add the content after code block there + notebook.cells.insert(i+2, new_markdown_cell) + + else: + # Remove ```{=html} from the code block + cell.source = remove_html_tag(cell.source) + + nbf.write(notebook, notebook_path) + + 
+def remove_html_tag(content): + """ + Pandoc adds an extraneous ```{=html} ``` to raw HTML blocks which + prevents it from rendering correctly. This function removes + ```{=html} that we don't need. + """ + content = re.sub(r'```{=html}\n\n```', '">', content) + content = re.sub(r'<\/div>\n```', '

\n', content) + content = re.sub(r'```{=html}\n\n```', '\n', content) + content = re.sub(r'```{=html}', '', content) + content = re.sub(r'

\n```', '

', content) + return content + + +def walk_dir(downloads_dir): + """ + Walk the dir and process all notebook files in + the _downloads directory and its subdirectories. + """ + for root, dirs, files in os.walk(downloads_dir): + for filename in files: + if filename.endswith('.ipynb'): + process_video_cell(os.path.join(root, filename)) + + +def main(): + downloads_dir = './docs/_downloads' + walk_dir(downloads_dir) + + +if __name__ == "__main__": + main() diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index 596ab1700c9..665b3b48e3b 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -10,8 +10,8 @@ NOT_RUN = [ "beginner_source/basics/intro", # no code + "beginner_source/introyt/introyt_index", # no code "beginner_source/onnx/intro_onnx", - "beginner_source/translation_transformer", "beginner_source/profiler", "beginner_source/saving_loading_models", "beginner_source/introyt/captumyt", @@ -22,19 +22,18 @@ "beginner_source/former_torchies/tensor_tutorial_old", "beginner_source/examples_autograd/polynomial_autograd", "beginner_source/examples_autograd/polynomial_custom_function", - "beginner_source/t5_tutorial", # re-enable after this is fixed: https://github.com/pytorch/text/issues/1756 - "intermediate_source/parametrizations", "intermediate_source/mnist_train_nas", # used by ax_multiobjective_nas_tutorial.py "intermediate_source/fx_conv_bn_fuser", + "intermediate_source/_torch_export_nightly_tutorial", # does not work on release + "intermediate_source/transformer_building_blocks", # does not work on release "advanced_source/super_resolution_with_onnxruntime", - "advanced_source/ddp_pipeline", # requires 4 gpus + "advanced_source/usb_semisup_learn", # fails with CUDA OOM error, should try on a different worker "prototype_source/fx_graph_mode_ptq_dynamic", "prototype_source/vmap_recipe", "prototype_source/torchscript_freezing", "prototype_source/nestedtensor", 
"recipes_source/recipes/saving_and_loading_models_for_inference", "recipes_source/recipes/saving_multiple_models_in_one_file", - "recipes_source/recipes/loading_data_recipe", "recipes_source/recipes/tensorboard_with_pytorch", "recipes_source/recipes/what_is_state_dict", "recipes_source/recipes/profiler_recipe", @@ -51,7 +50,11 @@ "recipes_source/recipes/Captum_Recipe", "intermediate_source/flask_rest_api_tutorial", "intermediate_source/text_to_speech_with_torchaudio", - "intermediate_source/tensorboard_profiler_tutorial" # reenable after 2.0 release. + "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release. + "intermediate_source/torch_export_tutorial", # reenable after 2940 is fixed. + "advanced_source/pendulum", + "beginner_source/onnx/export_simple_model_to_onnx_tutorial", + "beginner_source/onnx/onnx_registry_tutorial" ] def tutorial_source_dirs() -> List[Path]: diff --git a/.lycheeignore b/.lycheeignore new file mode 100644 index 00000000000..3d86ae872de --- /dev/null +++ b/.lycheeignore @@ -0,0 +1,20 @@ +# Used for links to be ignored during the link check. 
+# Add link to file along with comment as to why it should be ignored + +#Example link in some of the tutorials that should be ignored +file:///f:/libtmp/some_file + +#Ignore links with "file:///" to catch any other example links +file:\/\/\/.* + +# Ignore colab link in the setting of conf.py +https://pytorch.org/tutorials/beginner/colab/n + +# Ignore local host link from intermediate_source/tensorboard_tutorial.rst +http://localhost:6006 + +# Ignore local host link from recipes_source/deployment_with_flask.rst +http://localhost:5000/predict + +# Ignore external link from advanced_source/cpp_frontend.rst +https://www.uber.com/blog/deep-neuroevolution/ diff --git a/.pyspelling.yml b/.pyspelling.yml index d09b401bdc0..1afe6dbb45e 100644 --- a/.pyspelling.yml +++ b/.pyspelling.yml @@ -19,7 +19,7 @@ matrix: - open: '\.\.\s+(figure|literalinclude|math|image|grid)::' close: '\n' # Exclude roles: - - open: ':(?:(class|py:mod|mod|func)):`' + - open: ':(?:(class|py:mod|mod|func|meth|obj)):`' content: '[^`]*' close: '`' # Exclude reStructuredText hyperlinks @@ -45,6 +45,9 @@ matrix: - open: '\.\. (code-block|math)::.*$\n*' content: '(?P<first>(^(?P<indent>[ ]+).*$\n))(?P<other>(^([ \t]+.*|[ \t]*)$\n)*)' close: '(^(?![ \t]+.*$))' + # Ignore references like "[1] Author: Title" + - open: '\[\d\]' + close: '\n' - pyspelling.filters.markdown: - pyspelling.filters.html: ignores: @@ -70,7 +73,7 @@ matrix: - open: ':figure:.*' close: '\n' # Ignore reStructuredText roles - - open: ':(?:(class|file|func|math|ref|octicon)):`' + - open: ':(?:(class|file|func|math|ref|octicon|meth|obj)):`' content: '[^`]*' close: '`' - open: ':width:' @@ -97,7 +100,7 @@ matrix: content: '''''*' close: '$' # Ignore reStructuredText block directives - - open: '\.\. (code-block|math)::.*$\n*' + - open: '\.\. (code-block|math|table)::.*$\n*' content: '(?P<first>(^(?P<indent>[ ]+).*$\n))(?P<other>(^([ \t]+.*|[ \t]*)$\n)*)' close: '(^(?![ \t]+.*$))' - open: '\.\. 
(raw)::.*$\n*' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f6adf15d02b..c4038d168c3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -65,7 +65,7 @@ There are three types of tutorial content that we host on code in these tutorials is run every time they are built. To keep these tutorials up and running all their package dependencies need to be resolved--which makes it more challenging to maintain this type of - tutorial. + tutorial. * **Non-interactive tutorials** are authored and submitted as reStructuredText files. The build system only converts them into HTML; @@ -80,18 +80,16 @@ There are three types of tutorial content that we host on non-interactive. -# Managing data that is used by your tutorial +# Managing data that is used by your tutorial Your tutorial might depend on external data, such as pre-trained models, training data, or test data. We recommend storing this data in a commonly-used storage service, such as Amazon S3, and instructing your -users to download the data at the beginning of your tutorial. - -The -[Makefile](https://github.com/pytorch/tutorials/blob/main/Makefile) -that we use to build the tutorials contains automation that downloads -required data files. +users to download the data at the beginning of your tutorial. +To download your data, add a function to the [download_data.py](https://github.com/pytorch/tutorials/blob/main/.jenkins/download_data.py) +script. Follow the same pattern as other download functions. +Please do not add download logic to `Makefile` as it will incur download overhead for all CI shards. # Python packages used by your tutorial @@ -104,7 +102,7 @@ tutorial fails to build in our Continuous Integration (CI) system, we might contact you in order to resolve the issue. -# Deprecation of tutorials +# Deprecation of tutorials Under some circumstances, we might deprecate--and subsequently archive--a tutorial removing it from the site. 
For example, if the @@ -137,7 +135,7 @@ end-to-end understanding of how to use PyTorch. Recipes are scoped examples of how to use specific features; the goal of a recipe is to teach readers how to easily leverage features of PyTorch for their needs. Tutorials and recipes are always _actionable_. If the material is -purely informative, consider adding it to the API docs instead. +purely informative, consider adding it to the API docs instead. View our current [full-length tutorials](https://pytorch.org/tutorials/). @@ -165,11 +163,11 @@ Write for a global audience with an instructive and directive voice. - PyTorch has a global audience; use clear, easy to understand language. Avoid idioms or other figures of speech. - To keep your instructions concise, use - [active voice](https://writing.wisc.edu/handbook/style/ccs_activevoice/) as much as possible. -- For a short guide on the essentials of writing style, + [active voice](https://writing.wisc.edu/handbook/style/ccs_activevoice/) as much as possible. +- For a short guide on the essentials of writing style, [The Elements of Style](https://www.gutenberg.org/files/37134/37134-h/37134-h.htm) is invaluable. -- For extensive guidance on technical-writing style, the Google developer documentation +- For extensive guidance on technical-writing style, the Google developer documentation [google style](https://developers.google.com/style) is a great resource. - Think of the process as similar to creating a (really practical) @@ -195,7 +193,7 @@ We recommend that tutorials use the following structure which guides users throu 1. Step-by-step instructions. Ideally, the steps in the tutorial should map back to the learning objectives. Consider adding comments in the code that correspond to these steps and that help to clarify what - each section of the code is doing. + each section of the code is doing. 1. Link to relevant [PyTorch documentation](https://pytorch.org/docs/stable/index.html). 
This helps readers have context for the tutorial source code and better @@ -220,9 +218,8 @@ described in the preceding sections: - [NLP From Scratch: Generating Names with a Character-Level RNN Tutorial](https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html) -If you are creating a recipe, we recommend that you use [this -template](https://github.com/pytorch/tutorials/blob/tutorials_refresh/recipes_source/recipes/example_recipe.py) -as a guide. +If you are creating a recipe, [this is a good +example.](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes/what_is_state_dict.py) # Submission Process # @@ -230,7 +227,7 @@ as a guide. Submit your tutorial as either a Python (`.py`) file or a reStructuredText (`.rst`) file. For Python files, the filename for your tutorial should end in "`_tutorial.py`"; for example, -"`cool_pytorch_feature_tutorial.py`". +"`cool_pytorch_feature_tutorial.py`". Do not submit a Jupyter notebook. If you develop your tutorial in Jupyter, you'll need to convert it to Python. This @@ -276,8 +273,8 @@ search, you need to include it in `index.rst`, or for recipes, in :header: Learn the Basics # Tutorial title :card_description: A step-by-step guide to building a complete ML workflow with PyTorch. # Short description :image: _static/img/thumbnails/cropped/60-min-blitz.png # Image that appears with the card - :link: beginner/basics/intro.html - :tags: Getting-Started + :link: beginner/basics/intro.html + :tags: Getting-Started ``` @@ -328,9 +325,9 @@ example](https://github.com/pytorch/tutorials/blob/main/_static/img/thumbnails/c The following command builds an HTML version of the tutorial website. - ``` - make html-noplot - ``` +``` +make html-noplot +``` This command does not run your tutorial code. To build the tutorial in a way that executes the code, use `make docs`. However, unless you have a @@ -340,7 +337,7 @@ test your tutorial when you submit your PR. 
## Submit the PR ## - + NOTE: Please do not use [ghstack](https://github.com/ezyang/ghstack). We do not support ghstack in the [`pytorch/tutorials`](https://github.com/pytorch/tutorials) repo. @@ -368,5 +365,5 @@ build. You can see an example Netlify preview at the following URL: ## Do not merge the PR yourself ## -Please **DO NOT MERGE** your own PR; the tutorial won't be published. In order to avoid potential build breaks with the tutorials site, only certain maintainers can authorize publishing. +Please **DO NOT MERGE** your own PR; the tutorial won't be published. In order to avoid potential build breaks with the tutorials site, only certain maintainers can authorize publishing. diff --git a/Makefile b/Makefile index 5e994b01141..9068d32b2ab 100644 --- a/Makefile +++ b/Makefile @@ -82,24 +82,36 @@ download: wget -nv -N http://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz -P $(DATADIR) tar $(TAROPTS) -xzf $(DATADIR)/vocab.tar.gz -C ./beginner_source/data/ - # Download dataset for beginner_source/torchtext_custom_dataset_tutorial.py - wget -nv -N https://www.manythings.org/anki/deu-eng.zip -P $(DATADIR) - unzip -o $(DATADIR)/deu-eng.zip -d beginner_source/data/ - - + # Download PennFudanPed dataset for intermediate_source/torchvision_tutorial.py + wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P $(DATADIR) + unzip -o $(DATADIR)/PennFudanPed.zip -d intermediate_source/data/ + +download-last-reviewed-json: + @echo "Downloading tutorials-review-data.json..." + curl -o tutorials-review-data.json https://raw.githubusercontent.com/pytorch/tutorials/refs/heads/last-reviewed-data-json/tutorials-review-data.json + @echo "Finished downloading tutorials-review-data.json." 
docs: make download + make download-last-reviewed-json make html + @python .jenkins/insert_last_verified.py $(BUILDDIR)/html rm -rf docs cp -r $(BUILDDIR)/html docs touch docs/.nojekyll + rm -rf tutorials-review-data.json html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(SPHINXOPTS) "$(SOURCEDIR)" "$(BUILDDIR)/html" # bash .jenkins/remove_invisible_code_block_batch.sh "$(BUILDDIR)/html" @echo + make download-last-reviewed-json @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + @echo "Running post-processing script to insert 'Last Verified' dates..." + @python .jenkins/insert_last_verified.py $(BUILDDIR)/html + rm -rf tutorials-review-data.json clean-cache: make clean rm -rf advanced beginner intermediate recipes + # remove additional python files downloaded for torchvision_tutorial.py + rm -rf intermediate_source/engine.py intermediate_source/utils.py intermediate_source/transforms.py intermediate_source/coco_eval.py intermediate_source/coco_utils.py diff --git a/README.md b/README.md index d9fda75a019..af84d9ebe79 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,16 @@ All the tutorials are now presented as sphinx style documentation at: ## [https://pytorch.org/tutorials](https://pytorch.org/tutorials) +# Asking a question +If you have a question about a tutorial, post in https://dev-discuss.pytorch.org/ rather than creating an issue in this repo. Your question will be answered much faster on the dev-discuss forum. + +# Submitting an issue + +You can submit the following types of issues: + +* Feature request - request a new tutorial to be added. Please explain why this tutorial is needed and how it demonstrates PyTorch value. +* Bug report - report a failure or outdated information in an existing tutorial. When submitting a bug report, please run: `python3 -m torch.utils.collect_env` to get information about your environment and add the output to the bug report. 
# Contributing @@ -13,26 +22,26 @@ We use sphinx-gallery's [notebook styled examples](https://sphinx-gallery.github Here is how you can create a new tutorial (for a detailed description, see [CONTRIBUTING.md](./CONTRIBUTING.md)): +NOTE: Before submitting a new tutorial, read [PyTorch Tutorial Submission Policy](./tutorial_submission_policy.md). + 1. Create a Python file. If you want it executed while inserted into documentation, save the file with the suffix `tutorial` so that the file name is `your_tutorial.py`. 2. Put it in one of the `beginner_source`, `intermediate_source`, `advanced_source` directory based on the level of difficulty. If it is a recipe, add it to `recipes_source`. For tutorials demonstrating unstable prototype features, add to the `prototype_source`. -2. For Tutorials (except if it is a prototype feature), include it in the `toctree` directive and create a `customcarditem` in [index.rst](./index.rst). -3. For Tutorials (except if it is a prototype feature), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/main/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes_index.rst) +3. For Tutorials (except if it is a prototype feature), include it in the `toctree` directive and create a `customcarditem` in [index.rst](./index.rst). +4. For Tutorials (except if it is a prototype feature), create a thumbnail in the [index.rst file](https://github.com/pytorch/tutorials/blob/main/index.rst) using a command like `.. customcarditem:: beginner/your_tutorial.html`. 
For Recipes, create a thumbnail in the [recipes_index.rst](https://github.com/pytorch/tutorials/blob/main/recipes_source/recipes_index.rst) If you are starting off with a Jupyter notebook, you can use [this script](https://gist.github.com/chsasank/7218ca16f8d022e02a9c0deb94a310fe) to convert the notebook to Python file. After conversion and addition to the project, please make sure that section headings and other things are in logical order. ## Building locally -The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code: +The tutorial build is very large and requires a GPU. If your machine does not have a GPU device, you can preview your HTML build without actually downloading the data and running the tutorial code: 1. Install required dependencies by running: `pip install -r requirements.txt`. -> If you want to use `virtualenv`, in the root of the repo, run: `virtualenv venv`, then `source venv/bin/activate`. +> Typically, you would run either in `conda` or `virtualenv`. If you want to use `virtualenv`, in the root of the repo, run: `virtualenv venv`, then `source venv/bin/activate`. - If you have a GPU-powered laptop, you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. This might take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, then see next step. - You can skip the computationally intensive graph generation by running `make html-noplot` to build basic html documentation to `_build/html`. This way, you can quickly preview your tutorial. -> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2** from /tutorials/src/pytorch-sphinx-theme or /venv/src/pytorch-sphinx-theme (while using virtualenv), run `python setup.py install`. 
- ## Building a single tutorial You can build a single tutorial by using the `GALLERY_PATTERN` environment variable. For example to run only `neural_style_transfer_tutorial.py`, run: @@ -48,7 +57,13 @@ GALLERY_PATTERN="neural_style_transfer_tutorial.py" sphinx-build . _build The `GALLERY_PATTERN` variable respects regular expressions. + ## About contributing to PyTorch Documentation and Tutorials -* You can find information about contributing to PyTorch documentation in the -PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. +* You can find information about contributing to PyTorch documentation in the +PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file. * Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md). + + +## License + +PyTorch Tutorials is BSD licensed, as found in the LICENSE file. diff --git a/_static/css/custom.css b/_static/css/custom.css index 7b7055fff78..a0882c1d4fc 100755 --- a/_static/css/custom.css +++ b/_static/css/custom.css @@ -71,3 +71,27 @@ .sd-card:hover:after { transform: scaleX(1); } + +.card-prerequisites:hover { + transition: none; + border: none; +} + +.card-prerequisites:hover:after { + transition: none; + transform: none; +} + +.card-prerequisites:after { + display: block; + content: ''; + border-bottom: none; + background-color: #fff; + transform: none; + transition: none; + transform-origin: none; +} + +.pytorch-left-menu-search input[type=text] { + background-image: url("../images/search-icon.svg"); +} diff --git a/_static/css/custom2.css b/_static/css/custom2.css new file mode 100644 index 00000000000..a24ee796872 --- /dev/null +++ b/_static/css/custom2.css @@ -0,0 +1,112 @@ +/* Survey banner .css */ + +.survey-banner { + margin-top: 10px; + background-color: #f3f4f7; + padding-top: 15px; + padding-left: 10px; + padding-bottom: 1px; +} + +@media screen and (max-width: 600px) { + 
.survey-banner { + padding-top: 5px; + padding-left: 5px; + padding-bottom: -1px; + font-size: 12px; + margin-bottom: 5px; + } +} + +/* Left nav for 2nd level nav */ + +.pytorch-left-menu li.toctree-l2 { + padding-left: 10px; +} + +.pytorch-left-menu li.toctree-l2.current > a { + color: #ee4c2c; +} + +.pytorch-left-menu li.toctree-l2.current a:link.reference.internal { + color: #ee4c2c; +} + +.pytorch-left-menu li.toctree-l1.current > a:before { + content: ""; +} + +/* search radio button*/ + +input[type="radio"] { + accent-color: #ee4c2c; +} + +.gsst_b { + display: none; +} + +#gsc-i-id1 { + height: 1.5rem; + text-indent: 12px !important; + font-size: 1rem !important; + font-family: "FreightSansi"; + background-image: url(../images/search-icon.svg) !important; + background-repeat: no-repeat !important; + background-size: 18px 18px !important; + background-position: 5px 0px !important; + padding-left: 20px !important; +} + +#gsc-i-id1::placeholder { + font-family: 'FreightSans'; + font-size: 1rem; + color: #262626; +} + +.gsc-control-cse { + padding: 0 !important; + border-radius: 0px !important; + border: none !important; +} + +.gsc-overflow-hidden { + overflow: visible !important; +} + +#___gcse_0 { + height: 44px !important; + padding: 0 !important; +} + +table.gsc-search-box td.gsc-input { + padding-right: 0 !important; +} + +table.gsc-search-box td { + height: 44px; + margin-bottom: 0 !important; + padding-bottom: 0 !important; +} + +.gsc-search-button-v2 { + display: none; +} + +.gs_id50 { + width: 308px; +} + +.gsib_a { + padding: 0px 8px 4px 9px !important; +} + +.gsc-input-box { + border-radius: 0px !important; + border: none !important; +} + +form.gsc-search-box { + margin-bottom: 0px; +} + diff --git a/_static/img/ExecuTorch-Logo-cropped.svg b/_static/img/ExecuTorch-Logo-cropped.svg new file mode 100644 index 00000000000..9e0ef52fbd8 --- /dev/null +++ b/_static/img/ExecuTorch-Logo-cropped.svg @@ -0,0 +1,57 @@ + + + + + + + + + + + diff --git 
a/_static/img/cat_resized.jpg b/_static/img/cat_resized.jpg new file mode 100644 index 00000000000..c7746e65308 Binary files /dev/null and b/_static/img/cat_resized.jpg differ diff --git a/_static/img/compiled_autograd/call_hook_node.png b/_static/img/compiled_autograd/call_hook_node.png new file mode 100644 index 00000000000..3e094cf6f73 Binary files /dev/null and b/_static/img/compiled_autograd/call_hook_node.png differ diff --git a/_static/img/compiled_autograd/entire_verbose_log.png b/_static/img/compiled_autograd/entire_verbose_log.png new file mode 100644 index 00000000000..4ce2b8538ee Binary files /dev/null and b/_static/img/compiled_autograd/entire_verbose_log.png differ diff --git a/_static/img/compiled_autograd/recompile_due_to_dynamic.png b/_static/img/compiled_autograd/recompile_due_to_dynamic.png new file mode 100644 index 00000000000..41ae56acf2d Binary files /dev/null and b/_static/img/compiled_autograd/recompile_due_to_dynamic.png differ diff --git a/_static/img/compiled_autograd/recompile_due_to_node.png b/_static/img/compiled_autograd/recompile_due_to_node.png new file mode 100644 index 00000000000..800a1784587 Binary files /dev/null and b/_static/img/compiled_autograd/recompile_due_to_node.png differ diff --git a/_static/img/distributed/device_mesh.png b/_static/img/distributed/device_mesh.png new file mode 100644 index 00000000000..2ccabcc4824 Binary files /dev/null and b/_static/img/distributed/device_mesh.png differ diff --git a/_static/img/distributed/fsdp_sharding.png b/_static/img/distributed/fsdp_sharding.png new file mode 100755 index 00000000000..9dd1e3c111e Binary files /dev/null and b/_static/img/distributed/fsdp_sharding.png differ diff --git a/_static/img/distributed/fsdp_tp.png b/_static/img/distributed/fsdp_tp.png new file mode 100644 index 00000000000..e419304ac7d Binary files /dev/null and b/_static/img/distributed/fsdp_tp.png differ diff --git a/_static/img/distributed/loss_parallel.png 
b/_static/img/distributed/loss_parallel.png new file mode 100644 index 00000000000..b5cf9a499bc Binary files /dev/null and b/_static/img/distributed/loss_parallel.png differ diff --git a/_static/img/distributed/megatron_lm.png b/_static/img/distributed/megatron_lm.png new file mode 100644 index 00000000000..38f7b06639f Binary files /dev/null and b/_static/img/distributed/megatron_lm.png differ diff --git a/_static/img/distributed/tcpstore_barrier_time.png b/_static/img/distributed/tcpstore_barrier_time.png new file mode 100644 index 00000000000..5ece3a7471d Binary files /dev/null and b/_static/img/distributed/tcpstore_barrier_time.png differ diff --git a/_static/img/distributed/tcpstore_init_time.png b/_static/img/distributed/tcpstore_init_time.png new file mode 100644 index 00000000000..df514b4dc48 Binary files /dev/null and b/_static/img/distributed/tcpstore_init_time.png differ diff --git a/_static/img/hta/comm_across_ranks.png b/_static/img/hta/comm_across_ranks.png new file mode 100644 index 00000000000..2336de3bcbc Binary files /dev/null and b/_static/img/hta/comm_across_ranks.png differ diff --git a/_static/img/hta/counts_diff.png b/_static/img/hta/counts_diff.png new file mode 100644 index 00000000000..34575c145de Binary files /dev/null and b/_static/img/hta/counts_diff.png differ diff --git a/_static/img/hta/cuda_kernel_launch.png b/_static/img/hta/cuda_kernel_launch.png new file mode 100644 index 00000000000..e57c54a2fc5 Binary files /dev/null and b/_static/img/hta/cuda_kernel_launch.png differ diff --git a/_static/img/hta/cuda_kernel_launch_stats.png b/_static/img/hta/cuda_kernel_launch_stats.png new file mode 100644 index 00000000000..33a160fc752 Binary files /dev/null and b/_static/img/hta/cuda_kernel_launch_stats.png differ diff --git a/_static/img/hta/duration_diff.png b/_static/img/hta/duration_diff.png new file mode 100644 index 00000000000..050d491c872 Binary files /dev/null and b/_static/img/hta/duration_diff.png differ diff --git 
a/_static/img/hta/idle_time.png b/_static/img/hta/idle_time.png new file mode 100644 index 00000000000..782bfe9adb5 Binary files /dev/null and b/_static/img/hta/idle_time.png differ diff --git a/_static/img/hta/idle_time_breakdown_percentage.png b/_static/img/hta/idle_time_breakdown_percentage.png new file mode 100644 index 00000000000..3bab5946eab Binary files /dev/null and b/_static/img/hta/idle_time_breakdown_percentage.png differ diff --git a/_static/img/hta/idle_time_summary.png b/_static/img/hta/idle_time_summary.png new file mode 100644 index 00000000000..101b696b534 Binary files /dev/null and b/_static/img/hta/idle_time_summary.png differ diff --git a/_static/img/hta/kernel_metrics_df.png b/_static/img/hta/kernel_metrics_df.png new file mode 100644 index 00000000000..53eefb58b0c Binary files /dev/null and b/_static/img/hta/kernel_metrics_df.png differ diff --git a/_static/img/hta/kernel_type_breakdown.png b/_static/img/hta/kernel_type_breakdown.png new file mode 100644 index 00000000000..29a29cf89b2 Binary files /dev/null and b/_static/img/hta/kernel_type_breakdown.png differ diff --git a/_static/img/hta/launch_delay_outliers.png b/_static/img/hta/launch_delay_outliers.png new file mode 100644 index 00000000000..9bb455adea4 Binary files /dev/null and b/_static/img/hta/launch_delay_outliers.png differ diff --git a/_static/img/hta/mem_bandwidth_queue_length.png b/_static/img/hta/mem_bandwidth_queue_length.png new file mode 100644 index 00000000000..9df5383b5d9 Binary files /dev/null and b/_static/img/hta/mem_bandwidth_queue_length.png differ diff --git a/_static/img/hta/overlap_df.png b/_static/img/hta/overlap_df.png new file mode 100644 index 00000000000..ef164a28a12 Binary files /dev/null and b/_static/img/hta/overlap_df.png differ diff --git a/_static/img/hta/overlap_plot.png b/_static/img/hta/overlap_plot.png new file mode 100644 index 00000000000..acd449bc7ff Binary files /dev/null and b/_static/img/hta/overlap_plot.png differ diff --git 
a/_static/img/hta/pie_charts.png b/_static/img/hta/pie_charts.png new file mode 100644 index 00000000000..fa9137109a6 Binary files /dev/null and b/_static/img/hta/pie_charts.png differ diff --git a/_static/img/hta/queue_length_summary.png b/_static/img/hta/queue_length_summary.png new file mode 100644 index 00000000000..639a03fb6d1 Binary files /dev/null and b/_static/img/hta/queue_length_summary.png differ diff --git a/_static/img/hta/runtime_outliers.png b/_static/img/hta/runtime_outliers.png new file mode 100644 index 00000000000..1e2dfff9006 Binary files /dev/null and b/_static/img/hta/runtime_outliers.png differ diff --git a/_static/img/hta/short_gpu_kernels.png b/_static/img/hta/short_gpu_kernels.png new file mode 100644 index 00000000000..ff382a3a7f0 Binary files /dev/null and b/_static/img/hta/short_gpu_kernels.png differ diff --git a/_static/img/hta/temporal_breakdown_df.png b/_static/img/hta/temporal_breakdown_df.png new file mode 100644 index 00000000000..dce1829d113 Binary files /dev/null and b/_static/img/hta/temporal_breakdown_df.png differ diff --git a/_static/img/hta/temporal_breakdown_plot.png b/_static/img/hta/temporal_breakdown_plot.png new file mode 100644 index 00000000000..9c5f45c1d35 Binary files /dev/null and b/_static/img/hta/temporal_breakdown_plot.png differ diff --git a/_static/img/itt_tutorial/vtune_xpu_config.png b/_static/img/itt_tutorial/vtune_xpu_config.png new file mode 100644 index 00000000000..80dd1812d26 Binary files /dev/null and b/_static/img/itt_tutorial/vtune_xpu_config.png differ diff --git a/_static/img/itt_tutorial/vtune_xpu_timeline.png b/_static/img/itt_tutorial/vtune_xpu_timeline.png new file mode 100644 index 00000000000..43818cf105c Binary files /dev/null and b/_static/img/itt_tutorial/vtune_xpu_timeline.png differ diff --git a/_static/img/onnx/custom_addandround_function.png b/_static/img/onnx/custom_addandround_function.png new file mode 100644 index 00000000000..a0c7000161e Binary files /dev/null and 
b/_static/img/onnx/custom_addandround_function.png differ diff --git a/_static/img/onnx/custom_addandround_model.png b/_static/img/onnx/custom_addandround_model.png new file mode 100644 index 00000000000..793d8cfbb5d Binary files /dev/null and b/_static/img/onnx/custom_addandround_model.png differ diff --git a/_static/img/onnx/custom_aten_add_function.png b/_static/img/onnx/custom_aten_add_function.png new file mode 100644 index 00000000000..8ef05a747a0 Binary files /dev/null and b/_static/img/onnx/custom_aten_add_function.png differ diff --git a/_static/img/onnx/custom_aten_add_model.png b/_static/img/onnx/custom_aten_add_model.png new file mode 100644 index 00000000000..e5ef1c71742 Binary files /dev/null and b/_static/img/onnx/custom_aten_add_model.png differ diff --git a/_static/img/onnx/custom_aten_gelu_model.png b/_static/img/onnx/custom_aten_gelu_model.png new file mode 100644 index 00000000000..5b326690eb7 Binary files /dev/null and b/_static/img/onnx/custom_aten_gelu_model.png differ diff --git a/_static/img/pendulum.gif b/_static/img/pendulum.gif new file mode 100644 index 00000000000..a7adf181fc8 Binary files /dev/null and b/_static/img/pendulum.gif differ diff --git a/_static/img/pinmem/pinmem.png b/_static/img/pinmem/pinmem.png new file mode 100644 index 00000000000..9d84e9d229d Binary files /dev/null and b/_static/img/pinmem/pinmem.png differ diff --git a/_static/img/pinmem/trace_streamed0_pinned0.png b/_static/img/pinmem/trace_streamed0_pinned0.png new file mode 100644 index 00000000000..dedac997b0b Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned0.png differ diff --git a/_static/img/pinmem/trace_streamed0_pinned1.png b/_static/img/pinmem/trace_streamed0_pinned1.png new file mode 100644 index 00000000000..2d5ff462e1a Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned1.png differ diff --git a/_static/img/pinmem/trace_streamed1_pinned0.png b/_static/img/pinmem/trace_streamed1_pinned0.png new file mode 100644 
index 00000000000..130182a1978 Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned0.png differ diff --git a/_static/img/pinmem/trace_streamed1_pinned1.png b/_static/img/pinmem/trace_streamed1_pinned1.png new file mode 100644 index 00000000000..c596fcdb691 Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned1.png differ diff --git a/_static/img/profiler_rocm_chrome_trace_view.png b/_static/img/profiler_rocm_chrome_trace_view.png new file mode 100644 index 00000000000..cff7ba98c8a Binary files /dev/null and b/_static/img/profiler_rocm_chrome_trace_view.png differ diff --git a/_static/img/profiler_rocm_tensorboard_operartor_view.png b/_static/img/profiler_rocm_tensorboard_operartor_view.png new file mode 100644 index 00000000000..27effb91e7c Binary files /dev/null and b/_static/img/profiler_rocm_tensorboard_operartor_view.png differ diff --git a/_static/img/python_extension_autoload_impl.png b/_static/img/python_extension_autoload_impl.png new file mode 100644 index 00000000000..64e18fc7b4b Binary files /dev/null and b/_static/img/python_extension_autoload_impl.png differ diff --git a/_static/img/rollout_recurrent.png b/_static/img/rollout_recurrent.png new file mode 100644 index 00000000000..2ce24d40d23 Binary files /dev/null and b/_static/img/rollout_recurrent.png differ diff --git a/_static/img/thumbnails/cropped/Large-Scale-Transformer-model-training-with-Tensor-Parallel.png b/_static/img/thumbnails/cropped/Large-Scale-Transformer-model-training-with-Tensor-Parallel.png new file mode 100644 index 00000000000..426a14d98f5 Binary files /dev/null and b/_static/img/thumbnails/cropped/Large-Scale-Transformer-model-training-with-Tensor-Parallel.png differ diff --git a/_static/img/thumbnails/cropped/TIAToolbox-Tutorial.png b/_static/img/thumbnails/cropped/TIAToolbox-Tutorial.png new file mode 100644 index 00000000000..76f2bcaf4de Binary files /dev/null and b/_static/img/thumbnails/cropped/TIAToolbox-Tutorial.png differ diff --git 
a/_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp b/_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp new file mode 100644 index 00000000000..5a1ca81e07d Binary files /dev/null and b/_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png new file mode 100644 index 00000000000..fafd95768a1 Binary files /dev/null and b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png new file mode 100644 index 00000000000..fd6f7aba1f4 Binary files /dev/null and b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png new file mode 100644 index 00000000000..8feda69de2d Binary files /dev/null and b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png new file mode 100644 index 00000000000..8feda69de2d Binary files /dev/null and b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png differ diff --git a/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png new file mode 100644 index 00000000000..e17e03812ce Binary files /dev/null and b/_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png differ diff --git a/_static/img/trace_xpu_img.png b/_static/img/trace_xpu_img.png new file mode 100644 index 00000000000..2eca0a78cb6 Binary files /dev/null and b/_static/img/trace_xpu_img.png differ diff --git a/_static/img/tv_tutorial/tv_image01.png b/_static/img/tv_tutorial/tv_image01.png deleted file mode 100644 index 
bb47d27d24e..00000000000 Binary files a/_static/img/tv_tutorial/tv_image01.png and /dev/null differ diff --git a/_static/img/tv_tutorial/tv_image02.png b/_static/img/tv_tutorial/tv_image02.png deleted file mode 100644 index 8717199010b..00000000000 Binary files a/_static/img/tv_tutorial/tv_image02.png and /dev/null differ diff --git a/_static/img/tv_tutorial/tv_image05.png b/_static/img/tv_tutorial/tv_image05.png deleted file mode 100644 index 3961033693a..00000000000 Binary files a/_static/img/tv_tutorial/tv_image05.png and /dev/null differ diff --git a/_static/img/tv_tutorial/tv_image06.png b/_static/img/tv_tutorial/tv_image06.png deleted file mode 100644 index 4c20d89026a..00000000000 Binary files a/_static/img/tv_tutorial/tv_image06.png and /dev/null differ diff --git a/_static/img/usb_semisup_learn/code.png b/_static/img/usb_semisup_learn/code.png new file mode 100644 index 00000000000..fdc7b798a37 Binary files /dev/null and b/_static/img/usb_semisup_learn/code.png differ diff --git a/_static/js/custom.js b/_static/js/custom.js new file mode 100644 index 00000000000..3e6c7fb8312 --- /dev/null +++ b/_static/js/custom.js @@ -0,0 +1,52 @@ +document.addEventListener("DOMContentLoaded", function() { + // Select all
<li> elements with the class "toctree-l1" + var toctreeItems = document.querySelectorAll('li.toctree-l1'); + + toctreeItems.forEach(function(item) { + // Find the link within the item + var link = item.querySelector('a'); + var nestedList = item.querySelector('ul'); + + if (link && nestedList) { + // Create a span element for the "[+]" or "[-]" sign + var expandSign = document.createElement('span'); + expandSign.style.cursor = 'pointer'; // Make it look clickable + + // Use the link text as a unique key for localStorage + var sectionKey = 'section_' + link.textContent.trim().replace(/\s+/g, '_'); + + // Retrieve the saved state from localStorage + var isExpanded = localStorage.getItem(sectionKey); + + // If no state is saved, default to expanded for "Learn the Basics" and collapsed for others + if (isExpanded === null) { + isExpanded = (link.textContent.trim() === 'Learn the Basics') ? 'true' : 'false'; + localStorage.setItem(sectionKey, isExpanded); + } + + if (isExpanded === 'true') { + nestedList.style.display = 'block'; // Expand the section + expandSign.textContent = '[-] '; // Show "[-]" since it's expanded + } else { + nestedList.style.display = 'none'; // Collapse the section + expandSign.textContent = '[+] '; // Show "[+]" since it's collapsed + } + + // Add a click event to toggle the nested list + expandSign.addEventListener('click', function() { + if (nestedList.style.display === 'none') { + nestedList.style.display = 'block'; + expandSign.textContent = '[-] '; // Change to "[-]" when expanded + localStorage.setItem(sectionKey, 'true'); // Save state + } else { + nestedList.style.display = 'none'; + expandSign.textContent = '[+] '; // Change back to "[+]" when collapsed + localStorage.setItem(sectionKey, 'false'); // Save state + } + }); + + // Insert the sign before the link + link.parentNode.insertBefore(expandSign, link); + } + }); +}); diff --git a/_static/tiatoolbox_tutorial.ipynb b/_static/tiatoolbox_tutorial.ipynb new file mode 100644 index
00000000000..35cb4bc5693 --- /dev/null +++ b/_static/tiatoolbox_tutorial.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"id":"YWsXrOQGyiNu"},"source":["# Whole Slide Image Classification Using PyTorch and TIAToolbox\n"]},{"cell_type":"markdown","metadata":{"id":"yLUSqCAMyiNz"},"source":["## Introduction\n","\n","In this tutorial, we will show how to classify Whole Slide Images (WSIs) using PyTorch deep learning models with help from TIAToolbox. A WSI represents human tissues taken through an operation or a biopsy and scanned using specialized scanners. They are used by pathologists and computational pathology researchers to [study cancer at the microscopic level](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7522141/) in order to understand for example tumor growth and help improve treatment for patients.\n","\n","What makes WSIs challenging to process is their enormous size. For example, a typical slide image has in the order of [100,000x100,000 pixels](https://doi.org/10.1117%2F12.912388) where each pixel can correspond to about 0.25x0.25 microns on the slide. This introduces challenges in loading and processing such images, not to mention hundreds or even thousands of WSIs in a single study (larger studies produce better results)!\n","\n","Conventional image processing pipelines are not suitable for WSI processing so we need better tools. This is where [TIAToolbox](https://github.com/TissueImageAnalytics/tiatoolbox) can help as it brings a set of useful tools to import and process tissue slides in a fast and computationally efficient manner. Typically, WSIs are saved in a pyramid structure with multiple copies of the same image at various magnification levels optimized for visualization. The level 0 (or the bottom level) of the pyramid contains the image at the highest magnification or zoom level, whereas the higher levels in the pyramid have a lower resolution copy of the base image. 
The pyramid structure is sketched below.\n","\n","![WSI pyramid stack](https://tia-toolbox.readthedocs.io/en/latest/_images/read_bounds_tissue.png)\n","*WSI pyramid stack ([source](https://tia-toolbox.readthedocs.io/en/latest/_autosummary/tiatoolbox.wsicore.wsireader.WSIReader.html#))*\n","\n","
    \n","\n","TIAToolbox allows us to automate common downstream analysis tasks such as [tissue classification](https://doi.org/10.1016/j.media.2022.102685). In this tutorial we will show you how you can:\n","1. Load WSI images using TIAToolbox; and\n","2. Use different PyTorch models to classify slides at the batch-level. In this tutorial, we will provide an example of using TorchVision's `ResNet18` model and custom [`HistoEncoder`](https://github.com/jopo666/HistoEncoder) model.\n","\n","Let's get started!"]},{"cell_type":"markdown","metadata":{"id":"EPiF6kU5yiN0","tags":["remove-cell"]},"source":["## Setting up the environment\n","To run the examples provided in this tutorial, the following packages are required as prerequisites:\n","\n","1. OpenJpeg\n","2. OpenSlide\n","3. Pixman\n","4. TIAToolbox\n","5. HistoEncoder (for a custom model example)\n","\n","Please run the following command in your terminal to install these packages:"]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"end_time":"2023-11-10T18:40:04.895625400Z","start_time":"2023-11-10T18:40:04.621790200Z"},"id":"oCOSzUCUXnfh","tags":["remove-cell"],"vscode":{"languageId":"shellscript"}},"outputs":[],"source":["%%bash\n","apt-get -y install libopenjp2-7-dev libopenjp2-tools openslide-tools libpixman-1-dev | tail -n 1\n","pip install histoencoder | tail -n 1\n","pip install git+https://github.com/TissueImageAnalytics/tiatoolbox.git@develop | tail -n 1\n","echo \"Installation is done.\""]},{"cell_type":"markdown","metadata":{"id":"seaUmzYoSANq"},"source":["Alternatively, you can run `brew install openjpeg openslide` to install the prerequisite packages on MacOS instead of `apt-get`. Further information on installation can be [found here](https://tia-toolbox.readthedocs.io/en/latest/installation.html). 
You will likely need to restart the runtime in the runtime menu at the top of the page to continue with the rest of the tutorial, in order for the newly installed dependencies to be picked up."]},{"cell_type":"markdown","metadata":{"id":"bGp2XDMAX1GB"},"source":["### Importing related libraries\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"end_time":"2023-11-10T18:43:40.489228400Z","start_time":"2023-11-10T18:43:39.434913Z"},"id":"SNbdWfvnFtG5"},"outputs":[],"source":["\"\"\"Import modules required to run the Jupyter notebook.\"\"\"\n","from __future__ import annotations\n","\n","# Configure logging\n","import logging\n","import warnings\n","if logging.getLogger().hasHandlers():\n"," logging.getLogger().handlers.clear()\n","warnings.filterwarnings(\"ignore\", message=\".*The 'nopython' keyword.*\")\n","\n","# Downloading data and files\n","import shutil\n","from pathlib import Path\n","from zipfile import ZipFile\n","\n","# Data processing and visualization\n","import matplotlib as mpl\n","import matplotlib.pyplot as plt\n","import numpy as np\n","import pandas as pd\n","from matplotlib import cm\n","import PIL\n","import contextlib\n","import io\n","from sklearn.metrics import accuracy_score, confusion_matrix\n","\n","# TIAToolbox for WSI loading and processing\n","from tiatoolbox import logger\n","from tiatoolbox.models.architecture import vanilla\n","from tiatoolbox.models.engine.patch_predictor import (\n"," IOPatchPredictorConfig,\n"," PatchPredictor,\n",")\n","from tiatoolbox.utils.misc import download_data, grab_files_from_dir\n","from tiatoolbox.utils.visualization import overlay_prediction_mask\n","from tiatoolbox.wsicore.wsireader import WSIReader\n","\n","# Torch-related\n","import torch\n","from torchvision import transforms\n","\n","# Configure plotting\n","mpl.rcParams[\"figure.dpi\"] = 160 # for high resolution figure in notebook\n","mpl.rcParams[\"figure.facecolor\"] = \"white\" # To make sure text is visible in 
dark mode\n","\n","# If you are not using GPU, change ON_GPU to False\n","ON_GPU = True\n","\n","# Function to suppress console output for overly verbose code blocks\n","def suppress_console_output():\n"," return contextlib.redirect_stderr(io.StringIO())"]},{"cell_type":"markdown","metadata":{"collapsed":false,"id":"X8dSUvDHSANq"},"source":["### Clean-up before a run\n","\n","To ensure proper clean-up (for example in abnormal termination), all files downloaded or created in this run are saved in a single directory `global_save_dir`, which we set equal to \"./tmp/\". To simplify maintenance, the name of the directory occurs only at this one place, so that it can easily be changed, if desired.\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"end_time":"2023-11-10T18:41:51.192871200Z","start_time":"2023-11-10T18:41:51.160504Z"},"colab":{"base_uri":"https://localhost:8080/"},"id":"YibjAicoAVS1","outputId":"0006363f-003a-42d2-ee34-25105b6339a4","tags":["remove-cell"]},"outputs":[{"name":"stdout","output_type":"stream","text":["|2023-11-12|17:47:11.792| [INFO] Removing directory tmp\n","|2023-11-12|17:47:11.792| [INFO] Creating new directory tmp\n"]}],"source":["warnings.filterwarnings(\"ignore\")\n","global_save_dir = Path(\"./tmp/\")\n","\n","\n","def rmdir(dir_path: str | Path) -> None:\n"," \"\"\"Helper function to delete directory.\"\"\"\n"," if Path(dir_path).is_dir():\n"," shutil.rmtree(dir_path)\n"," logger.info(\"Removing directory %s\", dir_path)\n","\n","\n","rmdir(global_save_dir) # remove directory if it exists from previous runs\n","global_save_dir.mkdir()\n","logger.info(\"Creating new directory %s\", global_save_dir)"]},{"cell_type":"markdown","metadata":{"id":"TlgYO3n0FtG6"},"source":["### Downloading the data\n","For our sample data, we will use one whole-slide image, and patches from the validation subset of [Kather 100k](https://zenodo.org/record/1214456#.YJ-tn3mSkuU) 
dataset.\n"]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"end_time":"2023-11-10T18:41:56.177054800Z","start_time":"2023-11-10T18:41:56.104412700Z"},"colab":{"base_uri":"https://localhost:8080/"},"id":"l7CzZGFHFtG6","outputId":"39bd40d4-9f0c-4f0a-e18a-e7e982e8364e","tags":["hide-output"]},"outputs":[{"name":"stdout","output_type":"stream","text":["|2023-11-12|17:47:11.797| [INFO] Download has started. Please wait...\n","|2023-11-12|17:47:28.245| [INFO] Download is complete.\n"]}],"source":["wsi_path = global_save_dir / \"sample_wsi.svs\"\n","patches_path = global_save_dir / \"kather100k-validation-sample.zip\"\n","weights_path = global_save_dir / \"resnet18-kather100k.pth\"\n","\n","logger.info(\"Download has started. Please wait...\")\n","\n","# Downloading and unzip a sample whole-slide image\n","download_data(\n"," \"https://tiatoolbox.dcs.warwick.ac.uk/sample_wsis/TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5A-FCBEDFC2394F.svs\",\n"," wsi_path,\n",")\n","\n","# Download and unzip a sample of the validation set used to train the Kather 100K dataset\n","download_data(\n"," \"https://tiatoolbox.dcs.warwick.ac.uk/datasets/kather100k-validation-sample.zip\",\n"," patches_path,\n",")\n","with ZipFile(patches_path, \"r\") as zipfile:\n"," zipfile.extractall(path=global_save_dir)\n","\n","# Download pretrained model weights for WSI classification using ResNet18 architecture\n","download_data(\n"," \"https://tiatoolbox.dcs.warwick.ac.uk/models/pc/resnet18-kather100k.pth\",\n"," weights_path,\n",")\n","\n","logger.info(\"Download is complete.\")"]},{"cell_type":"markdown","metadata":{"id":"qdaSTKE8FtG7"},"source":["## Reading the data\n","\n","We create a list of patches and a list of corresponding labels.\n","For example, the first label in `label_list` will indicate the class of the first image patch in 
`patch_list`.\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"start_time":"2023-11-10T18:40:05.791111900Z"},"colab":{"base_uri":"https://localhost:8080/","height":886},"id":"5sF4Q-6Px6IV","outputId":"4c474a52-24ca-4947-9cf0-08dcfe960702"},"outputs":[{"name":"stdout","output_type":"stream","text":["|2023-11-12|17:47:28.276| [INFO] Class ID: 0 -- Class Name: BACK -- Number of images: 211\n","|2023-11-12|17:47:28.276| [INFO] Class ID: 1 -- Class Name: NORM -- Number of images: 176\n","|2023-11-12|17:47:28.277| [INFO] Class ID: 2 -- Class Name: DEB -- Number of images: 230\n","|2023-11-12|17:47:28.277| [INFO] Class ID: 3 -- Class Name: TUM -- Number of images: 286\n","|2023-11-12|17:47:28.277| [INFO] Class ID: 4 -- Class Name: ADI -- Number of images: 208\n","|2023-11-12|17:47:28.277| [INFO] Class ID: 5 -- Class Name: MUC -- Number of images: 178\n","|2023-11-12|17:47:28.277| [INFO] Class ID: 6 -- Class Name: MUS -- Number of images: 270\n","|2023-11-12|17:47:28.278| [INFO] Class ID: 7 -- Class Name: STR -- Number of images: 209\n","|2023-11-12|17:47:28.278| [INFO] Class ID: 8 -- Class Name: LYM -- Number of images: 232\n","|2023-11-12|17:47:28.278| [INFO] Total number of patches: 2000\n"]},{"data":{"image/png":"","text/plain":["
    "]},"metadata":{},"output_type":"display_data"}],"source":["# Read the patch data and create a list of patches and a list of corresponding labels\n","dataset_path = global_save_dir / \"kather100k-validation-sample\"\n","\n","# Set the path to the dataset\n","image_ext = \".tif\" # file extension of each image\n","\n","# Obtain the mapping between the label ID and the class name\n","label_dict = {\n"," \"BACK\": 0, # Background (empty glass region)\n"," \"NORM\": 1, # Normal colon mucosa\n"," \"DEB\": 2, # Debris\n"," \"TUM\": 3, # Colorectal adenocarcinoma epithelium\n"," \"ADI\": 4, # Adipose\n"," \"MUC\": 5, # Mucus\n"," \"MUS\": 6, # Smooth muscle\n"," \"STR\": 7, # Cancer-associated stroma\n"," \"LYM\": 8, # Lymphocytes\n","}\n","\n","class_names = list(label_dict.keys())\n","class_labels = list(label_dict.values())\n","\n","# Generate a list of patches and generate the label from the filename\n","patch_list = []\n","label_list = []\n","for class_name, label in label_dict.items():\n"," dataset_class_path = dataset_path / class_name\n"," patch_list_single_class = grab_files_from_dir(\n"," dataset_class_path,\n"," file_types=\"*\" + image_ext,\n"," )\n"," patch_list.extend(patch_list_single_class)\n"," label_list.extend([label] * len(patch_list_single_class))\n","\n","# Show some dataset statistics\n","plt.bar(class_names, [label_list.count(label) for label in class_labels])\n","plt.xlabel(\"Patch types\")\n","plt.ylabel(\"Number of patches\")\n","\n","# Count the number of examples per class\n","for class_name, label in label_dict.items():\n"," logger.info(\n"," \"Class ID: %d -- Class Name: %s -- Number of images: %d\",\n"," label,\n"," class_name,\n"," label_list.count(label),\n"," )\n","\n","# Overall dataset statistics\n","logger.info(\"Total number of patches: %d\", (len(patch_list)))"]},{"cell_type":"markdown","metadata":{"id":"r8tg66bu48Vh"},"source":["As you can see for this patch dataset, we have 9 classes/labels with IDs 0-8 and associated class 
names describing the dominant tissue type in the patch:\n","\n","- BACK ⟶ Background (empty glass region)\n","- LYM ⟶ Lymphocytes\n","- NORM ⟶ Normal colon mucosa\n","- DEB ⟶ Debris\n","- MUS ⟶ Smooth muscle\n","- STR ⟶ Cancer-associated stroma\n","- ADI ⟶ Adipose\n","- MUC ⟶ Mucus\n","- TUM ⟶ Colorectal adenocarcinoma epithelium\n","\n"]},{"cell_type":"markdown","metadata":{"id":"UxBdhIE-FtG7"},"source":["## Classify image patches\n","\n","We demonstrate how to obtain a prediction for each patch within a digital slide first with the `patch` mode and then with a large slide using `wsi` mode."]},{"cell_type":"markdown","metadata":{"id":"N8_S93fSVaFS"},"source":["### Define `PatchPredictor` model\n","\n","The PatchPredictor class runs a CNN-based classifier written in PyTorch.\n","\n","- `model` can be any trained PyTorch model with the constraint that it should follow the [`tiatoolbox.models.abc.ModelABC`](https://tia-toolbox.readthedocs.io/en/latest/_autosummary/tiatoolbox.models.models_abc.ModelABC.html) class structure. For more information on this matter, please refer to [our example notebook on advanced model techniques](https://github.com/TissueImageAnalytics/tiatoolbox/blob/develop/examples/07-advanced-modeling.ipynb). In order to load a custom model, you need to write a small preprocessing function, as in `preproc_func(img)`, which makes sure the input tensors are in the right format for the loaded network.\n","- Alternatively, you can pass `pretrained_model` as a string argument. This specifies the CNN model that performs the prediction, and it must be one of the models listed [here](https://tia-toolbox.readthedocs.io/en/latest/usage.html?highlight=pretrained%20models#tiatoolbox.models.architecture.get_pretrained_model). 
The command will look like this: `predictor = PatchPredictor(pretrained_model='resnet18-kather100k', pretrained_weights=weights_path, batch_size=32)`.\n","- `pretrained_weights`: When using a `pretrained_model`, the corresponding pretrained weights will also be downloaded by default. You can override the default with your own set of weights via the `pretrained_weights` argument.\n","- `batch_size`: Number of images fed into the model each time. Higher values for this parameter require a larger (GPU) memory capacity."]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"start_time":"2023-11-10T18:40:05.805638800Z"},"id":"dlQu5878FtG8","tags":["hide-output"]},"outputs":[],"source":["model = vanilla.CNNModel(backbone=\"resnet18\", num_classes=9) # Importing model from torchvision.models.resnet18\n","model.load_state_dict(torch.load(weights_path, map_location=\"cpu\"), strict=True)\n","def preproc_func(img):\n"," img = PIL.Image.fromarray(img)\n"," img = transforms.ToTensor()(img)\n"," return img.permute(1, 2, 0)\n","model.preproc_func = preproc_func\n","predictor = PatchPredictor(model=model, batch_size=32)"]},{"cell_type":"markdown","metadata":{"id":"xKUJrBKkSANr"},"source":["### Predict patch labels\n","\n","We create a predictor object and then call the `predict` method using the `patch` mode. We then compute the classification accuracy and confusion matrix."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"P_NpnknhSANr","outputId":"eadde29a-8fdd-44d8-d238-8498c87edc59"},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|###########################################| 63/63 [00:04<00:00, 13.15it/s]"]},{"name":"stdout","output_type":"stream","text":["|2023-11-12|17:47:33.576| [INFO] Classification accuracy: 0.993000\n"]},{"name":"stderr","output_type":"stream","text":["\n"]},{"data":{"text/html":["
    \n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
    BACKNORMDEBTUMADIMUCMUSSTRLYM
    BACK1.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00000
    NORM0.0000000.9886360.0000000.0113640.0000000.0000000.0000000.0000000.00000
    DEB0.0000000.0000000.9913040.0000000.0000000.0000000.0000000.0086960.00000
    TUM0.0000000.0000000.0000000.9965030.0000000.0034970.0000000.0000000.00000
    ADI0.0048080.0000000.0000000.0000000.9903850.0000000.0048080.0000000.00000
    MUC0.0000000.0000000.0000000.0000000.0000000.9887640.0000000.0112360.00000
    MUS0.0000000.0000000.0000000.0000000.0000000.0000000.9962960.0037040.00000
    STR0.0000000.0000000.0047850.0000000.0000000.0047850.0047850.9856460.00000
    LYM0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0043100.99569
    \n","
    "],"text/plain":[" BACK NORM DEB TUM ADI MUC MUS \n","BACK 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \\\n","NORM 0.000000 0.988636 0.000000 0.011364 0.000000 0.000000 0.000000 \n","DEB 0.000000 0.000000 0.991304 0.000000 0.000000 0.000000 0.000000 \n","TUM 0.000000 0.000000 0.000000 0.996503 0.000000 0.003497 0.000000 \n","ADI 0.004808 0.000000 0.000000 0.000000 0.990385 0.000000 0.004808 \n","MUC 0.000000 0.000000 0.000000 0.000000 0.000000 0.988764 0.000000 \n","MUS 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.996296 \n","STR 0.000000 0.000000 0.004785 0.000000 0.000000 0.004785 0.004785 \n","LYM 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n","\n"," STR LYM \n","BACK 0.000000 0.00000 \n","NORM 0.000000 0.00000 \n","DEB 0.008696 0.00000 \n","TUM 0.000000 0.00000 \n","ADI 0.000000 0.00000 \n","MUC 0.011236 0.00000 \n","MUS 0.003704 0.00000 \n","STR 0.985646 0.00000 \n","LYM 0.004310 0.99569 "]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["with suppress_console_output():\n"," output = predictor.predict(imgs=patch_list, mode=\"patch\", on_gpu=ON_GPU)\n","\n","acc = accuracy_score(label_list, output[\"predictions\"])\n","logger.info(\"Classification accuracy: %f\", acc)\n","\n","# Creating and visualizing the confusion matrix for patch classification results\n","conf = confusion_matrix(label_list, output[\"predictions\"], normalize=\"true\")\n","df_cm = pd.DataFrame(conf, index=class_names, columns=class_names)\n","df_cm"]},{"cell_type":"markdown","metadata":{"id":"6rmVxHVmSANs"},"source":["### Predict patch labels for a whole slide\n","\n","We also introduce `IOPatchPredictorConfig`, a class that specifies the configuration of image reading and prediction writing for the model prediction engine. 
This is required to inform the classifier which level of the WSI pyramid the classifier should read, process data and generate output.\n","\n","Parameters of `IOPatchPredictorConfig` are defined as:\n","\n","- `input_resolutions`: A list, in the form of a dictionary, specifying the resolution of each input. List elements must be in the same order as in the target `model.forward()`. If your model accepts only one input, you just need to put one dictionary specifying `'units'` and `'resolution'`. Note that TIAToolbox supports a model with more than one input. For more information on units and resolution, please see [TIAToolbox documentation](https://tia-toolbox.readthedocs.io/en/latest/_autosummary/tiatoolbox.wsicore.wsireader.WSIReader.html#tiatoolbox.wsicore.wsireader.WSIReader.read_rect).\n","- `patch_input_shape`: Shape of the largest input in (height, width) format.\n","- `stride_shape`: The size of a stride (steps) between two consecutive patches, used in the patch extraction process. If the user sets `stride_shape` equal to `patch_input_shape`, patches will be extracted and processed without any overlap."]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"start_time":"2023-11-10T18:40:05.805638800Z"},"id":"9Kp1kx7wmOYq"},"outputs":[],"source":["wsi_ioconfig = IOPatchPredictorConfig(\n"," input_resolutions=[{\"units\": \"mpp\", \"resolution\": 0.5}],\n"," patch_input_shape=[224, 224],\n"," stride_shape=[224, 224],\n",")"]},{"cell_type":"markdown","metadata":{"id":"drn9RF4-SANs"},"source":["The `predict` method applies the CNN on the input patches and get the results. Here are the arguments and their descriptions:\n","\n","- `mode`: Type of input to be processed. Choose from `patch`, `tile` or `wsi` according to your application.\n","- `imgs`: List of inputs, which should be a list of paths to the input tiles or WSIs.\n","- `return_probabilities`: Set to *__True__* to get per class probabilities alongside predicted labels of input patches. 
If you wish to merge the predictions to generate prediction maps for `tile` or `wsi` modes, you can set `return_probabilities=True`.\n","- `ioconfig`: set the IO configuration information using the `IOPatchPredictorConfig` class.\n","- `resolution` and `unit` (not shown below): These arguments specify the level or micron-per-pixel resolution of the WSI levels from which we plan to extract patches and can be used instead of `ioconfig`. Here we specify the WSI's level as `'baseline'`, which is equivalent to level 0. In general, this is the level of greatest resolution. In this particular case, the image has only one level. More information can be found in the [documentation](https://tia-toolbox.readthedocs.io/en/latest/usage.html?highlight=WSIReader.read_rect#tiatoolbox.wsicore.wsireader.WSIReader.read_rect).\n","- `masks`: A list of paths corresponding to the masks of WSIs in the `imgs` list. These masks specify the regions in the original WSIs from which we want to extract patches. If the mask of a particular WSI is specified as `None`, then the labels for all patches of that WSI (even background regions) would be predicted. This could cause unnecessary computation.\n","- `merge_predictions`: You can set this parameter to `True` if it's required to generate a 2D map of patch classification results. However, for large WSIs this will require large available memory. 
An alternative (default) solution is to set `merge_predictions=False`, and then generate the 2D prediction maps using the `merge_predictions` function as you will see later on.\n","\n","Since we are using a large WSI the patch extraction and prediction processes may take some time (make sure to set the `ON_GPU=True` if you have access to Cuda enabled GPU and PyTorch+Cuda)."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"tUZTP0bKSANs","outputId":"723a5ee7-7f0d-462c-ac59-c6acfb720c85"},"outputs":[{"name":"stderr","output_type":"stream","text":["|2023-11-12|17:47:33.620| [WARNING] Read: Scale > 1.This means that the desired resolution is higher than the WSI baseline (maximum encoded resolution). Interpolation of read regions may occur.\n"]},{"name":"stderr","output_type":"stream","text":["100%|#########################################| 629/629 [02:14<00:00, 4.68it/s]\n"]}],"source":["with suppress_console_output():\n"," wsi_output = predictor.predict(\n"," imgs=[wsi_path],\n"," masks=None,\n"," mode=\"wsi\",\n"," merge_predictions=False,\n"," ioconfig=wsi_ioconfig,\n"," return_probabilities=True,\n"," save_dir=global_save_dir / \"wsi_predictions\",\n"," on_gpu=ON_GPU,\n"," )"]},{"cell_type":"markdown","metadata":{"id":"noAAy35oSANs"},"source":["We see how the prediction model works on our whole-slide images by visualizing the `wsi_output`. We first need to merge patch prediction outputs and then visualize them as an overlay on the original image. As before, the `merge_predictions` method is used to merge the patch predictions. Here we set the parameters `resolution=4, units='mpp'` to generate the prediction map at 4 microns per pixel. If you would like to have higher/lower resolution (bigger/smaller) prediction maps, you need to change these parameters accordingly. 
When the predictions are merged, use the `overlay_patch_prediction` function to overlay the prediction map on the WSI thumbnail, which should be extracted at the resolution used for prediction merging."]},{"cell_type":"code","execution_count":null,"metadata":{"ExecuteTime":{"start_time":"2023-11-10T18:40:05.805638800Z"},"colab":{"base_uri":"https://localhost:8080/","height":1000},"id":"WF_vY2B4i1yi","outputId":"04feef1f-6754-4181-c8a7-20afb35b345c"},"outputs":[{"data":{"text/plain":["(-0.5, 6039.5, 4703.5, -0.5)"]},"execution_count":9,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
    "]},"metadata":{},"output_type":"display_data"}],"source":["overview_resolution = (\n"," 4 # the resolution in which we desire to merge and visualize the patch predictions\n",")\n","# the unit of the `resolution` parameter. Can be \"power\", \"level\", \"mpp\", or \"baseline\"\n","overview_unit = \"mpp\"\n","wsi = WSIReader.open(wsi_path)\n","wsi_overview = wsi.slide_thumbnail(resolution=overview_resolution, units=overview_unit)\n","plt.figure(), plt.imshow(wsi_overview)\n","plt.axis(\"off\")"]},{"cell_type":"markdown","metadata":{"id":"ruKBD5tSSANs"},"source":["Overlaying the prediction map on this image as below gives:"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"RndmFblDSANs","outputId":"48969f6f-55e9-4d7c-bfc8-c286089cd268"},"outputs":[{"data":{"image/png":"","text/plain":["
    "]},"metadata":{},"output_type":"display_data"}],"source":["# Visualization of whole-slide image patch-level prediction\n","# first set up a label to color mapping\n","label_color_dict = {}\n","label_color_dict[0] = (\"empty\", (0, 0, 0))\n","colors = cm.get_cmap(\"Set1\").colors\n","for class_name, label in label_dict.items():\n"," label_color_dict[label + 1] = (class_name, 255 * np.array(colors[label]))\n","\n","pred_map = predictor.merge_predictions(\n"," wsi_path,\n"," wsi_output[0],\n"," resolution=overview_resolution,\n"," units=overview_unit,\n",")\n","overlay = overlay_prediction_mask(\n"," wsi_overview,\n"," pred_map,\n"," alpha=0.5,\n"," label_info=label_color_dict,\n"," return_ax=True,\n",")\n","plt.show()"]},{"cell_type":"markdown","metadata":{"id":"8D-rITa4SANs"},"source":["## Feature extraction with a pathology-specific model\n","\n","In this section, we will show how to extract features from a pretrained pytorch model that exists outside TIAToolbox, using the WSI inference engines provided by tiatoolbox. To illustrate this we will use HistoEncoder, a computational-pathology specific model that has been trained in a self-supervised fashion to extract features from histology images. 
The model has been made available here:\n","\n","'HistoEncoder: Foundation models for digital pathology' (https://github.com/jopo666/HistoEncoder) by Pohjonen, Joona and team at the University of Helsinki.\n","\n","We will plot a umap reduction into 3D (rgb) of the feature map to visualize how the features capture the differences between some of the above mentioned tissue types."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"VpInLNBLSANt"},"outputs":[],"source":["# Import some extra modules\n","import histoencoder.functional as F\n","import torch.nn as nn\n","\n","from tiatoolbox.models.engine.semantic_segmentor import DeepFeatureExtractor, IOSegmentorConfig\n","from tiatoolbox.models.models_abc import ModelABC\n","import umap"]},{"cell_type":"markdown","metadata":{"id":"D8BFVjGESANt"},"source":["TIAToolbox defines a ModelABC which is a class inheriting PyTorch [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) and specifies how a model should look in order to be used in the TIAToolbox inference engines. 
The histoencoder model doesn't follow this structure, so we need to wrap it in a class whose output and methods are those that the TIAToolbox engine expects."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Af9QuM7PSANt"},"outputs":[],"source":["class HistoEncWrapper(ModelABC):\n"," \"\"\"Wrapper for HistoEnc model that conforms to tiatoolbox ModelABC interface.\"\"\"\n","\n"," def __init__(self: HistoEncWrapper, encoder) -> None:\n"," super().__init__()\n"," self.feat_extract = encoder\n","\n"," def forward(self: HistoEncWrapper, imgs: torch.Tensor) -> torch.Tensor:\n"," \"\"\"Pass input data through the model.\n","\n"," Args:\n"," imgs (torch.Tensor):\n"," Model input.\n","\n"," \"\"\"\n"," out = F.extract_features(self.feat_extract, imgs, num_blocks=2, avg_pool=True)\n"," return out\n","\n"," @staticmethod\n"," def infer_batch(\n"," model: nn.Module,\n"," batch_data: torch.Tensor,\n"," *,\n"," on_gpu: bool,\n"," ) -> list[np.ndarray]:\n"," \"\"\"Run inference on an input batch.\n","\n"," Contains logic for forward operation as well as i/o aggregation.\n","\n"," Args:\n"," model (nn.Module):\n"," PyTorch defined model.\n"," batch_data (torch.Tensor):\n"," A batch of data generated by\n"," `torch.utils.data.DataLoader`.\n"," on_gpu (bool):\n"," Whether to run inference on a GPU.\n","\n"," \"\"\"\n"," img_patches_device = batch_data.to('cuda') if on_gpu else batch_data\n"," model.eval()\n"," # Do not compute the gradient (not training)\n"," with torch.inference_mode():\n"," output = model(img_patches_device)\n"," return [output.cpu().numpy()]"]},{"cell_type":"markdown","metadata":{"id":"_XQpoea5SANt"},"source":["Now that we have our wrapper, we will create our feature extraction model and instantiate a [DeepFeatureExtractor](https://tia-toolbox.readthedocs.io/en/v1.4.1/_autosummary/tiatoolbox.models.engine.semantic_segmentor.DeepFeatureExtractor.html) to allow us to use this model over a WSI. 
We will use the same WSI as above, but this time we will extract features from the patches of the WSI using the HistoEncoder model, rather than predicting some label for each patch."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"VtSHvExqSANt"},"outputs":[],"source":["# create the model\n","encoder = F.create_encoder(\"prostate_medium\")\n","model = HistoEncWrapper(encoder)\n","\n","# set the pre-processing function\n","norm=transforms.Normalize(mean=[0.662, 0.446, 0.605],std=[0.169, 0.190, 0.155])\n","trans = [\n"," transforms.ToTensor(),\n"," norm,\n","]\n","model.preproc_func = transforms.Compose(trans)\n","\n","wsi_ioconfig = IOSegmentorConfig(\n"," input_resolutions=[{\"units\": \"mpp\", \"resolution\": 0.5}],\n"," patch_input_shape=[224, 224],\n"," output_resolutions=[{\"units\": \"mpp\", \"resolution\": 0.5}],\n"," patch_output_shape=[224, 224],\n"," stride_shape=[224, 224],\n",")"]},{"cell_type":"markdown","metadata":{"id":"p6LrLhviSANt"},"source":["When we create the `DeepFeatureExtractor`, we will pass the `auto_generate_mask=True` argument. This will automatically create a mask of the tissue region using otsu thresholding, so that the extractor processes only those patches containing tissue."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"KoTLy4k0SANt","outputId":"936b14d4-8d83-42e3-dfcc-ab637fc23c03"},"outputs":[{"name":"stderr","output_type":"stream","text":["|2023-11-12|17:50:30.207| [WARNING] Read: Scale > 1.This means that the desired resolution is higher than the WSI baseline (maximum encoded resolution). 
Interpolation of read regions may occur.\n","Process Batch: 100%|##########################| 630/630 [02:23<00:00, 4.39it/s]\n"]},{"name":"stdout","output_type":"stream","text":["|2023-11-12|17:52:54.487| [INFO] Finish: 0\n","|2023-11-12|17:52:54.487| [INFO] --Input: tmp/sample_wsi.svs\n","|2023-11-12|17:52:54.488| [INFO] --Output: /home/u2271662/tia/projects/tiatoolbox/code/tutorials/intermediate_source/tmp/wsi_features/0\n"]}],"source":["# create the feature extractor and run it on the WSI\n","extractor = DeepFeatureExtractor(model=model, auto_generate_mask=True, batch_size=32, num_loader_workers=4, num_postproc_workers=4)\n","with suppress_console_output():\n"," out = extractor.predict(imgs=[wsi_path], mode=\"wsi\", ioconfig=wsi_ioconfig, save_dir=global_save_dir / \"wsi_features\",)"]},{"cell_type":"markdown","metadata":{"id":"CMJKi5JkSANt"},"source":["These features could be used to train a downstream model, but here in order to get some intuition for what the features represent, we will use a UMAP reduction to visualize the features in RGB space. The points labeled in a similar color should have similar features, so we can check if the features naturally separate out into the different tissue regions when we overlay the UMAP reduction on the WSI thumbnail. We will plot it along with the patch-level prediction map from above to see how the features compare to the patch-level predictions in the following cells."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eNIpM0dJSANt","outputId":"d5dcd269-704d-486f-92da-5639ff642994"},"outputs":[{"data":{"image/png":"","text/plain":["
    "]},"metadata":{},"output_type":"display_data"},{"data":{"image/png":"","text/plain":["
    "]},"metadata":{},"output_type":"display_data"}],"source":["# First we define a function to calculate the umap reduction\n","def umap_reducer(x, dims=3, nns=10):\n"," \"\"\"UMAP reduction of the input data.\"\"\"\n"," reducer = umap.UMAP(n_neighbors=nns, n_components=dims, metric=\"manhattan\", spread=0.5, random_state=2)\n"," reduced = reducer.fit_transform(x)\n"," reduced -= reduced.min(axis=0)\n"," reduced /= reduced.max(axis=0)\n"," return reduced\n","\n","# load the features output by our feature extractor\n","pos = np.load(global_save_dir / \"wsi_features\" / \"0.position.npy\")\n","feats = np.load(global_save_dir / \"wsi_features\" / \"0.features.0.npy\")\n","pos = pos / 8 # as we extracted at 0.5mpp, and we are overlaying on a thumbnail at 4mpp\n","\n","# reduce the features into 3 dimensional (rgb) space\n","reduced = umap_reducer(feats)\n","\n","# plot the prediction map the classifier again\n","overlay = overlay_prediction_mask(\n"," wsi_overview,\n"," pred_map,\n"," alpha=0.5,\n"," label_info=label_color_dict,\n"," return_ax=True,\n",")\n","\n","# plot the feature map reduction\n","plt.figure()\n","plt.imshow(wsi_overview)\n","plt.scatter(pos[:,0], pos[:,1], c=reduced, s=1, alpha=0.5)\n","plt.axis(\"off\")\n","plt.title(\"UMAP reduction of HistoEnc features\")\n","plt.show()"]},{"cell_type":"markdown","metadata":{"id":"ixWAJc_ZSANt"},"source":["We see that the prediction map from our patch-level predictor, and the feature map from our self-supervised feature encoder, capture similar information about the tissue types in the WSI. This is a good sanity check that our models are working as expected. 
It also shows that the features extracted by the HistoEncoder model are capturing the differences between the tissue types, and so they are encoding histologically relevant information."]},{"cell_type":"markdown","metadata":{"id":"J_1pb6BGGbVu"},"source":["## Where to Go From Here\n","\n","In this notebook, we show how we can use the `PatchPredictor` and `DeepFeatureExtractor` classes and their `predict` method to predict the label, or extract features, for patches of big tiles and WSIs. We introduce `merge_predictions` and `overlay_prediction_mask` helper functions that merge the patch prediction outputs and visualize the resulting prediction map as an overlay on the input image/WSI.\n","\n","All the processes take place within TIAToolbox and we can easily put the pieces together, following our example code. Please make sure to set inputs and options correctly. We encourage you to further investigate the effect on the prediction output of changing `predict` function parameters. We have demonstrated how to use your own pretrained model or one provided by the research community for a specific task in the TIAToolbox framework to do inference on large WSIs even if the model structure is not defined in the TIAToolbox model class.\n","\n","You can learn more through the following resources:\n","\n","- [Advanced model handling with PyTorch and TIAToolbox](https://tia-toolbox.readthedocs.io/en/latest/_notebooks/jnb/07-advanced-modeling.html)\n","- [Creating slide graphs for WSI with a custom PyTorch graph neural network](https://tia-toolbox.readthedocs.io/en/latest/_notebooks/jnb/full-pipelines/slide-graph.html)"]}],"metadata":{"accelerator":"GPU","celltoolbar":"Edit Metadata","colab":{"provenance":[{"file_id":"1Ke0YSaLwsoiIc6ZlNj3MNm7fMdGdL2M2","timestamp":1699972954536}]},"gpuClass":"standard","kernelspec":{"display_name":"Python 3 
(ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.12"}},"nbformat":4,"nbformat_minor":0} \ No newline at end of file diff --git a/_templates/layout.html b/_templates/layout.html index 242e347d092..e3827929f00 100644 --- a/_templates/layout.html +++ b/_templates/layout.html @@ -1,5 +1,21 @@ {% extends "!layout.html" %} + +{% block menu %} + {% if 'singlehtml' not in builder %} + {% set global_toc = toctree(collapse=theme_collapse_navigation|tobool, + includehidden=theme_includehidden|tobool, + titles_only=True) %} + {% endif %} + {% if global_toc %} + {{ global_toc }} + {% else %} + +
    {{ toc }}
    + {% endif %} +{% endblock %} + + {%- block content %} {{ super() }} @@ -12,12 +28,95 @@ {%- endblock %} +{% block sidebartitle %} + + {% if theme_display_version %} + {%- set nav_version = version %} + {% if READTHEDOCS and current_version %} + {%- set nav_version = current_version %} + {% endif %} + {% if nav_version %} +
    + {{ nav_version }} +
    + {% endif %} + {% endif %} + + + +
    + + +
    + + +{% endblock %} {% block footer %} {{ super() }} diff --git a/advanced_source/ONNXLive.rst b/advanced_source/ONNXLive.rst index 21380e43405..7177522c968 100644 --- a/advanced_source/ONNXLive.rst +++ b/advanced_source/ONNXLive.rst @@ -2,172 +2,11 @@ ONNX Live Tutorial ================== -This tutorial will show you to convert a neural style transfer model that has been exported from PyTorch into the Apple CoreML format using ONNX. This will allow you to easily run deep learning models on Apple devices and, in this case, live stream from the camera. +This tutorial has been deprecated. -What is ONNX? -------------- +Redirecting in 3 seconds... -ONNX (Open Neural Network Exchange) is an open format to represent deep learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools and choose the combination that is best for them. ONNX is developed and supported by a community of partners. You can learn more about ONNX and what tools are supported by going to `onnx.ai `_. -Tutorial Overview ------------------ +.. raw:: html -This tutorial will walk you through 4 main steps: - - -#. `Download (or train) PyTorch style transfer models`_ -#. `Convert the PyTorch models to ONNX models`_ -#. `Convert the ONNX models to CoreML models`_ -#. `Run the CoreML models in a style transfer iOS App`_ - -Preparing the Environment -------------------------- - -We will be working in a virtualenv in order to avoid conflicts with your local packages. -We are also using Python 3.6 for this tutorial, but other versions should work as well. - -.. code-block:: python - - python3.6 -m venv venv - source ./venv/bin/activate - - -You need to install pytorch and the onnx->coreml converter: - -.. code-block:: bash - - pip install torchvision onnx-coreml - - -You will also need to install XCode if you want to run the iOS style transfer app on your iPhone. -You can also convert models in Linux, however to run the iOS app itself, you will need a Mac. 
- -Download (or train) PyTorch style transfer models -------------------------------------------------- - -For this tutorial, we will use the style transfer models that are published with pytorch in https://github.com/pytorch/examples/tree/master/fast_neural_style . -If you would like to use a different PyTorch or ONNX model, feel free to skip this step. - -These models are meant for applying style transfer on still images and really not optimized to be fast enough for video. However if we reduce the resolution low enough, they can also work well on videos. - -Let's download the models: - -.. code-block:: bash - - git clone https://github.com/pytorch/examples - cd examples/fast_neural_style - - -If you would like to train the models yourself, the pytorch/examples repository you just cloned has more information on how to do this. -For now, we'll just download pre-trained models with the script provided by the repository: - -.. code-block:: bash - - python download_saved_models.py - - -This script downloads the pre-trained PyTorch models and puts them into the ``saved_models`` folder. -There should now be 4 files, ``candy.pth``\ , ``mosaic.pth``\ , ``rain_princess.pth`` and ``udnie.pth`` in your directory. - -Convert the PyTorch models to ONNX models ------------------------------------------ - -Now that we have the pre-trained PyTorch models as ``.pth`` files in the ``saved_models`` folder, we will need to convert them to ONNX format. -The model definition is in the pytorch/examples repository we cloned previously, and with a few lines of python we can export it to ONNX. -In this case, instead of actually running the neural net, we will call ``torch.onnx._export``\ , which is provided with PyTorch as an api to directly export ONNX formatted models from PyTorch. -However, in this case we don't even need to do that, because a script already exists ``neural_style/neural_style.py`` that will do this for us. 
-You can also take a look at that script if you would like to apply it to other models. - -Exporting the ONNX format from PyTorch is essentially tracing your neural network so this api call will internally run the network on 'dummy data' in order to generate the graph. -For this, it needs an input image to apply the style transfer to which can simply be a blank image. -However, the pixel size of this image is important, as this will be the size for the exported style transfer model. -To get good performance, we'll use a resolution of 250x540. Feel free to take a larger resolution if you care less about -FPS and more about style transfer quality. - -Let's use `ImageMagick `_ to create a blank image of the resolution we want: - -.. code-block:: bash - - convert -size 250x540 xc:white png24:dummy.jpg - - -and use that to export the PyTorch models: - -.. code-block:: bash - - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/candy.pth --cuda 0 --export_onnx ./saved_models/candy.onnx - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/udnie.pth --cuda 0 --export_onnx ./saved_models/udnie.onnx - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/rain_princess.pth --cuda 0 --export_onnx ./saved_models/rain_princess.onnx - python ./neural_style/neural_style.py eval --content-image dummy.jpg --output-image dummy-out.jpg --model ./saved_models/mosaic.pth --cuda 0 --export_onnx ./saved_models/mosaic.onnx - - -You should end up with 4 files, ``candy.onnx``\ , ``mosaic.onnx``\ , ``rain_princess.onnx`` and ``udnie.onnx``\ , -created from the corresponding ``.pth`` files. - -Convert the ONNX models to CoreML models ----------------------------------------- - -Now that we have ONNX models, we can convert them to CoreML models in order to run them on Apple devices. 
-For this, we use the onnx-coreml converter we installed previously. -The converter comes with a ``convert-onnx-to-coreml`` script, which the installation steps above added to our path. Unfortunately that won't work for us as we need to mark the input and output of the network as an image -and, while this is supported by the converter, it is only supported when calling the converter from python. - -Looking at the style transfer model (for example opening the .onnx file in an application like `Netron `_\ ), -we see that the input is named '0' and the output is named '186'. These are just numeric ids assigned by PyTorch. -We will need to mark these as images. - -So let's create a small python file and call it ``onnx_to_coreml.py``. This can be created by using the touch command and edited with your favorite editor to add the following lines of code. - -.. code-block:: python - - import sys - from onnx import onnx_pb - from onnx_coreml import convert - - model_in = sys.argv[1] - model_out = sys.argv[2] - - model_file = open(model_in, 'rb') - model_proto = onnx_pb.ModelProto() - model_proto.ParseFromString(model_file.read()) - coreml_model = convert(model_proto, image_input_names=['0'], image_output_names=['186']) - coreml_model.save(model_out) - - -we now run it: - -.. code-block:: bash - - python onnx_to_coreml.py ./saved_models/candy.onnx ./saved_models/candy.mlmodel - python onnx_to_coreml.py ./saved_models/udnie.onnx ./saved_models/udnie.mlmodel - python onnx_to_coreml.py ./saved_models/rain_princess.onnx ./saved_models/rain_princess.mlmodel - python onnx_to_coreml.py ./saved_models/mosaic.onnx ./saved_models/mosaic.mlmodel - - -Now, there should be 4 CoreML models in your ``saved_models`` directory: ``candy.mlmodel``\ , ``mosaic.mlmodel``\ , ``rain_princess.mlmodel`` and ``udnie.mlmodel``. - -Run the CoreML models in a style transfer iOS App -------------------------------------------------- - -This repository (i.e. 
the one you're currently reading the README.md of) contains an iOS app able to run CoreML style transfer models on a live camera stream from your phone camera. Let's clone the repository: - -.. code-block:: bash - - git clone https://github.com/onnx/tutorials - - -and open the ``tutorials/examples/CoreML/ONNXLive/ONNXLive.xcodeproj`` project in XCode. -We recommend using XCode 9.3 and an iPhone X. There might be issues running on older devices or XCode versions. - -In the ``Models/`` folder, the project contains some .mlmodel files. We're going to replace them with the models we just created. - -You then run the app on your iPhone and you are all set. Tapping on the screen switches through the models. - -Conclusion ----------- - -We hope this tutorial gave you an overview of what ONNX is about and how you can use it to convert neural networks -between frameworks, in this case neural style transfer models moving from PyTorch to CoreML. - -Feel free to experiment with these steps and test them on your own models. -Please let us know if you hit any issues or want to give feedback. We'd like to hear what you think. + diff --git a/advanced_source/coding_ddpg.py b/advanced_source/coding_ddpg.py index 41f02628213..c634932971b 100644 --- a/advanced_source/coding_ddpg.py +++ b/advanced_source/coding_ddpg.py @@ -58,25 +58,39 @@ # Imports and setup # ----------------- # - -import torchrl +# .. 
code-block:: bash +# +# %%bash +# pip3 install torchrl mujoco glfw # sphinx_gallery_start_ignore import warnings -from typing import Tuple warnings.filterwarnings("ignore") +from torch import multiprocessing + +# TorchRL prefers the spawn method, which restricts creation of ``~torchrl.envs.ParallelEnv`` to inside the +# `__main__` method call, but for ease of reading the code we switch to fork, +# which is also the default start method in Google's Colaboratory +try: + multiprocessing.set_start_method("fork") +except RuntimeError: + pass + # sphinx_gallery_end_ignore -import torch.cuda + +import torch import tqdm -import torch.multiprocessing ############################################################################### # We will execute the policy on CUDA if available +is_fork = multiprocessing.get_start_method() == "fork" device = ( - torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") + torch.device(0) + if torch.cuda.is_available() and not is_fork + else torch.device("cpu") ) collector_device = torch.device("cpu") # Change the device to ``cuda`` to use CUDA @@ -168,7 +182,7 @@ # Later, we will see how the target parameters should be updated in TorchRL. 
# -from tensordict.nn import TensorDictModule +from tensordict.nn import TensorDictModule, TensorDictSequential def _init( @@ -237,23 +251,18 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp.update(hyperparams) value_key = "state_action_value" if value_type == ValueEstimators.TD1: - self._value_estimator = TD1Estimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) + self._value_estimator = TD1Estimator(value_network=self.actor_critic, **hp) elif value_type == ValueEstimators.TD0: - self._value_estimator = TD0Estimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) + self._value_estimator = TD0Estimator(value_network=self.actor_critic, **hp) elif value_type == ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type == ValueEstimators.TDLambda: - self._value_estimator = TDLambdaEstimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) + self._value_estimator = TDLambdaEstimator(value_network=self.actor_critic, **hp) else: raise NotImplementedError(f"Unknown value type {value_type}") + self._value_estimator.set_keys(value=value_key) ############################################################################### @@ -281,12 +290,11 @@ def _loss_actor( ) -> torch.Tensor: td_copy = tensordict.select(*self.actor_in_keys) # Get an action from the actor network: since we made it functional, we need to pass the params - td_copy = self.actor_network(td_copy, params=self.actor_network_params) + with self.actor_network_params.to_module(self.actor_network): + td_copy = self.actor_network(td_copy) # get the value associated with that action - td_copy = self.value_network( - td_copy, - params=self.value_network_params.detach(), - ) + with self.value_network_params.detach().to_module(self.value_network): + td_copy = self.value_network(td_copy) return -td_copy.get("state_action_value") @@ -304,11 +312,12 @@ def 
_loss_actor( def _loss_value( self, tensordict, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: +): td_copy = tensordict.clone() # V(s, a) - self.value_network(td_copy, params=self.value_network_params) + with self.value_network_params.to_module(self.value_network): + self.value_network(td_copy) pred_val = td_copy.get("state_action_value").squeeze(-1) # we manually reconstruct the parameters of the actor-critic, where the first @@ -323,9 +332,8 @@ def _loss_value( batch_size=self.target_actor_network_params.batch_size, device=self.target_actor_network_params.device, ) - target_value = self.value_estimator.value_estimate( - tensordict, target_params=target_params - ).squeeze(-1) + with target_params.to_module(self.actor_critic): + target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1) # Computes the value loss: L2, L1 or smooth L1 depending on `self.loss_function` loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_function) @@ -342,7 +350,7 @@ def _loss_value( # value and actor loss, collect the cost values and write them in a ``TensorDict`` # delivered to the user. -from tensordict.tensordict import TensorDict, TensorDictBase +from tensordict import TensorDict, TensorDictBase def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: @@ -450,6 +458,7 @@ def make_env(from_pixels=False): raise NotImplementedError env_kwargs = { + "device": device, "from_pixels": from_pixels, "pixels_only": from_pixels, "frame_skip": 2, @@ -512,16 +521,6 @@ def make_transformed_env( # syntax. 
env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling)) - double_to_float_list = [] - double_to_float_inv_list = [] - if env_library is DMControlEnv: - # ``DMControl`` requires double-precision - double_to_float_list += [ - "reward", - "action", - ] - double_to_float_inv_list += ["action"] - # We concatenate all states into a single "observation_vector" # even if there is a single tensor, it'll be renamed in "observation_vector". # This facilitates the downstream operations as we know the name of the @@ -537,12 +536,7 @@ def make_transformed_env( # version of the transform env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True)) - double_to_float_list.append(out_key) - env.append_transform( - DoubleToFloat( - in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list - ) - ) + env.append_transform(DoubleToFloat()) env.append_transform(StepCounter(max_frames_per_traj)) @@ -722,7 +716,7 @@ def get_env_stats(): ActorCriticWrapper, DdpgMlpActor, DdpgMlpQNet, - OrnsteinUhlenbeckProcessWrapper, + OrnsteinUhlenbeckProcessModule, ProbabilisticActor, TanhDelta, ValueOperator, @@ -781,15 +775,18 @@ def make_ddpg_actor( # Exploration # ~~~~~~~~~~~ # -# The policy is wrapped in a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessWrapper` +# The policy is passed into a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessModule` # exploration module, as suggested in the original paper. 
# Let's define the number of frames before OU noise reaches its minimum value annealing_frames = 1_000_000 -actor_model_explore = OrnsteinUhlenbeckProcessWrapper( +actor_model_explore = TensorDictSequential( actor, - annealing_num_steps=annealing_frames, -).to(device) + OrnsteinUhlenbeckProcessModule( + spec=actor.spec.clone(), + annealing_num_steps=annealing_frames, + ).to(device), +) if device == torch.device("cpu"): actor_model_explore.share_memory() @@ -867,9 +864,6 @@ def make_ddpg_actor( reset_at_each_iter=False, split_trajs=False, device=collector_device, - # device for execution - storing_device=collector_device, - # device where data will be stored and passed exploration_type=ExplorationType.RANDOM, ) @@ -1176,7 +1170,7 @@ def ceil_div(x, y): ) # update the exploration strategy - actor_model_explore.step(current_frames) + actor_model_explore[1].step(current_frames) collector.shutdown() del collector @@ -1221,6 +1215,6 @@ def ceil_div(x, y): # # To iterate further on this loss module we might consider: # -# - Using `@dispatch` (see `[Feature] Distpatch IQL loss module `_. +# - Using `@dispatch` (see `[Feature] Distpatch IQL loss module `_.) # - Allowing flexible TensorDict keys. # diff --git a/advanced_source/cpp_autograd.rst b/advanced_source/cpp_autograd.rst index d09f877e5a2..51e5e0b358f 100644 --- a/advanced_source/cpp_autograd.rst +++ b/advanced_source/cpp_autograd.rst @@ -255,9 +255,9 @@ Out: [ CPUFloatType{3,4} ] Please see the documentation for ``torch::autograd::backward`` -(`link `_) +(`link `_) and ``torch::autograd::grad`` -(`link `_) +(`link `_) for more information on how to use them. 
Using custom autograd function in C++ @@ -394,9 +394,9 @@ C++ using the following table: +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Python | C++ | +================================+========================================================================================================================================================================+ -| ``torch.autograd.backward`` | ``torch::autograd::backward`` (`link `_) | +| ``torch.autograd.backward`` | ``torch::autograd::backward`` (`link `_) | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ``torch.autograd.grad`` | ``torch::autograd::grad`` (`link `_) | +| ``torch.autograd.grad`` | ``torch::autograd::grad`` (`link `_) | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``torch.Tensor.detach`` | ``torch::Tensor::detach`` (`link `_) | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/advanced_source/cpp_custom_ops.rst b/advanced_source/cpp_custom_ops.rst new file mode 100644 index 00000000000..9dc06daa6f4 --- /dev/null +++ b/advanced_source/cpp_custom_ops.rst @@ -0,0 +1,469 @@ +.. _cpp-custom-ops-tutorial: + +Custom C++ and CUDA Operators +============================= + +**Author:** `Richard Zou `_ + +.. grid:: 2 + + .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to integrate custom operators written in C++/CUDA with PyTorch + * How to test custom operators using ``torch.library.opcheck`` + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 or later + * Basic understanding of C++ and CUDA programming + +.. note:: + + This tutorial will also work on AMD ROCm with no additional modifications. + +PyTorch offers a large library of operators that work on Tensors (e.g. torch.add, torch.sum, etc). +However, you may wish to bring a new custom operator to PyTorch. This tutorial demonstrates the +blessed path to authoring a custom operator written in C++/CUDA. + +For our tutorial, we’ll demonstrate how to author a fused multiply-add C++ +and CUDA operator that composes with PyTorch subsystems. The semantics of +the operation are as follows: + +.. code-block:: python + + def mymuladd(a: Tensor, b: Tensor, c: float): + return a * b + c + +You can find the end-to-end working example for this tutorial +`here `_ . + +Setting up the Build System +--------------------------- + +If you are developing custom C++/CUDA code, it must be compiled. +Note that if you’re interfacing with a Python library that already has bindings +to precompiled C++/CUDA code, you might consider writing a custom Python operator +instead (:ref:`python-custom-ops-tutorial`). + +Use `torch.utils.cpp_extension `_ +to compile custom C++/CUDA code for use with PyTorch. +C++ extensions may be built either "ahead of time" with setuptools, or "just in time" +via `load_inline `_; +we’ll focus on the "ahead of time" flavor. + +Using ``cpp_extension`` is as simple as writing the following ``setup.py``: + +.. 
code-block:: python + + from setuptools import setup, Extension + from torch.utils import cpp_extension + + setup(name="extension_cpp", + ext_modules=[ + cpp_extension.CppExtension("extension_cpp", ["muladd.cpp"])], + cmdclass={'build_ext': cpp_extension.BuildExtension}) + +If you need to compile CUDA code (for example, ``.cu`` files), then instead use +`torch.utils.cpp_extension.CUDAExtension `_. +Please see `extension-cpp `_ for an +example for how this is set up. + +Starting with PyTorch 2.6, you can now build a single wheel for multiple CPython +versions (similar to what you would do for pure python packages). In particular, +if your custom library adheres to the `CPython Stable Limited API +`_ or avoids CPython entirely, you +can build one Python agnostic wheel against a minimum supported CPython version +through setuptools' ``py_limited_api`` flag, like so: + +.. code-block:: python + + from setuptools import setup, Extension + from torch.utils import cpp_extension + + setup(name="extension_cpp", + ext_modules=[ + cpp_extension.CppExtension( + "extension_cpp", + ["python_agnostic_code.cpp"], + py_limited_api=True)], + cmdclass={'build_ext': cpp_extension.BuildExtension}, + options={"bdist_wheel": {"py_limited_api": "cp39"}} + ) + +Note that you must specify ``py_limited_api=True`` both within ``setup`` +and also as an option to the ``"bdist_wheel"`` command with the minimal supported +Python version (in this case, 3.9). This ``setup`` would build one wheel that could +be installed across multiple Python versions ``python>=3.9``. Please see +`torchao `_ for an example. + +.. note:: + + You must verify independently that the built wheel is truly Python agnostic. + Specifying ``py_limited_api`` does not check for any guarantees, so it is possible + to build a wheel that looks Python agnostic but will crash, or worse, be silently + incorrect, in another Python environment. 
Take care to avoid using unstable CPython + APIs, for example APIs from libtorch_python (in particular, pytorch/python bindings) + and to only use APIs from libtorch (aten objects, operators and the dispatcher). + For example, to give access to custom ops from Python, the library should register + the ops through the dispatcher (covered below!). + +Defining the custom op and adding backend implementations +--------------------------------------------------------- +First, let's write a C++ function that computes ``mymuladd``: + +.. code-block:: cpp + + at::Tensor mymuladd_cpu(at::Tensor a, const at::Tensor& b, double c) { + TORCH_CHECK(a.sizes() == b.sizes()); + TORCH_CHECK(a.dtype() == at::kFloat); + TORCH_CHECK(b.dtype() == at::kFloat); + TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CPU); + TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CPU); + at::Tensor a_contig = a.contiguous(); + at::Tensor b_contig = b.contiguous(); + at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options()); + const float* a_ptr = a_contig.data_ptr<float>(); + const float* b_ptr = b_contig.data_ptr<float>(); + float* result_ptr = result.data_ptr<float>(); + for (int64_t i = 0; i < result.numel(); i++) { + result_ptr[i] = a_ptr[i] * b_ptr[i] + c; + } + return result; + } + +In order to use this from PyTorch’s Python frontend, we need to register it +as a PyTorch operator using the ``TORCH_LIBRARY`` API. This will automatically +bind the operator to Python. + +Operator registration is a two-step process: + +- **Defining the operator** - This step ensures that PyTorch is aware of the new operator. +- **Registering backend implementations** - In this step, implementations for various + backends, such as CPU and CUDA, are associated with the operator. + +Defining an operator +^^^^^^^^^^^^^^^^^^^^ +To define an operator, follow these steps: + +1. select a namespace for an operator. 
We recommend the namespace be the name of your top-level + project; we’ll use "extension_cpp" in our tutorial. +2. provide a schema string that specifies the input/output types of the operator and whether any + input Tensors will be mutated. We support more types in addition to Tensor and float; + please see `The Custom Operators Manual `_ + for more details. + + * If you are authoring an operator that can mutate its input Tensors, please see here + (:ref:`mutable-ops`) for how to specify that. + +.. code-block:: cpp + + TORCH_LIBRARY(extension_cpp, m) { + // Note that "float" in the schema corresponds to the C++ double type + // and the Python float type. + m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor"); + } + +This makes the operator available from Python via ``torch.ops.extension_cpp.mymuladd``. + +Registering backend implementations for an operator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Use ``TORCH_LIBRARY_IMPL`` to register a backend implementation for the operator. + +.. code-block:: cpp + + TORCH_LIBRARY_IMPL(extension_cpp, CPU, m) { + m.impl("mymuladd", &mymuladd_cpu); + } + +If you also have a CUDA implementation of ``mymuladd``, you can register it +in a separate ``TORCH_LIBRARY_IMPL`` block: + +.. 
code-block:: cpp + + __global__ void muladd_kernel(int numel, const float* a, const float* b, float c, float* result) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) result[idx] = a[idx] * b[idx] + c; + } + + at::Tensor mymuladd_cuda(const at::Tensor& a, const at::Tensor& b, double c) { + TORCH_CHECK(a.sizes() == b.sizes()); + TORCH_CHECK(a.dtype() == at::kFloat); + TORCH_CHECK(b.dtype() == at::kFloat); + TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA); + TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA); + at::Tensor a_contig = a.contiguous(); + at::Tensor b_contig = b.contiguous(); + at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options()); + const float* a_ptr = a_contig.data_ptr<float>(); + const float* b_ptr = b_contig.data_ptr<float>(); + float* result_ptr = result.data_ptr<float>(); + + int numel = a_contig.numel(); + muladd_kernel<<<(numel+255)/256, 256>>>(numel, a_ptr, b_ptr, c, result_ptr); + return result; + } + + TORCH_LIBRARY_IMPL(extension_cpp, CUDA, m) { + m.impl("mymuladd", &mymuladd_cuda); + } + +Adding ``torch.compile`` support for an operator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To add ``torch.compile`` support for an operator, we must add a FakeTensor kernel (also +known as a "meta kernel" or "abstract impl"). FakeTensors are Tensors that have +metadata (such as shape, dtype, device) but no data: the FakeTensor kernel for an +operator specifies how to compute the metadata of output tensors given the metadata of input tensors. +The FakeTensor kernel should return dummy Tensors of your choice with +the correct Tensor metadata (shape/strides/``dtype``/device). + +We recommend that this be done from Python via the ``torch.library.register_fake`` API, +though it is possible to do this from C++ as well (see +`The Custom Operators Manual `_ +for more details). + +.. 
code-block:: python + + # Important: the C++ custom operator definitions should be loaded first + # before calling ``torch.library`` APIs that add registrations for the + # C++ custom operator(s). The following import loads our + # C++ custom operator definitions. + # Note that if you are striving for Python agnosticism, you should use + # the ``load_library(...)`` API call instead. See the next section for + # more details. + from . import _C + + @torch.library.register_fake("extension_cpp::mymuladd") + def _(a, b, c): + torch._check(a.shape == b.shape) + torch._check(a.dtype == torch.float) + torch._check(b.dtype == torch.float) + torch._check(a.device == b.device) + return torch.empty_like(a) + +Setting up hybrid Python/C++ registration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In this tutorial, we defined a custom operator in C++, added CPU/CUDA +implementations in C++, and added ``FakeTensor`` kernels and backward formulas +in Python. The order in which these registrations are loaded (or imported) +matters (importing in the wrong order will lead to an error). + +To use the custom operator with hybrid Python/C++ registrations, we must +first load the C++ library that holds the custom operator definition +and then call the ``torch.library`` registration APIs. This can happen in one +of two ways: + +1. If you're following this tutorial, importing the Python C extension module + we created will load the C++ custom operator definitions. +2. If your C++ custom operator is located in a shared library object, you can + also use ``torch.ops.load_library("/path/to/library.so")`` to load it. This + is the blessed path for Python agnosticism, as you will not have a Python C + extension module to import. See `torchao __init__.py `_ + for an example. + + +Adding training (autograd) support for an operator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Use ``torch.library.register_autograd`` to add training support for an operator. 
Prefer +this over directly using Python ``torch.autograd.Function`` or C++ ``torch::autograd::Function``; +you must use those in a very specific way to avoid silent incorrectness (see +`The Custom Operators Manual `_ +for more details). + +.. code-block:: python + + def _backward(ctx, grad): + a, b = ctx.saved_tensors + grad_a, grad_b = None, None + if ctx.needs_input_grad[0]: + grad_a = grad * b + if ctx.needs_input_grad[1]: + grad_b = grad * a + return grad_a, grad_b, None + + def _setup_context(ctx, inputs, output): + a, b, c = inputs + saved_a, saved_b = None, None + if ctx.needs_input_grad[0]: + saved_b = b + if ctx.needs_input_grad[1]: + saved_a = a + ctx.save_for_backward(saved_a, saved_b) + + # This code adds training support for the operator. You must provide us + # the backward formula for the operator and a `setup_context` function + # to save values to be used in the backward. + torch.library.register_autograd( + "extension_cpp::mymuladd", _backward, setup_context=_setup_context) + +Note that the backward must be a composition of PyTorch-understood operators. +If you wish to use another custom C++ or CUDA kernel in your backwards pass, +it must be wrapped into a custom operator. + +If we had our own custom ``mymul`` kernel, we would need to wrap it into a +custom operator and then call that from the backward: + +.. code-block:: cpp + + // New! 
a mymul_cpu kernel + at::Tensor mymul_cpu(const at::Tensor& a, const at::Tensor& b) { + TORCH_CHECK(a.sizes() == b.sizes()); + TORCH_CHECK(a.dtype() == at::kFloat); + TORCH_CHECK(b.dtype() == at::kFloat); + TORCH_CHECK(a.device().type() == at::DeviceType::CPU); + TORCH_CHECK(b.device().type() == at::DeviceType::CPU); + at::Tensor a_contig = a.contiguous(); + at::Tensor b_contig = b.contiguous(); + at::Tensor result = torch::empty(a_contig.sizes(), a_contig.options()); + const float* a_ptr = a_contig.data_ptr<float>(); + const float* b_ptr = b_contig.data_ptr<float>(); + float* result_ptr = result.data_ptr<float>(); + for (int64_t i = 0; i < result.numel(); i++) { + result_ptr[i] = a_ptr[i] * b_ptr[i]; + } + return result; + } + + TORCH_LIBRARY(extension_cpp, m) { + m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor"); + // New! defining the mymul operator + m.def("mymul(Tensor a, Tensor b) -> Tensor"); + } + + + TORCH_LIBRARY_IMPL(extension_cpp, CPU, m) { + m.impl("mymuladd", &mymuladd_cpu); + // New! registering the cpu kernel for the mymul operator + m.impl("mymul", &mymul_cpu); + } + +.. code-block:: python + + def _backward(ctx, grad): + a, b = ctx.saved_tensors + grad_a, grad_b = None, None + if ctx.needs_input_grad[0]: + grad_a = torch.ops.extension_cpp.mymul.default(grad, b) + if ctx.needs_input_grad[1]: + grad_b = torch.ops.extension_cpp.mymul.default(grad, a) + return grad_a, grad_b, None + + + def _setup_context(ctx, inputs, output): + a, b, c = inputs + saved_a, saved_b = None, None + if ctx.needs_input_grad[0]: + saved_b = b + if ctx.needs_input_grad[1]: + saved_a = a + ctx.save_for_backward(saved_a, saved_b) + + + # This code adds training support for the operator. You must provide us + # the backward formula for the operator and a `setup_context` function + # to save values to be used in the backward. 
+ torch.library.register_autograd( + "extension_cpp::mymuladd", _backward, setup_context=_setup_context) + +Testing an operator +------------------- +Use ``torch.library.opcheck`` to test that the custom op was registered correctly. +Note that this function does not test that the gradients are mathematically correct +-- plan to write separate tests for that, either manual ones or by using +``torch.autograd.gradcheck``. + +.. code-block:: python + + def sample_inputs(device, *, requires_grad=False): + def make_tensor(*size): + return torch.randn(size, device=device, requires_grad=requires_grad) + + def make_nondiff_tensor(*size): + return torch.randn(size, device=device, requires_grad=False) + + return [ + [make_tensor(3), make_tensor(3), 1], + [make_tensor(20), make_tensor(20), 3.14], + [make_tensor(20), make_nondiff_tensor(20), -123], + [make_nondiff_tensor(2, 3), make_tensor(2, 3), -0.3], + ] + + def reference_muladd(a, b, c): + return a * b + c + + samples = sample_inputs(device, requires_grad=True) + samples.extend(sample_inputs(device, requires_grad=False)) + for args in samples: + # Correctness test + result = torch.ops.extension_cpp.mymuladd(*args) + expected = reference_muladd(*args) + torch.testing.assert_close(result, expected) + + # Use opcheck to check for incorrect usage of operator registration APIs + torch.library.opcheck(torch.ops.extension_cpp.mymuladd.default, args) + +.. _mutable-ops: + +Creating mutable operators +-------------------------- +You may wish to author a custom operator that mutates its inputs. Use ``Tensor(a!)`` +to specify each mutable Tensor in the schema; otherwise, there will be undefined +behavior. If there are multiple mutated Tensors, use different names (for example, ``Tensor(a!)``, +``Tensor(b!)``, ``Tensor(c!)``) for each mutable Tensor. + +Let's author a ``myadd_out(a, b, out)`` operator, which writes the contents of ``a+b`` into ``out``. + +.. 
code-block:: cpp + + // An example of an operator that mutates one of its inputs. + void myadd_out_cpu(const at::Tensor& a, const at::Tensor& b, at::Tensor& out) { + TORCH_CHECK(a.sizes() == b.sizes()); + TORCH_CHECK(b.sizes() == out.sizes()); + TORCH_CHECK(a.dtype() == at::kFloat); + TORCH_CHECK(b.dtype() == at::kFloat); + TORCH_CHECK(out.dtype() == at::kFloat); + TORCH_CHECK(out.is_contiguous()); + TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CPU); + TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CPU); + TORCH_INTERNAL_ASSERT(out.device().type() == at::DeviceType::CPU); + at::Tensor a_contig = a.contiguous(); + at::Tensor b_contig = b.contiguous(); + const float* a_ptr = a_contig.data_ptr<float>(); + const float* b_ptr = b_contig.data_ptr<float>(); + float* result_ptr = out.data_ptr<float>(); + for (int64_t i = 0; i < out.numel(); i++) { + result_ptr[i] = a_ptr[i] + b_ptr[i]; + } + } + +When defining the operator, we must specify that it mutates the out Tensor in the schema: + +.. code-block:: cpp + + TORCH_LIBRARY(extension_cpp, m) { + m.def("mymuladd(Tensor a, Tensor b, float c) -> Tensor"); + m.def("mymul(Tensor a, Tensor b) -> Tensor"); + // New! + m.def("myadd_out(Tensor a, Tensor b, Tensor(a!) out) -> ()"); + } + + TORCH_LIBRARY_IMPL(extension_cpp, CPU, m) { + m.impl("mymuladd", &mymuladd_cpu); + m.impl("mymul", &mymul_cpu); + // New! + m.impl("myadd_out", &myadd_out_cpu); + } + +.. note:: + + Do not return any mutated Tensors as outputs of the operator as this will + cause incompatibility with PyTorch subsystems like ``torch.compile``. + +Conclusion +---------- +In this tutorial, we went over the recommended approach to integrating Custom C++ +and CUDA operators with PyTorch. The ``TORCH_LIBRARY/torch.library`` APIs are fairly +low-level. For more information about how to use the API, see +`The Custom Operators Manual `_. 
diff --git a/advanced_source/cpp_export.rst b/advanced_source/cpp_export.rst index 5dedbdaaa65..286c79622dd 100644 --- a/advanced_source/cpp_export.rst +++ b/advanced_source/cpp_export.rst @@ -1,6 +1,8 @@ Loading a TorchScript Model in C++ ===================================== +.. warning:: TorchScript is no longer in active development. + As its name suggests, the primary interface to PyTorch is the Python programming language. While Python is a suitable and preferred language for many scenarios requiring dynamism and ease of iteration, there are equally many @@ -203,7 +205,7 @@ minimal ``CMakeLists.txt`` to build it could look as simple as: add_executable(example-app example-app.cpp) target_link_libraries(example-app "${TORCH_LIBRARIES}") - set_property(TARGET example-app PROPERTY CXX_STANDARD 14) + set_property(TARGET example-app PROPERTY CXX_STANDARD 17) The last thing we need to build the example application is the LibTorch distribution. You can always grab the latest stable release from the `download diff --git a/advanced_source/cpp_extension.rst b/advanced_source/cpp_extension.rst index cb0e990797e..96cbb9f5cc7 100644 --- a/advanced_source/cpp_extension.rst +++ b/advanced_source/cpp_extension.rst @@ -2,6 +2,10 @@ Custom C++ and CUDA Extensions ============================== **Author**: `Peter Goldsborough `_ +.. warning:: + + This tutorial is deprecated as of PyTorch 2.4. Please see :ref:`custom-ops-landing-page` + for the newest up-to-date guides on extending PyTorch with Custom C++/CUDA Extensions. PyTorch provides a plethora of operations related to neural networks, arbitrary tensor algebra, data wrangling and other purposes. However, you may still find @@ -225,7 +229,7 @@ Instead of: Currently open issue for nvcc bug `here `_. Complete workaround code example `here -`_. +`_. 
Forward Pass ************ diff --git a/advanced_source/cpp_frontend.rst b/advanced_source/cpp_frontend.rst index 901658183c7..d31be00c632 100644 --- a/advanced_source/cpp_frontend.rst +++ b/advanced_source/cpp_frontend.rst @@ -57,7 +57,7 @@ the right tool for the job. Examples for such environments include: Multiprocessing is an alternative, but not as scalable and has significant shortcomings. C++ has no such constraints and threads are easy to use and create. Models requiring heavy parallelization, like those used in `Deep - Neuroevolution `_, can benefit from + Neuroevolution `_, can benefit from this. - **Existing C++ Codebases**: You may be the owner of an existing C++ application doing anything from serving web pages in a backend server to @@ -662,7 +662,7 @@ Defining the DCGAN Modules We now have the necessary background and introduction to define the modules for the machine learning task we want to solve in this post. To recap: our task is to generate images of digits from the `MNIST dataset -`_. We want to use a `generative adversarial +`_. We want to use a `generative adversarial network (GAN) `_ to solve this task. 
In particular, we'll use a `DCGAN architecture @@ -969,7 +969,7 @@ the data loader every epoch and then write the GAN training code: discriminator->zero_grad(); torch::Tensor real_images = batch.data; torch::Tensor real_labels = torch::empty(batch.data.size(0)).uniform_(0.8, 1.0); - torch::Tensor real_output = discriminator->forward(real_images); + torch::Tensor real_output = discriminator->forward(real_images).reshape(real_labels.sizes()); torch::Tensor d_loss_real = torch::binary_cross_entropy(real_output, real_labels); d_loss_real.backward(); @@ -977,7 +977,7 @@ the data loader every epoch and then write the GAN training code: torch::Tensor noise = torch::randn({batch.data.size(0), kNoiseSize, 1, 1}); torch::Tensor fake_images = generator->forward(noise); torch::Tensor fake_labels = torch::zeros(batch.data.size(0)); - torch::Tensor fake_output = discriminator->forward(fake_images.detach()); + torch::Tensor fake_output = discriminator->forward(fake_images.detach()).reshape(fake_labels.sizes()); torch::Tensor d_loss_fake = torch::binary_cross_entropy(fake_output, fake_labels); d_loss_fake.backward(); @@ -987,7 +987,7 @@ the data loader every epoch and then write the GAN training code: // Train generator. generator->zero_grad(); fake_labels.fill_(1); - fake_output = discriminator->forward(fake_images); + fake_output = discriminator->forward(fake_images).reshape(fake_labels.sizes()); torch::Tensor g_loss = torch::binary_cross_entropy(fake_output, fake_labels); g_loss.backward(); generator_optimizer.step(); diff --git a/advanced_source/custom_ops_landing_page.rst b/advanced_source/custom_ops_landing_page.rst new file mode 100644 index 00000000000..1867fc29acb --- /dev/null +++ b/advanced_source/custom_ops_landing_page.rst @@ -0,0 +1,62 @@ +.. _custom-ops-landing-page: + +PyTorch Custom Operators +=========================== + +PyTorch offers a large library of operators that work on Tensors (e.g. ``torch.add``, +``torch.sum``, etc). 
However, you may wish to bring a new custom operation to PyTorch +and get it to work with subsystems like ``torch.compile``, autograd, and ``torch.vmap``. +In order to do so, you must register the custom operation with PyTorch via the Python +`torch.library docs `_ or C++ ``TORCH_LIBRARY`` +APIs. + + + +Authoring a custom operator from Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please see :ref:`python-custom-ops-tutorial`. + +You may wish to author a custom operator from Python (as opposed to C++) if: + +- you have a Python function you want PyTorch to treat as an opaque callable, especially with + respect to ``torch.compile`` and ``torch.export``. +- you have some Python bindings to C++/CUDA kernels and want those to compose with PyTorch + subsystems (like ``torch.compile`` or ``torch.autograd``) +- you are using Python (and not a C++-only environment like AOTInductor). + +Integrating custom C++ and/or CUDA code with PyTorch +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please see :ref:`cpp-custom-ops-tutorial`. + +You may wish to author a custom operator from C++ (as opposed to Python) if: + +- you have custom C++ and/or CUDA code. +- you plan to use this code with ``AOTInductor`` to do Python-less inference. + +The Custom Operators Manual +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For information not covered in the tutorials and this page, please see +`The Custom Operators Manual `_ +(we're working on moving the information to our docs site). We recommend that you +first read one of the tutorials above and then use the Custom Operators Manual as a reference; +it is not meant to be read head to toe. + +When should I create a Custom Operator? +--------------------------------------- +If your operation is expressible as a composition of built-in PyTorch operators +then please write it as a Python function and call it instead of creating a +custom operator. 
Use the operator registration APIs to create a custom operator if you +are calling into some library that PyTorch doesn't understand (e.g. custom C/C++ code, +a custom CUDA kernel, or Python bindings to C/C++/CUDA extensions). + +Why should I create a Custom Operator? +-------------------------------------- + +It is possible to use a C/C++/CUDA kernel by grabbing a Tensor's data pointer +and passing it to a pybind'ed kernel. However, this approach doesn't compose with +PyTorch subsystems like autograd, torch.compile, vmap, and more. In order +for an operation to compose with PyTorch subsystems, it must be registered +via the operator registration APIs. diff --git a/advanced_source/ddp_pipeline.py b/advanced_source/ddp_pipeline.py deleted file mode 100644 index 1eb956a7836..00000000000 --- a/advanced_source/ddp_pipeline.py +++ /dev/null @@ -1,516 +0,0 @@ -""" -Training Transformer models using Distributed Data Parallel and Pipeline Parallelism -==================================================================================== - -**Author**: `Pritam Damania `_ - -This tutorial demonstrates how to train a large Transformer model across -multiple GPUs using `Distributed Data Parallel `__ and -`Pipeline Parallelism `__. This tutorial is an extension of the -`Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ tutorial -and scales up the same model to demonstrate how Distributed Data Parallel and -Pipeline Parallelism can be used to train Transformer models. 
- -Prerequisites: - - * `Pipeline Parallelism `__ - * `Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ - * `Getting Started with Distributed Data Parallel `__ -""" - - -###################################################################### -# Define the model -# ---------------- -# - -###################################################################### -# ``PositionalEncoding`` module injects some information about the -# relative or absolute position of the tokens in the sequence. The -# positional encodings have the same dimension as the embeddings so that -# the two can be summed. Here, we use ``sine`` and ``cosine`` functions of -# different frequencies. - -import sys -import os -import math -import torch -import torch.nn as nn -import torch.nn.functional as F -import tempfile -from torch.nn import TransformerEncoder, TransformerEncoderLayer - -class PositionalEncoding(nn.Module): - - def __init__(self, d_model, dropout=0.1, max_len=5000): - super(PositionalEncoding, self).__init__() - self.dropout = nn.Dropout(p=dropout) - - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.pe = nn.Parameter(pe, requires_grad=False) - - def forward(self, x): - x = x + self.pe[:x.size(0), :] - return self.dropout(x) - - -###################################################################### -# In this tutorial, we will split a Transformer model across two GPUs and use -# pipeline parallelism to train the model. In addition to this, we use -# `Distributed Data Parallel `__ -# to train two replicas of this pipeline. We have one process driving a pipe across -# GPUs 0 and 1 and another process driving a pipe across GPUs 2 and 3. 
Both these -# processes then use Distributed Data Parallel to train the two replicas. The -# model is exactly the same model used in the `Sequence-to-Sequence Modeling with nn.Transformer and TorchText -# `__ tutorial, -# but is split into two stages. The largest number of parameters belong to the -# `nn.TransformerEncoder `__ layer. -# The `nn.TransformerEncoder `__ -# itself consists of ``nlayers`` of `nn.TransformerEncoderLayer `__. -# As a result, our focus is on ``nn.TransformerEncoder`` and we split the model -# such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the -# other half are on another. To do this, we pull out the ``Encoder`` and -# ``Decoder`` sections into separate modules and then build an ``nn.Sequential`` -# representing the original Transformer module. - - -if sys.platform == 'win32': - print('Windows platform is not supported for pipeline parallelism') - sys.exit(0) -if torch.cuda.device_count() < 4: - print('Need at least four GPU devices for this tutorial') - sys.exit(0) - -class Encoder(nn.Module): - def __init__(self, ntoken, ninp, dropout=0.5): - super(Encoder, self).__init__() - self.pos_encoder = PositionalEncoding(ninp, dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.ninp = ninp - self.init_weights() - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, src): - # Need (S, N) format for encoder. - src = src.t() - src = self.encoder(src) * math.sqrt(self.ninp) - return self.pos_encoder(src) - -class Decoder(nn.Module): - def __init__(self, ntoken, ninp): - super(Decoder, self).__init__() - self.decoder = nn.Linear(ninp, ntoken) - self.init_weights() - - def init_weights(self): - initrange = 0.1 - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, inp): - # Need batch dimension first for output of pipeline. 
- return self.decoder(inp).permute(1, 0, 2) - -###################################################################### -# Start multiple processes for training -# ------------------------------------- -# - - -###################################################################### -# We start two processes where each process drives its own pipeline across two -# GPUs. ``run_worker`` is executed for each process. - -def run_worker(rank, world_size): - - -###################################################################### -# Load and batch data -# ------------------- -# - - -###################################################################### -# The training process uses Wikitext-2 dataset from ``torchtext``. -# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. -# -# The vocab object is built based on the train dataset and is used to numericalize -# tokens into tensors. Starting from sequential data, the ``batchify()`` -# function arranges the dataset into columns, trimming off any tokens remaining -# after the data has been divided into batches of size ``batch_size``. -# For instance, with the alphabet as the sequence (total length of 26) -# and a batch size of 4, we would divide the alphabet into 4 sequences of -# length 6: -# -# .. 
math:: -# -# \begin{bmatrix} -# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} -# \end{bmatrix} -# \Rightarrow -# \begin{bmatrix} -# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & -# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & -# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & -# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} -# \end{bmatrix} -# -# These columns are treated as independent by the model, which means that -# the dependence of ``G`` and ``F`` can not be learned, but allows more -# efficient batch processing. -# - -# In 'run_worker' - def print_with_rank(msg): - print('[RANK {}]: {}'.format(rank, msg)) - - from torchtext.datasets import WikiText2 - from torchtext.data.utils import get_tokenizer - from torchtext.vocab import build_vocab_from_iterator - - train_iter = WikiText2(split='train') - tokenizer = get_tokenizer('basic_english') - vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=[""]) - vocab.set_default_index(vocab[""]) - - def data_process(raw_text_iter): - data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] - return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) - - train_iter, val_iter, test_iter = WikiText2() - train_data = data_process(train_iter) - val_data = data_process(val_iter) - test_data = data_process(test_iter) - - device = torch.device(2 * rank) - - def batchify(data, bsz, rank, world_size, is_train=False): - # Divide the dataset into ``bsz`` parts. - nbatch = data.size(0) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). - data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the ``bsz`` batches. 
- data = data.view(bsz, -1).t().contiguous() - # Divide the data across the ranks only for training data. - if is_train: - data_per_rank = data.size(0) // world_size - data = data[rank * data_per_rank : (rank + 1) * data_per_rank] - return data.to(device) - - batch_size = 20 - eval_batch_size = 10 - train_data = batchify(train_data, batch_size, rank, world_size, True) - val_data = batchify(val_data, eval_batch_size, rank, world_size) - test_data = batchify(test_data, eval_batch_size, rank, world_size) - - -###################################################################### -# Functions to generate input and target sequence -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# ``get_batch()`` function generates the input and target sequence for -# the transformer model. It subdivides the source data into chunks of -# length ``bptt``. For the language modeling task, the model needs the -# following words as ``Target``. For example, with a ``bptt`` value of 2, -# we’d get the following two Variables for ``i`` = 0: -# -# .. image:: ../_static/img/transformer_input_target.png -# -# It should be noted that the chunks are along dimension 0, consistent -# with the ``S`` dimension in the Transformer model. The batch dimension -# ``N`` is along dimension 1. -# - -# In 'run_worker' - bptt = 35 - def get_batch(source, i): - seq_len = min(bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) - # Need batch dimension first for pipeline parallelism. - return data.t(), target - -###################################################################### -# Model scale and Pipe initialization -# ----------------------------------- -# - - -###################################################################### -# To demonstrate training large Transformer models using pipeline parallelism, -# we scale up the Transformer layers appropriately. 
We use an embedding -# dimension of 4096, hidden size of 4096, 16 attention heads and 8 total -# transformer layers (``nn.TransformerEncoderLayer``). This creates a model with -# **~1 billion** parameters. -# -# We need to initialize the `RPC Framework `__ -# since Pipe depends on the RPC framework via `RRef `__ -# which allows for future expansion to cross host pipelining. We need to -# initialize the RPC framework with only a single worker since we're using a -# single process to drive multiple GPUs. -# -# The pipeline is then initialized with 8 transformer layers on one GPU and 8 -# transformer layers on the other GPU. One pipe is setup across GPUs 0 and 1 and -# another across GPUs 2 and 3. Both pipes are then replicated using ``DistributedDataParallel``. - -# In 'run_worker' - ntokens = len(vocab) # the size of vocabulary - emsize = 4096 # embedding dimension - nhid = 4096 # the dimension of the feedforward network model in ``nn.TransformerEncoder`` - nlayers = 8 # the number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder`` - nhead = 16 # the number of heads in the Multihead Attention models - dropout = 0.2 # the dropout value - - from torch.distributed import rpc - tmpfile = tempfile.NamedTemporaryFile() - rpc.init_rpc( - name="worker", - rank=0, - world_size=1, - rpc_backend_options=rpc.TensorPipeRpcBackendOptions( - init_method="file://{}".format(tmpfile.name), - # Specifying _transports and _channels is a workaround and we no longer - # will have to specify _transports and _channels for PyTorch - # versions >= 1.8.1 - _transports=["ibv", "uv"], - _channels=["cuda_ipc", "cuda_basic"], - ) - ) - - # Number of GPUs for model parallelism. - num_gpus = 2 - partition_len = ((nlayers - 1) // num_gpus) + 1 - - # Add encoder in the beginning. - tmp_list = [Encoder(ntokens, emsize, dropout).cuda(2 * rank)] - module_list = [] - - # Add all the necessary transformer blocks. 
- for i in range(nlayers): - transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout) - if i != 0 and i % (partition_len) == 0: - module_list.append(nn.Sequential(*tmp_list)) - tmp_list = [] - device = i // (partition_len) - tmp_list.append(transformer_block.to(2 * rank + device)) - - # Add decoder in the end. - tmp_list.append(Decoder(ntokens, emsize).cuda(2 * rank + num_gpus - 1)) - module_list.append(nn.Sequential(*tmp_list)) - - # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing - # doesn't work with DDP. - from torch.distributed.pipeline.sync import Pipe - chunks = 8 - model = Pipe(torch.nn.Sequential( - *module_list), chunks = chunks, checkpoint="never") - - # Initialize process group and wrap model in DDP. - from torch.nn.parallel import DistributedDataParallel - import torch.distributed as dist - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' - dist.init_process_group( - backend="nccl", rank=rank, world_size=world_size) - model = DistributedDataParallel(model) - - def get_total_params(module: torch.nn.Module): - total_params = 0 - for param in module.parameters(): - total_params += param.numel() - return total_params - - print_with_rank('Total parameters in model: {:,}'.format(get_total_params(model))) - -###################################################################### -# Run the model -# ------------- -# - - -###################################################################### -# `CrossEntropyLoss `__ -# is applied to track the loss and -# `SGD `__ -# implements stochastic gradient descent method as the optimizer. The initial -# learning rate is set to 5.0. `StepLR `__ is -# applied to adjust the learn rate through epochs. During the -# training, we use -# `nn.utils.clip_grad_norm\_ `__ -# function to scale all the gradient together to prevent exploding. 
-# - -# In 'run_worker' - criterion = nn.CrossEntropyLoss() - lr = 5.0 # learning rate - optimizer = torch.optim.SGD(model.parameters(), lr=lr) - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) - - import time - def train(): - model.train() # Turn on the train mode - total_loss = 0. - start_time = time.time() - ntokens = len(vocab) - - # Train only for 50 batches to keep script execution time low. - nbatches = min(50 * bptt, train_data.size(0) - 1) - - for batch, i in enumerate(range(0, nbatches, bptt)): - data, targets = get_batch(train_data, i) - optimizer.zero_grad() - # Since the Pipe is only within a single host and process the ``RRef`` - # returned by forward method is local to this node and can simply - # retrieved via ``RRef.local_value()``. - output = model(data).local_value() - # Need to move targets to the device where the output of the - # pipeline resides. - loss = criterion(output.view(-1, ntokens), targets.cuda(2 * rank + 1)) - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) - optimizer.step() - - total_loss += loss.item() - log_interval = 10 - if batch % log_interval == 0 and batch > 0: - cur_loss = total_loss / log_interval - elapsed = time.time() - start_time - print_with_rank('| epoch {:3d} | {:5d}/{:5d} batches | ' - 'lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, nbatches // bptt, scheduler.get_last_lr()[0], - elapsed * 1000 / log_interval, - cur_loss, math.exp(cur_loss))) - total_loss = 0 - start_time = time.time() - - def evaluate(eval_model, data_source): - eval_model.eval() # Turn on the evaluation mode - total_loss = 0. - ntokens = len(vocab) - # Evaluate only for 50 batches to keep script execution time low. 
- nbatches = min(50 * bptt, data_source.size(0) - 1) - with torch.no_grad(): - for i in range(0, nbatches, bptt): - data, targets = get_batch(data_source, i) - output = eval_model(data).local_value() - output_flat = output.view(-1, ntokens) - # Need to move targets to the device where the output of the - # pipeline resides. - total_loss += len(data) * criterion(output_flat, targets.cuda(2 * rank + 1)).item() - return total_loss / (len(data_source) - 1) - -###################################################################### -# Loop over epochs. Save the model if the validation loss is the best -# we've seen so far. Adjust the learning rate after each epoch. - -# In 'run_worker' - best_val_loss = float("inf") - epochs = 3 # The number of epochs - best_model = None - - for epoch in range(1, epochs + 1): - epoch_start_time = time.time() - train() - val_loss = evaluate(model, val_data) - print_with_rank('-' * 89) - print_with_rank('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print_with_rank('-' * 89) - - if val_loss < best_val_loss: - best_val_loss = val_loss - best_model = model - - scheduler.step() - - -###################################################################### -# Evaluate the model with the test dataset -# ------------------------------------- -# -# Apply the best model to check the result with the test dataset. 
- -# In 'run_worker' - test_loss = evaluate(best_model, test_data) - print_with_rank('=' * 89) - print_with_rank('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) - print_with_rank('=' * 89) - -# Main execution -import torch.multiprocessing as mp - -if __name__=="__main__": - world_size = 2 - mp.spawn(run_worker, args=(world_size, ), nprocs=world_size, join=True) -###################################################################### -# Output -# ------ -# - - -###################################################################### -#.. code-block:: py -# -# [RANK 0]: | epoch 1 | 10/ 50 batches | lr 5.00 | ms/batch 778.97 | loss 43.31 | ppl 6432469059895903232.00 -# [RANK 1]: | epoch 1 | 10/ 50 batches | lr 5.00 | ms/batch 778.90 | loss 44.50 | ppl 21245447128217366528.00 -# [RANK 0]: | epoch 1 | 20/ 50 batches | lr 5.00 | ms/batch 699.89 | loss 44.50 | ppl 21176949187407757312.00 -# [RANK 1]: | epoch 1 | 20/ 50 batches | lr 5.00 | ms/batch 699.87 | loss 44.62 | ppl 23975861229620961280.00 -# [RANK 0]: | epoch 1 | 30/ 50 batches | lr 5.00 | ms/batch 698.86 | loss 41.62 | ppl 1193312915629888256.00 -# [RANK 1]: | epoch 1 | 30/ 50 batches | lr 5.00 | ms/batch 698.87 | loss 40.69 | ppl 471605759847546240.00 -# [RANK 0]: | epoch 1 | 40/ 50 batches | lr 5.00 | ms/batch 698.34 | loss 45.20 | ppl 42812308420836458496.00 -# [RANK 1]: | epoch 1 | 40/ 50 batches | lr 5.00 | ms/batch 698.33 | loss 45.68 | ppl 68839569686012223488.00 -# [RANK 1]: ----------------------------------------------------------------------------------------- -# [RANK 1]: | end of epoch 1 | time: 40.08s | valid loss 0.80 | valid ppl 2.22 -# [RANK 1]: ----------------------------------------------------------------------------------------- -# [RANK 0]: ----------------------------------------------------------------------------------------- -# [RANK 0]: | end of epoch 1 | time: 40.09s | valid loss 0.80 | valid ppl 2.22 -# [RANK 0]: 
----------------------------------------------------------------------------------------- -# [RANK 0]: | epoch 2 | 10/ 50 batches | lr 4.75 | ms/batch 768.51 | loss 36.34 | ppl 6063529544668166.00 -# [RANK 1]: | epoch 2 | 10/ 50 batches | lr 4.75 | ms/batch 769.23 | loss 37.41 | ppl 17651211266236086.00 -# [RANK 0]: | epoch 2 | 20/ 50 batches | lr 4.75 | ms/batch 699.57 | loss 28.97 | ppl 3798441739584.11 -# [RANK 1]: | epoch 2 | 20/ 50 batches | lr 4.75 | ms/batch 699.56 | loss 29.28 | ppl 5203636967575.47 -# [RANK 0]: | epoch 2 | 30/ 50 batches | lr 4.75 | ms/batch 699.04 | loss 28.43 | ppl 2212498693571.25 -# [RANK 1]: | epoch 2 | 30/ 50 batches | lr 4.75 | ms/batch 699.05 | loss 28.33 | ppl 2015144761281.48 -# [RANK 0]: | epoch 2 | 40/ 50 batches | lr 4.75 | ms/batch 699.10 | loss 23.30 | ppl 13121380184.92 -# [RANK 1]: | epoch 2 | 40/ 50 batches | lr 4.75 | ms/batch 699.09 | loss 23.41 | ppl 14653799192.87 -# [RANK 0]: ----------------------------------------------------------------------------------------- -# [RANK 0]: | end of epoch 2 | time: 39.97s | valid loss 0.24 | valid ppl 1.27 -# [RANK 0]: ----------------------------------------------------------------------------------------- -# [RANK 1]: ----------------------------------------------------------------------------------------- -# [RANK 1]: | end of epoch 2 | time: 39.98s | valid loss 0.24 | valid ppl 1.27 -# [RANK 1]: ----------------------------------------------------------------------------------------- -# [RANK 0]: | epoch 3 | 10/ 50 batches | lr 4.51 | ms/batch 769.36 | loss 12.80 | ppl 361681.11 -# [RANK 1]: | epoch 3 | 10/ 50 batches | lr 4.51 | ms/batch 768.97 | loss 12.57 | ppl 287876.61 -# [RANK 0]: | epoch 3 | 20/ 50 batches | lr 4.51 | ms/batch 698.27 | loss 12.01 | ppl 164364.60 -# [RANK 1]: | epoch 3 | 20/ 50 batches | lr 4.51 | ms/batch 698.30 | loss 11.98 | ppl 159095.89 -# [RANK 0]: | epoch 3 | 30/ 50 batches | lr 4.51 | ms/batch 697.75 | loss 10.90 | ppl 54261.91 -# [RANK 1]: | 
epoch 3 | 30/ 50 batches | lr 4.51 | ms/batch 697.72 | loss 10.89 | ppl 53372.39 -# [RANK 0]: | epoch 3 | 40/ 50 batches | lr 4.51 | ms/batch 699.49 | loss 10.78 | ppl 47948.35 -# [RANK 1]: | epoch 3 | 40/ 50 batches | lr 4.51 | ms/batch 699.50 | loss 10.79 | ppl 48664.42 -# [RANK 0]: ----------------------------------------------------------------------------------------- -# [RANK 0]: | end of epoch 3 | time: 39.96s | valid loss 0.38 | valid ppl 1.46 -# [RANK 0]: ----------------------------------------------------------------------------------------- -# [RANK 1]: ----------------------------------------------------------------------------------------- -# [RANK 1]: | end of epoch 3 | time: 39.96s | valid loss 0.38 | valid ppl 1.46 -# [RANK 1]: ----------------------------------------------------------------------------------------- -# [RANK 0]: ========================================================================================= -# [RANK 0]: | End of training | test loss 0.33 | test ppl 1.39 -# [RANK 0]: ========================================================================================= -# [RANK 1]: ========================================================================================= -# [RANK 1]: | End of training | test loss 0.33 | test ppl 1.39 -# [RANK 1]: ========================================================================================= -# diff --git a/advanced_source/ddp_pipeline.rst b/advanced_source/ddp_pipeline.rst new file mode 100644 index 00000000000..bf9e4d28f33 --- /dev/null +++ b/advanced_source/ddp_pipeline.rst @@ -0,0 +1,10 @@ +Training Transformer models using Distributed Data Parallel and Pipeline Parallelism +==================================================================================== + +This tutorial has been deprecated. + +Redirecting to the latest parallelism APIs in 3 seconds... + +.. 
raw:: html + + diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 1a8034a62e5..4b03803c15b 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -1,6 +1,11 @@ Registering a Dispatched Operator in C++ ======================================== +.. warning:: + + This tutorial is deprecated as of PyTorch 2.4. Please see :ref:`custom-ops-landing-page` + for the newest up-to-date guides on extending PyTorch with Custom Operators. + The dispatcher is an internal component of PyTorch which is responsible for figuring out what code should actually get run when you call a function like ``torch::add``. This can be nontrivial, because PyTorch operations need @@ -129,7 +134,7 @@ for debugging in larger models where previously it can be hard to pin-point exactly where the ``requires_grad``-ness is lost during the forward pass. In-place or view ops -^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ To ensure correctness and best possible performance, if your op mutates an input in-place or returns a tensor that aliases with one of the inputs, two additional diff --git a/advanced_source/dynamic_quantization_tutorial.py b/advanced_source/dynamic_quantization_tutorial.py index 9cc07a1d956..c8d94789d5d 100644 --- a/advanced_source/dynamic_quantization_tutorial.py +++ b/advanced_source/dynamic_quantization_tutorial.py @@ -151,7 +151,8 @@ def tokenize(self, path): model.load_state_dict( torch.load( model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu') + map_location=torch.device('cpu'), + weights_only=True ) ) diff --git a/advanced_source/extend_dispatcher.rst b/advanced_source/extend_dispatcher.rst index f3ae1e7e559..12f15355f5f 100644 --- a/advanced_source/extend_dispatcher.rst +++ b/advanced_source/extend_dispatcher.rst @@ -17,7 +17,7 @@ to `register a dispatched operator in C++ `_ and how to write a What's a new backend? 
--------------------- -Adding a new backend to PyTorch requires a lot of developement and maintainence from backend extenders. +Adding a new backend to PyTorch requires a lot of development and maintenance from backend extenders. Before adding a new backend, let's first consider a few common use cases and recommended solutions for them: * If you have new algorithms for an existing PyTorch operator, send a PR to PyTorch. @@ -30,7 +30,7 @@ Before adding a new backend, let's first consider a few common use cases and rec In this tutorial we'll mainly focus on adding a new out-of-tree device below. Adding out-of-tree support for a different tensor layout might share many common steps with devices, but we haven't seen an example of -such integrations yet so it might require addtional work from PyTorch to support it. +such integrations yet so it might require additional work from PyTorch to support it. Get a dispatch key for your backend ----------------------------------- @@ -67,12 +67,12 @@ To create a Tensor on ``PrivateUse1`` backend, you need to set dispatch key in ` Note that ``TensorImpl`` class above assumes your Tensor is backed by a storage like CPU/CUDA. We also provide ``OpaqueTensorImpl`` for backends without a storage. And you might need to tweak/override certain methods to fit your customized hardware. -One example in pytorch repo is `Vulkan TensorImpl `_. +One example in pytorch repo is `Vulkan TensorImpl `_. .. note:: Once the prototype is done and you plan to do regular releases for your backend extension, please feel free to - submit a PR to ``pytorch/pytorch`` to reserve a dedicated dispath key for your backend. + submit a PR to ``pytorch/pytorch`` to reserve a dedicated dispatch key for your backend. Get the full list of PyTorch operators @@ -361,7 +361,7 @@ actively working on might improve the experience in the future: * Improve test coverage of generic testing framework. 
* Improve ``Math`` kernel coverage and more comprehensive tests to make sure ``Math`` - kernel bahavior matches other backends like ``CPU/CUDA``. + kernel behavior matches other backends like ``CPU/CUDA``. * Refactor ``RegistrationDeclarations.h`` to carry the minimal information and reuse PyTorch's codegen as much as possible. * Support a backend fallback kernel to automatic convert inputs to CPU and convert the diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py index ee4dab7e7ec..b4ab10ef01d 100644 --- a/advanced_source/neural_style_tutorial.py +++ b/advanced_source/neural_style_tutorial.py @@ -56,7 +56,7 @@ import matplotlib.pyplot as plt import torchvision.transforms as transforms -import torchvision.models as models +from torchvision.models import vgg19, VGG19_Weights import copy @@ -87,7 +87,7 @@ # to 255 tensor images. # # -# .. Note:: +# .. note:: # Here are links to download the images required to run the tutorial: # `picasso.jpg `__ and # `dancing.jpg `__. @@ -183,7 +183,7 @@ def forward(self, input): return input ###################################################################### -# .. Note:: +# .. note:: # **Important detail**: although this module is named ``ContentLoss``, it # is not a true PyTorch Loss function. If you want to define your content # loss as a PyTorch Loss function, you have to create a PyTorch autograd function @@ -262,7 +262,7 @@ def forward(self, input): # network to evaluation mode using ``.eval()``. # -cnn = models.vgg19(pretrained=True).features.eval() +cnn = vgg19(weights=VGG19_Weights.DEFAULT).features.eval() @@ -372,7 +372,7 @@ def get_style_model_and_losses(cnn, normalization_mean, normalization_std, input_img = content_img.clone() # if you want to use white noise by using the following code: # -# :: +# .. 
code-block:: python # # input_img = torch.randn(content_img.data.size()) diff --git a/advanced_source/pendulum.py b/advanced_source/pendulum.py new file mode 100644 index 00000000000..fae3635de1c --- /dev/null +++ b/advanced_source/pendulum.py @@ -0,0 +1,930 @@ +# -*- coding: utf-8 -*- + +""" +Pendulum: Writing your environment and transforms with TorchRL +============================================================== + +**Author**: `Vincent Moens `_ + +Creating an environment (a simulator or an interface to a physical control system) +is an integral part of reinforcement learning and control engineering. + +TorchRL provides a set of tools to do this in multiple contexts. +This tutorial demonstrates how to use PyTorch and TorchRL to code a pendulum +simulator from the ground up. +It is freely inspired by the Pendulum-v1 implementation from `OpenAI-Gym/Farama-Gymnasium +control library `__. + +.. figure:: /_static/img/pendulum.gif + :alt: Pendulum + :align: center + + Simple Pendulum + +Key learnings: + +- How to design an environment in TorchRL: + - Writing specs (input, observation and reward); + - Implementing behavior: seeding, reset and step. +- Transforming your environment inputs and outputs, and writing your own + transforms; +- How to use :class:`~tensordict.TensorDict` to carry arbitrary data structures + through the ``codebase``. + + In the process, we will touch three crucial components of TorchRL: + +* `environments `__ +* `transforms `__ +* `models (policy and value function) `__ + +""" + +###################################################################### +# To give a sense of what can be achieved with TorchRL's environments, we will +# be designing a *stateless* environment. While stateful environments keep track of +# the latest physical state encountered and rely on this to simulate the state-to-state +# transition, stateless environments expect the current state to be provided to +# them at each step, along with the action undertaken.
TorchRL supports both +# types of environments, but stateless environments are more generic and hence +# cover a broader range of features of the environment API in TorchRL. +# +# Modeling stateless environments gives users full control over the input and +# outputs of the simulator: one can reset an experiment at any stage or actively +# modify the dynamics from the outside. However, it assumes that we have some control +# over a task, which may not always be the case: solving a problem where we cannot +# control the current state is more challenging but has a much wider set of applications. +# +# Another advantage of stateless environments is that they can enable +# batched execution of transition simulations. If the backend and the +# implementation allow it, an algebraic operation can be executed seamlessly on +# scalars, vectors, or tensors. This tutorial gives such examples. +# +# This tutorial will be structured as follows: +# +# * We will first get acquainted with the environment properties: +# its shape (``batch_size``), its methods (mainly :meth:`~torchrl.envs.EnvBase.step`, +# :meth:`~torchrl.envs.EnvBase.reset` and :meth:`~torchrl.envs.EnvBase.set_seed`) +# and finally its specs. +# * After having coded our simulator, we will demonstrate how it can be used +# during training with transforms. +# * We will explore new avenues that follow from the TorchRL's API, +# including: the possibility of transforming inputs, the vectorized execution +# of the simulation and the possibility of backpropagation through the +# simulation graph. +# * Finally, we will train a simple policy to solve the system we implemented. 
+# + +# sphinx_gallery_start_ignore +import warnings + +warnings.filterwarnings("ignore") +from torch import multiprocessing + +# TorchRL prefers spawn method, that restricts creation of ``~torchrl.envs.ParallelEnv`` inside +# `__main__` method call, but for the ease of reading the code switch to fork +# which is also a default spawn method in Google's Colaboratory +try: + multiprocessing.set_start_method("fork") +except RuntimeError: + pass + +# sphinx_gallery_end_ignore + +from collections import defaultdict +from typing import Optional + +import numpy as np +import torch +import tqdm +from tensordict import TensorDict, TensorDictBase +from tensordict.nn import TensorDictModule +from torch import nn + +from torchrl.data import BoundedTensorSpec, CompositeSpec, UnboundedContinuousTensorSpec +from torchrl.envs import ( + CatTensors, + EnvBase, + Transform, + TransformedEnv, + UnsqueezeTransform, +) +from torchrl.envs.transforms.transforms import _apply_to_composite +from torchrl.envs.utils import check_env_specs, step_mdp + +DEFAULT_X = np.pi +DEFAULT_Y = 1.0 + +###################################################################### +# There are four things you must take care of when designing a new environment +# class: +# +# * :meth:`EnvBase._reset`, which codes for the resetting of the simulator +# at a (potentially random) initial state; +# * :meth:`EnvBase._step` which codes for the state transition dynamic; +# * :meth:`EnvBase._set_seed` which implements the seeding mechanism; +# * the environment specs. +# +# Let us first describe the problem at hand: we would like to model a simple +# pendulum over which we can control the torque applied on its fixed point. +# Our goal is to place the pendulum in upward position (angular position at 0 +# by convention) and having it standing still in that position.
+# To design our dynamic system, we need to define two equations: the motion +# equation following an action (the torque applied) and the reward equation +# that will constitute our objective function. +# +# For the motion equation, we will update the angular velocity following: +# +# .. math:: +# +# \dot{\theta}_{t+1} = \dot{\theta}_t + (3 * g / (2 * L) * \sin(\theta_t) + 3 / (m * L^2) * u) * dt +# +# where :math:`\dot{\theta}` is the angular velocity in rad/sec, :math:`g` is the +# gravitational force, :math:`L` is the pendulum length, :math:`m` is its mass, +# :math:`\theta` is its angular position and :math:`u` is the torque. The +# angular position is then updated according to +# +# .. math:: +# +# \theta_{t+1} = \theta_{t} + \dot{\theta}_{t+1} dt +# +# We define our reward as +# +# .. math:: +# +# r = -(\theta^2 + 0.1 * \dot{\theta}^2 + 0.001 * u^2) +# +# which will be maximized when the angle is close to 0 (pendulum in upward +# position), the angular velocity is close to 0 (no motion) and the torque is +# 0 too. +# +# Coding the effect of an action: :func:`~torchrl.envs.EnvBase._step` +# ------------------------------------------------------------------- +# +# The step method is the first thing to consider, as it will encode +# the simulation that is of interest to us. In TorchRL, the +# :class:`~torchrl.envs.EnvBase` class has a :meth:`EnvBase.step` +# method that receives a :class:`tensordict.TensorDict` +# instance with an ``"action"`` entry indicating what action is to be taken. +# +# To facilitate the reading and writing from that ``tensordict`` and to make sure +# that the keys are consistent with what's expected from the library, the +# simulation part has been delegated to a private abstract method :meth:`_step` +# which reads input data from a ``tensordict``, and writes a *new* ``tensordict`` +# with the output data. +# +# The :func:`_step` method should do the following: +# +# 1. 
Read the input keys (such as ``"action"``) and execute the simulation +# based on these; +# 2. Retrieve observations, done state and reward; +# 3. Write the set of observation values along with the reward and done state +# at the corresponding entries in a new :class:`TensorDict`. +# +# Next, the :meth:`~torchrl.envs.EnvBase.step` method will merge the output +# of :meth:`~torchrl.envs.EnvBase._step` into the input ``tensordict`` to enforce +# input/output consistency. +# +# Typically, for stateful environments, this will look like this: +# +# .. code-block:: +# +# >>> policy(env.reset()) +# >>> print(tensordict) +# TensorDict( +# fields={ +# action: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False), +# done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), +# observation: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, +# batch_size=torch.Size([]), +# device=cpu, +# is_shared=False) +# >>> env.step(tensordict) +# >>> print(tensordict) +# TensorDict( +# fields={ +# action: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False), +# done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), +# next: TensorDict( +# fields={ +# done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False), +# observation: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False), +# reward: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False)}, +# batch_size=torch.Size([]), +# device=cpu, +# is_shared=False), +# observation: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)}, +# batch_size=torch.Size([]), +# device=cpu, +# is_shared=False) +# +# Notice that the root ``tensordict`` has not changed, the only modification is the +# appearance of a new ``"next"`` entry that contains the new information.
#
# In the Pendulum example, our :meth:`_step` method will read the relevant
# entries from the input ``tensordict`` and compute the position and velocity of
# the pendulum after the force encoded by the ``"action"`` key has been applied
# onto it. We compute the new angular position of the pendulum
# ``"new_th"`` as the result of the previous position ``"th"`` plus the new
# velocity ``"new_thdot"`` over a time interval ``dt``.
#
# Since our goal is to turn the pendulum up and maintain it still in that
# position, our ``cost`` (negative reward) function is lower for positions
# close to the target and low speeds.
# Indeed, we want to discourage positions that are far from being "upward"
# and/or speeds that are far from 0.
#
# In our example, :meth:`EnvBase._step` is encoded as a static method since our
# environment is stateless. In stateful settings, the ``self`` argument is
# needed as the state needs to be read from the environment.
#


def _step(tensordict):
    """Advance the pendulum by one time step.

    Reads the ``"th"``, ``"thdot"`` and ``"action"`` entries and the nested
    ``"params"`` of ``tensordict`` and returns a new ``TensorDict`` with the
    updated angle and angular velocity, the unchanged ``"params"``, the
    ``"reward"`` (negative cost) and an all-``False`` ``"done"`` flag.
    """
    theta, theta_dot = tensordict["th"], tensordict["thdot"]  # th := theta

    # physical parameters travel with the data since the env is stateless
    gravity = tensordict["params", "g"]
    mass = tensordict["params", "m"]
    length = tensordict["params", "l"]
    dt = tensordict["params", "dt"]
    max_torque = tensordict["params", "max_torque"]
    max_speed = tensordict["params", "max_speed"]

    # the action carries a trailing singleton dim; drop it and saturate the torque
    torque = tensordict["action"].squeeze(-1)
    torque = torque.clamp(-max_torque, max_torque)

    # cost grows with the distance to "upward", the speed and the effort
    angle_cost = angle_normalize(theta) ** 2
    speed_cost = 0.1 * theta_dot**2
    effort_cost = 0.001 * (torque**2)
    costs = angle_cost + speed_cost + effort_cost

    gravity_accel = 3 * gravity / (2 * length) * theta.sin()
    torque_accel = 3.0 / (mass * length**2) * torque
    new_theta_dot = theta_dot + (gravity_accel + torque_accel) * dt
    new_theta_dot = new_theta_dot.clamp(-max_speed, max_speed)
    new_theta = theta + new_theta_dot * dt

    reward = -costs.view(*tensordict.shape, 1)
    done = torch.zeros_like(reward, dtype=torch.bool)
    return TensorDict(
        {
            "th": new_theta,
            "thdot": new_theta_dot,
            "params": tensordict["params"],
            "reward": reward,
            "done": done,
        },
        tensordict.shape,
    )


def angle_normalize(x):
    """Wrap the angle ``x`` (radians) into the interval ``[-pi, pi)``."""
    full_turn = 2 * torch.pi
    shifted = x + torch.pi
    return (shifted % full_turn) - torch.pi


######################################################################
# Resetting the simulator: :func:`~torchrl.envs.EnvBase._reset`
# -------------------------------------------------------------
#
# The second method we need to care about is the
# :meth:`~torchrl.envs.EnvBase._reset` method. Like
# :meth:`~torchrl.envs.EnvBase._step`, it should write the observation entries
# and possibly a done state in the ``tensordict`` it outputs (if the done state is
# omitted, it will be filled as ``False`` by the parent method
# :meth:`~torchrl.envs.EnvBase.reset`). In some contexts, it is required that
# the ``_reset`` method receives a command from the function that called
# it (for example, in multi-agent settings we may want to indicate which agents need
# to be reset). This is why the :meth:`~torchrl.envs.EnvBase._reset` method
# also expects a ``tensordict`` as input, albeit it may perfectly be empty or
# ``None``.
#
# The parent :meth:`EnvBase.reset` does some simple checks like the
# :meth:`EnvBase.step` does, such as making sure that a ``"done"`` state
# is returned in the output ``tensordict`` and that the shapes match what is
# expected from the specs.
#
# For us, the only important thing to consider is whether
# :meth:`EnvBase._reset` contains all the expected observations. Once more,
# since we are working with a stateless environment, we pass the configuration
# of the pendulum in a nested ``tensordict`` named ``"params"``.
#
# In this example, we do not pass a done state as this is not mandatory
# for :meth:`_reset` and our environment is non-terminating, so we always
# expect it to be ``False``.
#


def _reset(self, tensordict):
    """Sample a random initial angle and angular velocity.

    When ``tensordict`` is ``None`` or empty, a single set of hyperparameters
    is generated with ``self.gen_params``; otherwise the input is assumed to
    already contain every parameter needed to get started.
    """
    if tensordict is None or tensordict.is_empty():
        tensordict = self.gen_params(batch_size=self.batch_size)

    th_bound = torch.tensor(DEFAULT_X, device=self.device)
    thdot_bound = torch.tensor(DEFAULT_Y, device=self.device)

    def _uniform(bound):
        # draw uniformly in [-bound, bound) from the environment generator
        low = -bound
        noise = torch.rand(tensordict.shape, generator=self.rng, device=self.device)
        return noise * (bound - low) + low

    # for non batch-locked environments, the input ``tensordict`` shape dictates the number
    # of simulators run simultaneously. In other contexts, the initial
    # random state's shape will depend upon the environment batch-size instead.
    th = _uniform(th_bound)  # drawn first to keep the random stream order
    thdot = _uniform(thdot_bound)
    return TensorDict(
        {
            "th": th,
            "thdot": thdot,
            "params": tensordict["params"],
        },
        batch_size=tensordict.shape,
    )


######################################################################
# Environment metadata: ``env.*_spec``
# ------------------------------------
#
# The specs define the input and output domain of the environment.
# It is important that the specs accurately define the tensors that will be
# received at runtime, as they are often used to carry information about
# environments in multiprocessing and distributed settings. They can also be
# used to instantiate lazily defined neural networks and test scripts without
# actually querying the environment (which can be costly with real-world
# physical systems for instance).
#
# There are four specs that we must code in our environment:
#
# * :obj:`EnvBase.observation_spec`: This will be a :class:`~torchrl.data.CompositeSpec`
#   instance where each key is an observation (a :class:`CompositeSpec` can be
#   viewed as a dictionary of specs).
+# * :obj:`EnvBase.action_spec`: It can be any type of spec, but it is required +# that it corresponds to the ``"action"`` entry in the input ``tensordict``; +# * :obj:`EnvBase.reward_spec`: provides information about the reward space; +# * :obj:`EnvBase.done_spec`: provides information about the space of the done +# flag. +# +# TorchRL specs are organized in two general containers: ``input_spec`` which +# contains the specs of the information that the step function reads (divided +# between ``action_spec`` containing the action and ``state_spec`` containing +# all the rest), and ``output_spec`` which encodes the specs that the +# step outputs (``observation_spec``, ``reward_spec`` and ``done_spec``). +# In general, you should not interact directly with ``output_spec`` and +# ``input_spec`` but only with their content: ``observation_spec``, +# ``reward_spec``, ``done_spec``, ``action_spec`` and ``state_spec``. +# The reason is that the specs are organized in a non-trivial way +# within ``output_spec`` and +# ``input_spec`` and neither of these should be directly modified. +# +# In other words, the ``observation_spec`` and related properties are +# convenient shortcuts to the content of the output and input spec containers. +# +# TorchRL offers multiple :class:`~torchrl.data.TensorSpec` +# `subclasses `_ to +# encode the environment's input and output characteristics. +# +# Specs shape +# ^^^^^^^^^^^ +# +# The environment specs leading dimensions must match the +# environment batch-size. This is done to enforce that every component of an +# environment (including its transforms) has an accurate representation of +# the expected input and output shapes. This is something that should be +# accurately coded in stateful settings. +# +# For non batch-locked environments, such as the one in our example (see below), +# this is irrelevant as the environment batch size will most likely be empty. 
#


def _make_spec(self, td_params):
    """Build the observation, state, action and reward specs from ``td_params``."""
    speed_limit = td_params["params", "max_speed"]
    torque_limit = td_params["params", "max_torque"]

    angle_spec = BoundedTensorSpec(
        low=-torch.pi,
        high=torch.pi,
        shape=(),
        dtype=torch.float32,
    )
    velocity_spec = BoundedTensorSpec(
        low=-speed_limit,
        high=speed_limit,
        shape=(),
        dtype=torch.float32,
    )
    # Under the hood, this will populate self.output_spec["observation"]
    self.observation_spec = CompositeSpec(
        th=angle_spec,
        thdot=velocity_spec,
        # the ``params`` are part of the observation specs because they are
        # passed along at each step during a rollout
        params=make_composite_from_td(td_params["params"]),
        shape=(),
    )
    # the environment is stateless: the previous output is the next input,
    # and ``EnvBase`` expects a state_spec describing it
    self.state_spec = self.observation_spec.clone()
    # assigning ``self.action_spec`` wraps the spec in ``input_spec`` automatically
    self.action_spec = BoundedTensorSpec(
        low=-torque_limit,
        high=torque_limit,
        shape=(1,),
        dtype=torch.float32,
    )
    self.reward_spec = UnboundedContinuousTensorSpec(shape=(*td_params.shape, 1))


def make_composite_from_td(td):
    """Mirror the structure of ``td`` as a ``CompositeSpec`` of unbounded specs.

    Nested tensordicts are converted recursively; every tensor leaf becomes an
    ``UnboundedContinuousTensorSpec`` with the leaf's dtype, device and shape.
    """
    specs = {}
    for key, tensor in td.items():
        if isinstance(tensor, TensorDictBase):
            specs[key] = make_composite_from_td(tensor)
        else:
            specs[key] = UnboundedContinuousTensorSpec(
                dtype=tensor.dtype, device=tensor.device, shape=tensor.shape
            )
    return CompositeSpec(specs, shape=td.shape)


######################################################################
# Reproducible experiments: seeding
# ---------------------------------
#
# Seeding an environment is a common operation when initializing an experiment.
# The only goal of :func:`EnvBase._set_seed` is to set the seed of the contained
# simulator.
# If possible, this operation should not call ``reset()`` or interact
# with the environment execution. The parent :func:`EnvBase.set_seed` method
# incorporates a mechanism that allows seeding multiple environments with a
# different pseudo-random and reproducible seed.
#


def _set_seed(self, seed: Optional[int]):
    """Seed torch and keep the returned generator on ``self.rng``."""
    self.rng = torch.manual_seed(seed)


######################################################################
# Wrapping things together: the :class:`~torchrl.envs.EnvBase` class
# ------------------------------------------------------------------
#
# We can finally put together the pieces and design our environment class.
# The specs initialization needs to be performed during the environment
# construction, so we must take care of calling the :func:`_make_spec` method
# within :func:`PendulumEnv.__init__`.
#
# We add a static method :meth:`PendulumEnv.gen_params` which deterministically
# generates a set of hyperparameters to be used during execution:
#


def gen_params(g=10.0, batch_size=None) -> TensorDictBase:
    """Return a ``tensordict`` with the physical parameters of the pendulum.

    The nested ``"params"`` entry holds the gravitational force ``g``, the
    torque and speed limits, the time step, the mass and the length. When a
    non-empty ``batch_size`` is given, the result is expanded to that shape.
    """
    physics = {
        "max_speed": 8,
        "max_torque": 2.0,
        "dt": 0.05,
        "g": g,
        "m": 1.0,
        "l": 1.0,
    }
    td = TensorDict({"params": TensorDict(physics, [])}, [])
    # ``None`` and an empty batch size both mean "no batch dimension"
    if not batch_size:
        return td
    return td.expand(batch_size).contiguous()


######################################################################
# We define the environment as non-``batch_locked`` by turning the ``homonymous``
# attribute to ``False``. This means that we will **not** enforce the input
# ``tensordict`` to have a ``batch-size`` that matches the one of the environment.
#
# The following code will just put together the pieces we have coded above.
#


class PendulumEnv(EnvBase):
    """Pendulum environment assembled from the module-level functions above.

    Args:
        td_params: parameters used to build the specs; generated with
            ``gen_params()`` when ``None``.
        seed: seed passed to ``set_seed``; drawn at random when ``None``.
        device: device forwarded to ``EnvBase``.
    """

    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 30,
    }
    # the input ``tensordict`` batch size is not required to match the env's
    batch_locked = False

    def __init__(self, td_params=None, seed=None, device="cpu"):
        if td_params is None:
            td_params = self.gen_params()

        # the parent must be initialized before the specs can be assigned
        super().__init__(device=device, batch_size=[])
        self._make_spec(td_params)
        if seed is None:
            # draw a random int64 seed when none is provided
            seed = torch.empty((), dtype=torch.int64).random_().item()
        self.set_seed(seed)

    # Helpers: _make_spec and gen_params
    gen_params = staticmethod(gen_params)
    _make_spec = _make_spec

    # Mandatory methods: _step, _reset and _set_seed
    _reset = _reset
    _step = staticmethod(_step)
    _set_seed = _set_seed


######################################################################
# Testing our environment
# -----------------------
#
# TorchRL provides a simple function :func:`~torchrl.envs.utils.check_env_specs`
# to check that a (transformed) environment has an input/output structure that
# matches the one dictated by its specs.
# Let us try it out:
#

env = PendulumEnv()
check_env_specs(env)

######################################################################
# We can have a look at our specs to have a visual representation of the environment
# signature:
#

print("observation_spec:", env.observation_spec)
print("state_spec:", env.state_spec)
print("reward_spec:", env.reward_spec)

######################################################################
# We can execute a couple of commands too to check that the output structure
# matches what is expected.

td = env.reset()
print("reset tensordict", td)

######################################################################
# We can run the :func:`env.rand_step` to generate
# an action randomly from the ``action_spec`` domain. A ``tensordict`` containing
# the hyperparameters and the current state **must** be passed since our
# environment is stateless. In stateful contexts, ``env.rand_step()`` works
# perfectly too.
#
td = env.rand_step(td)
print("random step tensordict", td)

######################################################################
# Transforming an environment
# ---------------------------
#
# Writing environment transforms for stateless simulators is slightly more
# complicated than for stateful ones: transforming an output entry that needs
# to be read at the following iteration requires applying the inverse transform
# before calling :func:`meth.step` at the next step.
# This is an ideal scenario to showcase all the features of TorchRL's
# transforms!
#
# For instance, in the following transformed environment we ``unsqueeze`` the entries
# ``["th", "thdot"]`` to be able to stack them along the last
# dimension. We also pass them as ``in_keys_inv`` to squeeze them back to their
# original shape once they are passed as input in the next iteration.
#
env = TransformedEnv(
    env,
    # ``Unsqueeze`` the observations that we will concatenate
    UnsqueezeTransform(
        unsqueeze_dim=-1,
        in_keys=["th", "thdot"],
        in_keys_inv=["th", "thdot"],
    ),
)

######################################################################
# Writing custom transforms
# ^^^^^^^^^^^^^^^^^^^^^^^^^
#
# TorchRL's transforms may not cover all the operations one wants to execute
# after an environment has been executed.
# Writing a transform does not require much effort. As for the environment
# design, there are two steps in writing a transform:
#
# - Getting the dynamics right (forward and inverse);
# - Adapting the environment specs.
#
# A transform can be used in two settings: on its own, it can be used as a
# :class:`~torch.nn.Module`. It can also be used appended to a
# :class:`~torchrl.envs.transforms.TransformedEnv`. The structure of the class allows to
# customize the behavior in the different contexts.
#
# A :class:`~torchrl.envs.transforms.Transform` skeleton can be summarized as follows:
#
# ..
code-block:: +# +# class Transform(nn.Module): +# def forward(self, tensordict): +# ... +# def _apply_transform(self, tensordict): +# ... +# def _step(self, tensordict): +# ... +# def _call(self, tensordict): +# ... +# def inv(self, tensordict): +# ... +# def _inv_apply_transform(self, tensordict): +# ... +# +# There are three entry points (:func:`forward`, :func:`_step` and :func:`inv`) +# which all receive :class:`tensordict.TensorDict` instances. The first two +# will eventually go through the keys indicated by :obj:`~torchrl.envs.transforms.Transform.in_keys` +# and call :meth:`~torchrl.envs.transforms.Transform._apply_transform` to each of these. The results will +# be written in the entries pointed by :obj:`Transform.out_keys` if provided +# (if not the ``in_keys`` will be updated with the transformed values). +# If inverse transforms need to be executed, a similar data flow will be +# executed but with the :func:`Transform.inv` and +# :func:`Transform._inv_apply_transform` methods and across the ``in_keys_inv`` +# and ``out_keys_inv`` list of keys. +# The following figure summarizes this flow for environments and replay +# buffers. +# +# +# Transform API +# +# In some cases, a transform will not work on a subset of keys in a unitary +# manner, but will execute some operation on the parent environment or +# work with the entire input ``tensordict``. +# In those cases, the :func:`_call` and :func:`forward` methods should be +# re-written, and the :func:`_apply_transform` method can be skipped. 
#
# Let us code new transforms that will compute the ``sine`` and ``cosine``
# values of the position angle, as these values are more useful to us to learn
# a policy than the raw angle value:


class SinTransform(Transform):
    """Transform writing the sine of each ``in_keys`` entry to its ``out_keys`` entry."""

    def _apply_transform(self, obs: torch.Tensor) -> torch.Tensor:
        # the transformed tensor is returned (the previous ``-> None``
        # annotation contradicted this)
        return obs.sin()

    # The transform must also modify the data at reset time
    def _reset(
        self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
    ) -> TensorDictBase:
        return self._call(tensordict_reset)

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        # a sine is bounded in [-1, 1]; shape, dtype and device are unchanged
        return BoundedTensorSpec(
            low=-1,
            high=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


class CosTransform(Transform):
    """Transform writing the cosine of each ``in_keys`` entry to its ``out_keys`` entry."""

    def _apply_transform(self, obs: torch.Tensor) -> torch.Tensor:
        # the transformed tensor is returned (the previous ``-> None``
        # annotation contradicted this)
        return obs.cos()

    # The transform must also modify the data at reset time
    def _reset(
        self, tensordict: TensorDictBase, tensordict_reset: TensorDictBase
    ) -> TensorDictBase:
        return self._call(tensordict_reset)

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        # a cosine is bounded in [-1, 1]; shape, dtype and device are unchanged
        return BoundedTensorSpec(
            low=-1,
            high=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


t_sin = SinTransform(in_keys=["th"], out_keys=["sin"])
t_cos = CosTransform(in_keys=["th"], out_keys=["cos"])
env.append_transform(t_sin)
env.append_transform(t_cos)

######################################################################
# Concatenates the observations onto an "observation" entry.
# ``del_keys=False`` ensures that we keep these values for the next
# iteration.
cat_transform = CatTensors(
    in_keys=["sin", "cos", "thdot"], dim=-1, out_key="observation", del_keys=False
)
env.append_transform(cat_transform)

######################################################################
# Once more, let us check that our environment specs match what is received:
check_env_specs(env)

######################################################################
# Executing a rollout
# -------------------
#
# Executing a rollout is a succession of simple steps:
#
# * reset the environment
# * while some condition is not met:
#
#     * compute an action given a policy
#     * execute a step given this action
#     * collect the data
#     * make a ``MDP`` step
#
# * gather the data and return
#
# These operations have been conveniently wrapped in the :meth:`~torchrl.envs.EnvBase.rollout`
# method, from which we provide a simplified version here below.


def simple_rollout(steps=100):
    """Run ``steps`` random-action steps in ``env`` and return the stacked data."""
    # one preallocated slot per time step
    trajectory = TensorDict({}, [steps])
    current = env.reset()
    for t in range(steps):
        # random action -> environment step -> store -> move to the next state
        current["action"] = env.action_spec.rand()
        current = env.step(current)
        trajectory[t] = current
        current = step_mdp(current, keep_other=True)
    return trajectory


print("data from rollout:", simple_rollout(100))

######################################################################
# Batching computations
# ---------------------
#
# The last unexplored end of our tutorial is the ability that we have to
# batch computations in TorchRL. Because our environment does not
# make any assumptions regarding the input data shape, we can seamlessly
# execute it over batches of data. Even better: for non-batch-locked
# environments such as our Pendulum, we can change the batch size on the fly
# without recreating the environment.
# To do this, we just generate parameters with the desired shape.
#

batch_size = 10  # number of environments to be executed in batch
td = env.reset(env.gen_params(batch_size=[batch_size]))
print("reset (batch size of 10)", td)
td = env.rand_step(td)
print("rand step (batch size of 10)", td)

######################################################################
# Executing a rollout with a batch of data requires us to reset the environment
# outside of the rollout function, since we need to define the batch_size
# dynamically and this is not supported by :meth:`~torchrl.envs.EnvBase.rollout`:
#

rollout = env.rollout(
    3,
    auto_reset=False,  # we're executing the reset out of the ``rollout`` call
    tensordict=env.reset(env.gen_params(batch_size=[batch_size])),
)
print("rollout of len 3 (batch size of 10):", rollout)


######################################################################
# Training a simple policy
# ------------------------
#
# In this example, we will train a simple policy using the reward as a
# differentiable objective, such as a negative loss.
# We will take advantage of the fact that our dynamic system is fully
# differentiable to backpropagate through the trajectory return and adjust the
# weights of our policy to maximize this value directly. Of course, in many
# settings many of the assumptions we make do not hold, such as
# differentiable system and full access to the underlying mechanics.
#
# Still, this is a very simple example that showcases how a training loop can
# be coded with a custom environment in TorchRL.
#
# Let us first write the policy network:
#
torch.manual_seed(0)
env.set_seed(0)

net = nn.Sequential(
    nn.LazyLinear(64),
    nn.Tanh(),
    nn.LazyLinear(64),
    nn.Tanh(),
    nn.LazyLinear(64),
    nn.Tanh(),
    nn.LazyLinear(1),
)
policy = TensorDictModule(
    net,
    in_keys=["observation"],
    out_keys=["action"],
)

######################################################################
# and our optimizer:
#

optim = torch.optim.Adam(policy.parameters(), lr=2e-3)

######################################################################
# Training loop
# ^^^^^^^^^^^^^
#
# We will successively:
#
# * generate a trajectory
# * sum the rewards
# * backpropagate through the graph defined by these operations
# * clip the gradient norm and make an optimization step
# * repeat
#
# At the end of the training loop, we should have a final reward close to 0
# which demonstrates that the pendulum is upward and still as desired.
#
batch_size = 32
pbar = tqdm.tqdm(range(20_000 // batch_size))
# NOTE(review): ``T_max`` is 20_000 while the loop below only runs
# ``20_000 // batch_size`` iterations, so the cosine schedule covers a small
# part of its period - confirm this is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, 20_000)
logs = defaultdict(list)

for _ in pbar:
    init_td = env.reset(env.gen_params(batch_size=[batch_size]))
    rollout = env.rollout(100, policy, tensordict=init_td, auto_reset=False)
    traj_return = rollout["next", "reward"].mean()
    # maximize the return by minimizing its negative
    (-traj_return).backward()
    gn = torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
    optim.step()
    optim.zero_grad()
    pbar.set_description(
        f"reward: {traj_return: 4.4f}, "
        f"last reward: {rollout[..., -1]['next', 'reward'].mean(): 4.4f}, gradient norm: {gn: 4.4}"
    )
    logs["return"].append(traj_return.item())
    logs["last_reward"].append(rollout[..., -1]["next", "reward"].mean().item())
    scheduler.step()


def plot():
    """Plot the logged returns and last rewards side by side."""
    import matplotlib
    from matplotlib import pyplot as plt

    # only notebook ("inline") backends need the IPython display helpers
    is_ipython = "inline" in matplotlib.get_backend()
    if is_ipython:
        from IPython import display

    with plt.ion():
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.plot(logs["return"])
        plt.title("returns")
        plt.xlabel("iteration")
        plt.subplot(1, 2, 2)
        plt.plot(logs["last_reward"])
        plt.title("last reward")
        plt.xlabel("iteration")
        if is_ipython:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        plt.show()


plot()


######################################################################
# Conclusion
# ----------
#
# In this tutorial, we have learned how to code a stateless environment from
# scratch. We touched the subjects of:
#
# * The four essential components that need to be taken care of when coding
#   an environment (``step``, ``reset``, seeding and building specs).
#   We saw how these methods and classes interact with the
#   :class:`~tensordict.TensorDict` class;
# * How to test that an environment is properly coded using
#   :func:`~torchrl.envs.utils.check_env_specs`;
# * How to append transforms in the context of stateless environments and how
#   to write custom transformations;
# * How to train a policy on a fully differentiable simulator.
#
diff --git a/advanced_source/privateuseone.rst b/advanced_source/privateuseone.rst index 494e5beb387..5b5b37c20e2 100644 --- a/advanced_source/privateuseone.rst +++ b/advanced_source/privateuseone.rst @@ -226,7 +226,7 @@ The primary goal of integrating new devices through ``PrivateUse1`` is to meet t and the next thing to do is to improve usability, which mainly involves the following aspects. 1. Register new backend module to Pytorch. -2. Generate methods and properties related to the new backend. +2. Rename PrivateUse1 to a custom name for the new backend. 3. Generate methods and properties related to the new backend. Register new backend module to Pytorch diff --git a/advanced_source/python_custom_ops.py b/advanced_source/python_custom_ops.py new file mode 100644 index 00000000000..5ace0b40897 --- /dev/null +++ b/advanced_source/python_custom_ops.py @@ -0,0 +1,275 @@ +# -*- coding: utf-8 -*- + +""" +.. 
_python-custom-ops-tutorial: + +Custom Python Operators +======================= + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to integrate custom operators written in Python with PyTorch + * How to test custom operators using ``torch.library.opcheck`` + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 or later + +PyTorch offers a large library of operators that work on Tensors (e.g. +``torch.add``, ``torch.sum``, etc). However, you might wish to use a new customized +operator with PyTorch, perhaps written by a third-party library. This tutorial +shows how to wrap Python functions so that they behave like PyTorch native +operators. Reasons why you may wish to create a custom operator in PyTorch include: + +- Treating an arbitrary Python function as an opaque callable with respect + to ``torch.compile`` (that is, prevent ``torch.compile`` from tracing + into the function). +- Adding training support to an arbitrary Python function + +Use :func:`torch.library.custom_op` to create Python custom operators. +Use the C++ ``TORCH_LIBRARY`` APIs to create C++ custom operators (these +work in Python-less environments). +See the `Custom Operators Landing Page `_ +for more details. + +Please note that if your operation can be expressed as a composition of +existing PyTorch operators, then there is usually no need to use the custom operator +API -- everything (for example ``torch.compile``, training support) should +just work. +""" +###################################################################### +# Example: Wrapping PIL's crop into a custom operator +# --------------------------------------------------- +# Let's say that we are using PIL's ``crop`` operation. 

import torch
from torchvision.transforms.functional import to_pil_image, pil_to_tensor
import PIL
import IPython
import matplotlib.pyplot as plt

def crop(pic, box):
    """Crop the image tensor ``pic`` to ``box`` with PIL; the result is scaled by 1/255."""
    pil_crop = to_pil_image(pic.cpu()).crop(box)
    as_tensor = pil_to_tensor(pil_crop).to(pic.device)
    return as_tensor / 255.

def display(img):
    """Show a channel-first image tensor with matplotlib."""
    channels_last = img.numpy().transpose((1, 2, 0))
    plt.imshow(channels_last)

img = torch.ones(3, 64, 64)
img *= torch.linspace(0, 1, steps=64) * torch.linspace(0, 1, steps=64).unsqueeze(-1)
display(img)

######################################################################

cropped_img = crop(img, (10, 10, 50, 50))
display(cropped_img)

######################################################################
# ``crop`` is not handled effectively out-of-the-box by
# ``torch.compile``: ``torch.compile`` induces a
# `"graph break" `_
# on functions it is unable to handle and graph breaks are bad for performance.
# The following code demonstrates this by raising an error
# (``torch.compile`` with ``fullgraph=True`` raises an error if a
# graph break occurs).

@torch.compile(fullgraph=True)
def f(img):
    """Compiled wrapper around ``crop`` with a fixed box."""
    box = (10, 10, 50, 50)
    return crop(img, box)

# The following raises an error. Uncomment the line to see it.
# cropped_img = f(img)

######################################################################
# In order to black-box ``crop`` for use with ``torch.compile``, we need to
# do two things:
#
# 1. wrap the function into a PyTorch custom operator.
# 2. add a "``FakeTensor`` kernel" (aka "meta kernel") to the operator.
#    Given some ``FakeTensors`` inputs (dummy Tensors that don't have storage),
#    this function should return dummy Tensors of your choice with the correct
#    Tensor metadata (shape/strides/``dtype``/device).


from typing import Sequence

# Use torch.library.custom_op to define a new custom operator.
# If your operator mutates any input Tensors, their names must be specified
# in the ``mutates_args`` argument.
@torch.library.custom_op("mylib::crop", mutates_args=())
def crop(pic: torch.Tensor, box: Sequence[int]) -> torch.Tensor:
    """Custom-operator version of PIL's crop; keeps the device and dtype of ``pic``."""
    pil_crop = to_pil_image(pic.cpu()).crop(box)
    scaled = pil_to_tensor(pil_crop) / 255.
    return scaled.to(pic.device, pic.dtype)

# Use register_fake to add a ``FakeTensor`` kernel for the operator
@crop.register_fake
def _(pic, box):
    # only metadata is needed here: same channels, box-sized height and width
    left, upper, right, lower = box
    return pic.new_empty(pic.shape[0], lower - upper, right - left)

######################################################################
# After this, ``crop`` now works without graph breaks:

@torch.compile(fullgraph=True)
def f(img):
    """Compiled wrapper around the ``crop`` custom operator with a fixed box."""
    box = (10, 10, 50, 50)
    return crop(img, box)

cropped_img = f(img)
display(img)

######################################################################

display(cropped_img)

######################################################################
# Adding training support for crop
# --------------------------------
# Use ``torch.library.register_autograd`` to add training support for an operator.
# Prefer this over directly using ``torch.autograd.Function``; some compositions of
# ``autograd.Function`` with PyTorch operator registration APIs can lead to (and
# has led to) silent incorrectness when composed with ``torch.compile``.
#
# If you don't need training support, there is no need to use
# ``torch.library.register_autograd``.
# If you end up training with a ``custom_op`` that doesn't have an autograd
# registration, we'll raise an error message.
#
# The gradient formula for ``crop`` is essentially ``PIL.paste`` (we'll leave the
# derivation as an exercise to the reader).
# Let's first wrap ``paste`` into a
# custom operator:

@torch.library.custom_op("mylib::paste", mutates_args=())
def paste(im1: torch.Tensor, im2: torch.Tensor, coord: Sequence[int]) -> torch.Tensor:
    """Paste ``im2`` onto a copy of ``im1`` at ``coord`` using PIL."""
    assert im1.device == im2.device
    assert im1.dtype == im2.dtype
    base = to_pil_image(im1.cpu())
    patch = to_pil_image(im2.cpu())
    # PIL pastes in place on ``base``
    base.paste(patch, coord)
    scaled = pil_to_tensor(base) / 255.
    return scaled.to(im1.device, im1.dtype)

@paste.register_fake
def _(im1, im2, coord):
    # the result has the metadata of ``im1``
    assert im1.device == im2.device
    assert im1.dtype == im2.dtype
    return torch.empty_like(im1)

######################################################################
# And now let's use ``register_autograd`` to specify the gradient formula for ``crop``:

def backward(ctx, grad_output):
    """Scatter ``grad_output`` back into a zero image of the input shape."""
    canvas = grad_output.new_zeros(ctx.pic_shape)
    # ``box`` is not differentiable, hence the ``None``
    return paste(canvas, grad_output, ctx.coords), None

def setup_context(ctx, inputs, output):
    """Save the paste position and the input shape for ``backward``."""
    pic, box = inputs
    ctx.coords = box[:2]
    ctx.pic_shape = pic.shape

crop.register_autograd(backward, setup_context=setup_context)

######################################################################
# Note that the backward must be a composition of PyTorch-understood operators,
# which is why we wrapped paste into a custom operator instead of directly using
# PIL's paste.

img = img.requires_grad_()
result = crop(img, (10, 10, 50, 50))
result.sum().backward()
display(img.grad)

######################################################################
# This is the correct gradient, with 1s (white) in the cropped region and 0s
# (black) in the unused region.

######################################################################
# Testing Python Custom operators
# -------------------------------
# Use ``torch.library.opcheck`` to test that the custom operator was registered
# correctly.
# This does not test that the gradients are mathematically correct;
# please write separate tests for that (either manual ones or ``torch.autograd.gradcheck``).
#
# To use ``opcheck``, pass it a set of example inputs to test against. If your
# operator supports training, then the examples should include Tensors that
# require grad. If your operator supports multiple devices, then the examples
# should include Tensors from each device.

examples = [
    [torch.randn(3, 64, 64), [0, 0, 10, 10]],
    [torch.randn(3, 91, 91, requires_grad=True), [10, 0, 20, 10]],
    [torch.randn(3, 60, 60, dtype=torch.double), [3, 4, 32, 20]],
    [torch.randn(3, 512, 512, requires_grad=True, dtype=torch.double), [3, 4, 32, 45]],
]

for example in examples:
    torch.library.opcheck(crop, example)

######################################################################
# Mutable Python Custom operators
# -------------------------------
# You can also wrap a Python function that mutates its inputs into a custom
# operator.
# Functions that mutate inputs are common because that is how many low-level
# kernels are written; for example, a kernel that computes ``sin`` may take in
# the input and an output tensor and write ``input.sin()`` to the output tensor.
#
# We'll use ``numpy.sin`` to demonstrate an example of a mutable Python
# custom operator.

import numpy as np

@torch.library.custom_op("mylib::numpy_sin", mutates_args={"output"}, device_types="cpu")
def numpy_sin(input: torch.Tensor, output: torch.Tensor) -> None:
    """Write ``sin(input)`` into ``output`` in place using NumPy (CPU only)."""
    assert input.device == output.device
    assert input.device.type == "cpu"
    # ``out=`` makes NumPy write straight into the array obtained from ``output``
    np.sin(input.numpy(), out=output.numpy())

######################################################################
# Because the operator doesn't return anything, there is no need to register
# a ``FakeTensor`` kernel (meta kernel) to get it to work with ``torch.compile``.

@torch.compile(fullgraph=True)
def f(x):
    """Compiled function calling the mutable ``numpy_sin`` custom operator."""
    # ``numpy_sin`` fills ``out`` in place
    out = torch.empty(3)
    numpy_sin(x, out)
    return out

x = torch.randn(3)
y = f(x)
assert torch.allclose(y, x.sin())

######################################################################
# And here's an ``opcheck`` run telling us that we did indeed register the operator correctly.
# ``opcheck`` would error out if we forgot to add the output to ``mutates_args``, for example.

example_inputs = [
    [torch.randn(3), torch.empty(3)],
    [torch.randn(0, 3), torch.empty(0, 3)],
    [torch.randn(1, 2, 3, 4, dtype=torch.double), torch.empty(1, 2, 3, 4, dtype=torch.double)],
]

for example in example_inputs:
    torch.library.opcheck(numpy_sin, example)

######################################################################
# Conclusion
# ----------
# In this tutorial, we learned how to use ``torch.library.custom_op`` to
# create a custom operator in Python that works with PyTorch subsystems
# such as ``torch.compile`` and autograd.
#
# This tutorial provides a basic introduction to custom operators.
# For more detailed information, see:
#
# - `the torch.library documentation `_
# - `the Custom Operators Manual `_
#
diff --git a/advanced_source/semi_structured_sparse.py b/advanced_source/semi_structured_sparse.py new file mode 100644 index 00000000000..38c2c6878b3 --- /dev/null +++ b/advanced_source/semi_structured_sparse.py @@ -0,0 +1,651 @@ +# -*- coding: utf-8 -*- +""" +(beta) Accelerating BERT with semi-structured (2:4) sparsity +===================================================== +**Author**: `Jesse Cai `_ + +""" + +#################################################################### +# Overview +# -------- +# +# Like other forms of sparsity, **semi-structured sparsity** is a model +# optimization technique that seeks to reduce the memory overhead and +# latency of a neural network at the expense of some model accuracy. 
It is +# also known as **fine-grained structured sparsity** or **2:4 structured +# sparsity**. +# +# Semi-structured sparsity derives its name from its unique sparsity +# pattern, where n out of every 2n elements are pruned. We most often see +# n=2, hence 2:4 sparsity. Semi-structured sparsity is particularly +# interesting because it can be efficiently accelerated on GPUs and +# doesn’t degrade model accuracy as much as other sparsity patterns. +# +# With the introduction of +# `semi-structured sparsity support `_, +# it is possible to prune and accelerate a semi-structured sparse model +# without leaving PyTorch. We will explain this process in this tutorial. +# +# .. image:: ../../_static/img/pruning_flow.jpg +# +# By the end of this tutorial, we will have sparsified a BERT +# question-answering model to be 2:4 sparse, fine-tuning it to recover +# nearly all F1 loss (86.92 dense vs 86.48 sparse). Finally, we will +# accelerate this 2:4 sparse model for inference, yielding a 1.3x speedup. +# + +##################################################### +# Requirements +# ------------ +# +# - PyTorch >= 2.1. +# - An NVIDIA GPU with semi-structured sparsity support (Compute +# Capability 8.0+). +# +# This tutorial is designed for beginners to semi-structured sparsity and +# sparsity in general. For users with existing 2:4 sparse models, +# accelerating ``nn.Linear`` layers for inference with +# ``to_sparse_semi_structured`` is quite straightforward.
Here is an example: +# + +import torch +from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor +from torch.utils.benchmark import Timer +SparseSemiStructuredTensor._FORCE_CUTLASS = True + +# mask Linear weight to be 2:4 sparse +mask = torch.Tensor([0, 0, 1, 1]).tile((3072, 2560)).cuda().bool() +linear = torch.nn.Linear(10240, 3072).half().cuda().eval() +linear.weight = torch.nn.Parameter(mask * linear.weight) + +x = torch.rand(3072, 10240).half().cuda() + +with torch.inference_mode(): + dense_output = linear(x) + dense_t = Timer(stmt="linear(x)", + globals={"linear": linear, + "x": x}).blocked_autorange().median * 1e3 + + # accelerate via SparseSemiStructuredTensor + linear.weight = torch.nn.Parameter(to_sparse_semi_structured(linear.weight)) + + sparse_output = linear(x) + sparse_t = Timer(stmt="linear(x)", + globals={"linear": linear, + "x": x}).blocked_autorange().median * 1e3 + + # sparse and dense matmul are numerically equivalent + # On an A100 80GB, we see: `Dense: 0.870ms Sparse: 0.630ms | Speedup: 1.382x` + assert torch.allclose(sparse_output, dense_output, atol=1e-3) + print(f"Dense: {dense_t:.3f}ms Sparse: {sparse_t:.3f}ms | Speedup: {(dense_t / sparse_t):.3f}x") + + +###################################################################### +# What problem does semi-structured sparsity solve? +# ------------------------------------------------- +# +# The general motivation behind sparsity is simple: if there are zeros in +# your network, you can optimize efficiency by not storing or computing those +# parameters. However, the specifics of sparsity are tricky. Zeroing out +# parameters doesn’t affect the latency / memory overhead of our model out +# of the box. +# +# This is because the dense tensor still contains the pruned (zero) +# elements, which the dense matrix multiplication kernel will still +# operate on.
In order to realize performance gains, we need +# to swap out dense kernels for sparse kernels, which skip calculation +# involving pruned elements. +# +# To do this, these kernels work on sparse matrices, which do not store +# the pruned elements and store the specified elements in a compressed +# format. +# +# For semi-structured sparsity, we store exactly half of the original +# parameters along with some compressed metadata about how the elements +# were arranged. +# +# .. image:: https://developer-blogs.nvidia.com/wp-content/uploads/2023/06/2-4-structured-sparsity-pattern.png +# :align: center :width: 80% +# +# Image sourced from `NVIDIA blog post `_ on semi-structured sparsity. +# +# There are many different sparse layouts, each with their own benefits +# and drawbacks. The 2:4 semi-structured sparse layout is particularly +# interesting for two reasons: +# +# * Unlike previous sparse formats, +# semi-structured sparsity was designed to be efficiently accelerated on +# GPUs. In 2020, NVIDIA introduced hardware support for semi-structured +# sparsity with their Ampere architecture, and have also released fast +# sparse kernels via +# CUTLASS `cuSPARSELt `__. +# +# * At the same time, semi-structured sparsity tends to have a milder +# impact on model accuracy compared to other sparse formats, especially +# when accounting for more advanced pruning / fine-tuning methods. NVIDIA +# has shown in their `white paper `_ +# that a simple paradigm of magnitude pruning once to be 2:4 sparse and +# then retraining the model yields nearly identical model accuracies. +# +# Semi-structured exists in a sweet spot, providing a 2x (theoretical) +# speedup at a much lower sparsity level (50%), while still being granular +# enough to preserve model accuracy. 
+# +# +---------------------+-------------+--------+------------+-------------+ +# | Network | Data Set | Metric | Dense FP16 | Sparse FP16 | +# +=====================+=============+========+============+=============+ +# | ResNet-50 | ImageNet | Top-1 | 76.1 | 76.2 | +# +---------------------+-------------+--------+------------+-------------+ +# | ResNeXt-101_32x8d | ImageNet | Top-1 | 79.3 | 79.3 | +# +---------------------+-------------+--------+------------+-------------+ +# | Xception | ImageNet | Top-1 | 79.2 | 79.2 | +# +---------------------+-------------+--------+------------+-------------+ +# | SSD-RN50 | COCO2017 | bbAP | 24.8 | 24.8 | +# +---------------------+-------------+--------+------------+-------------+ +# | MaskRCNN-RN50 | COCO2017 | bbAP | 37.9 | 37.9 | +# +---------------------+-------------+--------+------------+-------------+ +# | FairSeq Transformer | EN-DE WMT14 | BLEU | 28.2 | 28.5 | +# +---------------------+-------------+--------+------------+-------------+ +# | BERT-Large | SQuAD v1.1 | F1 | 91.9 | 91.9 | +# +---------------------+-------------+--------+------------+-------------+ +# +# Semi-structured sparsity has an additional advantage from a workflow +# perspective. Because the sparsity level is fixed at 50%, it is easier to +# decompose the problem of sparsifying a model into two distinct +# subproblems: +# +# - Accuracy - How can we find a set of 2:4 sparse weights that minimize +# the accuracy degradation of our model? +# +# - Performance - How can we accelerate our 2:4 sparse weights for +# inference and reduced memory overhead? +# + +##################################################################### +# .. math:: +# +# \begin{bmatrix} +# 1 & 1 & 0 & 0 \\ +# 0 & 0 & 1 & 1 \\ +# 1 & 0 & 0 & 0 \\ +# 0 & 0 & 1 & 1 \\ +# \end{bmatrix} +# +# The natural handoff point between these two problems are zeroed-out +# dense tensors. Our inference solution is designed to compress and +# accelerate tensors in this format. 
We anticipate many users coming up +# with custom masking solutions, as this is an active area of research. +# +# Now that we’ve learned a little more about semi-structured sparsity, +# let’s apply it to a BERT model trained on a question answering task, +# SQuAD. +# +# Intro & Setup +# ------------- +# +# Let’s start by importing all the packages we need. +# + +# If you are running this in Google Colab, run: +# .. code-block:: python +# +# !pip install datasets transformers evaluate accelerate pandas +# +import os +os.environ["WANDB_DISABLED"] = "true" + +import collections +import datasets +import evaluate +import numpy as np +import torch +import torch.utils.benchmark as benchmark +from torch import nn +from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor +from torch.ao.pruning import WeightNormSparsifier +import transformers + +# force CUTLASS use if ``cuSPARSELt`` is not available +SparseSemiStructuredTensor._FORCE_CUTLASS = True +torch.manual_seed(100) + + +###################################################################### +# We’ll also need to define some helper functions that are specific to the +# dataset / task at hand. These were adapted from +# `this `__ +# Hugging Face course as a reference.
+# + +def preprocess_validation_function(examples, tokenizer): + inputs = tokenizer( + [q.strip() for q in examples["question"]], + examples["context"], + max_length=384, + truncation="only_second", + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + sample_map = inputs.pop("overflow_to_sample_mapping") + example_ids = [] + + for i in range(len(inputs["input_ids"])): + sample_idx = sample_map[i] + example_ids.append(examples["id"][sample_idx]) + sequence_ids = inputs.sequence_ids(i) + offset = inputs["offset_mapping"][i] + inputs["offset_mapping"][i] = [ + o if sequence_ids[k] == 1 else None for k, o in enumerate(offset) + ] + + inputs["example_id"] = example_ids + return inputs + + +def preprocess_train_function(examples, tokenizer): + inputs = tokenizer( + [q.strip() for q in examples["question"]], + examples["context"], + max_length=384, + truncation="only_second", + return_offsets_mapping=True, + padding="max_length", + ) + + offset_mapping = inputs["offset_mapping"] + answers = examples["answers"] + start_positions = [] + end_positions = [] + + for i, (offset, answer) in enumerate(zip(offset_mapping, answers)): + start_char = answer["answer_start"][0] + end_char = start_char + len(answer["text"][0]) + sequence_ids = inputs.sequence_ids(i) + + # Find the start and end of the context + idx = 0 + while sequence_ids[idx] != 1: + idx += 1 + context_start = idx + while sequence_ids[idx] == 1: + idx += 1 + context_end = idx - 1 + + # If the answer is not fully inside the context, label it (0, 0) + if offset[context_start][0] > end_char or offset[context_end][1] < start_char: + start_positions.append(0) + end_positions.append(0) + else: + # Otherwise it's the start and end token positions + idx = context_start + while idx <= context_end and offset[idx][0] <= start_char: + idx += 1 + start_positions.append(idx - 1) + + idx = context_end + while idx >= context_start and offset[idx][1] >= end_char: + idx -= 1 + 
end_positions.append(idx + 1) + + inputs["start_positions"] = start_positions + inputs["end_positions"] = end_positions + return inputs + + +def compute_metrics(start_logits, end_logits, features, examples): + n_best = 20 + max_answer_length = 30 + metric = evaluate.load("squad") + + example_to_features = collections.defaultdict(list) + for idx, feature in enumerate(features): + example_to_features[feature["example_id"]].append(idx) + + predicted_answers = [] + # for example in ``tqdm`` (examples): + for example in examples: + example_id = example["id"] + context = example["context"] + answers = [] + + # Loop through all features associated with that example + for feature_index in example_to_features[example_id]: + start_logit = start_logits[feature_index] + end_logit = end_logits[feature_index] + offsets = features[feature_index]["offset_mapping"] + + start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist() + end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Skip answers that are not fully in the context + if offsets[start_index] is None or offsets[end_index] is None: + continue + # Skip answers with a length that is either < 0 + # or > max_answer_length + if ( + end_index < start_index + or end_index - start_index + 1 > max_answer_length + ): + continue + + answer = { + "text": context[ + offsets[start_index][0] : offsets[end_index][1] + ], + "logit_score": start_logit[start_index] + end_logit[end_index], + } + answers.append(answer) + + # Select the answer with the best score + if len(answers) > 0: + best_answer = max(answers, key=lambda x: x["logit_score"]) + predicted_answers.append( + {"id": example_id, "prediction_text": best_answer["text"]} + ) + else: + predicted_answers.append({"id": example_id, "prediction_text": ""}) + + theoretical_answers = [ + {"id": ex["id"], "answers": ex["answers"]} for ex in examples + ] + return 
metric.compute(predictions=predicted_answers, references=theoretical_answers) + + +###################################################################### +# Now that those are defined, we just need one additional helper function, +# which will help us benchmark our model. +# + +def measure_execution_time(model, batch_sizes, dataset): + dataset_for_model = dataset.remove_columns(["example_id", "offset_mapping"]) + dataset_for_model.set_format("torch") + batch_size_to_time_sec = {} + for batch_size in batch_sizes: + batch = { + k: dataset_for_model[k][:batch_size].cuda() + for k in dataset_for_model.column_names + } + + with torch.no_grad(): + baseline_predictions = model(**batch) + timer = benchmark.Timer( + stmt="model(**batch)", globals={"model": model, "batch": batch} + ) + p50 = timer.blocked_autorange().median * 1000 + batch_size_to_time_sec[batch_size] = p50 + + model_c = torch.compile(model, fullgraph=True) + timer = benchmark.Timer( + stmt="model(**batch)", globals={"model": model_c, "batch": batch} + ) + p50 = timer.blocked_autorange().median * 1000 + batch_size_to_time_sec[f"{batch_size}_compile"] = p50 + new_predictions = model_c(**batch) + + return batch_size_to_time_sec + + + +###################################################################### +# We will get started by loading our model and tokenizer, and then setting +# up our dataset. 
+# + +# load model +model_name = "bert-base-cased" +tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) +model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name) +print(f"Loading tokenizer: {model_name}") +print(f"Loading model: {model_name}") + +# set up train and val dataset +squad_dataset = datasets.load_dataset("squad") +tokenized_squad_dataset = {} +tokenized_squad_dataset["train"] = squad_dataset["train"].map( + lambda x: preprocess_train_function(x, tokenizer), batched=True +) +tokenized_squad_dataset["validation"] = squad_dataset["validation"].map( + lambda x: preprocess_validation_function(x, tokenizer), + batched=True, + remove_columns=squad_dataset["train"].column_names, +) +data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer) + + +###################################################################### +# Establishing a baseline +# ======================= +# +# Next, we’ll train a quick baseline of our model on SQuAD. This task asks +# our model to identify spans, or segments of text, in a given context +# (Wikipedia articles) that answer a given question. Running the following +# code gives me an F1 score of 86.9. This is quite close to the reported +# NVIDIA score and the difference is likely due to BERT-base +# vs. BERT-large or fine-tuning hyperparameters. +# + +training_args = transformers.TrainingArguments( + "trainer", + num_train_epochs=1, + lr_scheduler_type="constant", + per_device_train_batch_size=32, + per_device_eval_batch_size=256, + logging_steps=50, + # Limit max steps for tutorial runners. Delete the below line to see the reported accuracy numbers. 
+ max_steps=500, + report_to=None, +) + +trainer = transformers.Trainer( + model, + training_args, + train_dataset=tokenized_squad_dataset["train"], + eval_dataset=tokenized_squad_dataset["validation"], + data_collator=data_collator, + tokenizer=tokenizer, +) + +trainer.train() + +# batch sizes to compare for eval +batch_sizes = [4, 16, 64, 256] +# 2:4 sparsity require fp16, so we cast here for a fair comparison +with torch.autocast("cuda"): + with torch.no_grad(): + predictions = trainer.predict(tokenized_squad_dataset["validation"]) + start_logits, end_logits = predictions.predictions + fp16_baseline = compute_metrics( + start_logits, + end_logits, + tokenized_squad_dataset["validation"], + squad_dataset["validation"], + ) + fp16_time = measure_execution_time( + model, + batch_sizes, + tokenized_squad_dataset["validation"], + ) + +print("fp16", fp16_baseline) +print("cuda_fp16 time", fp16_time) + +import pandas as pd +df = pd.DataFrame(trainer.state.log_history) +df.plot.line(x='step', y='loss', title="Loss vs. # steps", ylabel="loss") + + +###################################################################### +# Pruning BERT to be 2:4 sparse +# ----------------------------- +# +# Now that we have our baseline, it’s time we prune BERT. There are many +# different pruning strategies, but one of the most common is **magnitude +# pruning**, which seeks to remove the weights with the lowest L1 norm. +# Magnitude pruning was used by NVIDIA in all their results and is a +# common baseline. +# +# To do this, we will use the ``torch.ao.pruning`` package, which contains +# a weight-norm (magnitude) sparsifier. These sparsifiers work by applying +# mask parametrizations to the weight tensors in a model. This lets them +# simulate sparsity by masking out the pruned weights. +# +# We’ll also have to decide what layers of the model to apply sparsity to, +# which in this case is all of the ``nn.Linear`` layers, except for the +# task-specific head outputs. 
That’s because semi-structured sparsity has +# `shape constraints `_, +# and the task-specific ``nn.Linear`` layers do not satisfy them. +# + +sparsifier = WeightNormSparsifier( + # apply sparsity to all blocks + sparsity_level=1.0, + # shape of 4 elements is a block + sparse_block_shape=(1, 4), + # two zeros for every block of 4 + zeros_per_block=2 +) + +# add to config if ``nn.Linear`` and in the BERT model. +sparse_config = [ + {"tensor_fqn": f"{fqn}.weight"} + for fqn, module in model.named_modules() + if isinstance(module, nn.Linear) and "layer" in fqn +] + + +###################################################################### +# The first step for pruning the model is to insert parametrizations for +# masking the weights of the model. This is done by the prepare step. +# Anytime we try to access the ``.weight`` we will get ``mask * weight`` +# instead. +# + +# Prepare the model, insert fake-sparsity parametrizations for training +sparsifier.prepare(model, sparse_config) +print(model.bert.encoder.layer[0].output) + + +###################################################################### +# Then, we’ll take a single pruning step. All pruners implement an +# ``update_mask()`` method that updates the mask with the logic being +# determined by the pruner implementation. The step method calls this +# ``update_mask`` function for the weights specified in the sparse +# config. +# +# We will also evaluate the model to show the accuracy degradation of +# zero-shot pruning, or pruning without fine-tuning / retraining.
+# + +sparsifier.step() +with torch.autocast("cuda"): + with torch.no_grad(): + predictions = trainer.predict(tokenized_squad_dataset["validation"]) + pruned = compute_metrics( + *predictions.predictions, + tokenized_squad_dataset["validation"], + squad_dataset["validation"], + ) +print("pruned eval metrics:", pruned) + + +###################################################################### +# In this state, we can start fine-tuning the model, updating the elements +# that wouldn’t be pruned to better account for the accuracy loss. Once +# we’ve reached a satisfied state, we can call ``squash_mask`` to fuse the +# mask and the weight together. This will remove the parametrizations and +# we are left with a zeroed-out 2:4 dense model. +# + +trainer.train() +sparsifier.squash_mask() +torch.set_printoptions(edgeitems=4) +print(model.bert.encoder.layer[0].intermediate.dense.weight[:8, :8]) + +df["sparse_loss"] = pd.DataFrame(trainer.state.log_history)["loss"] +df.plot.line(x='step', y=["loss", "sparse_loss"], title="Loss vs. # steps", ylabel="loss") + + +###################################################################### +# Accelerating 2:4 sparse models for inference +# -------------------------------------------- +# +# Now that we have a model in this format, we can accelerate it for +# inference just like in the QuickStart Guide. 
+# + +model = model.cuda().half() +# accelerate for sparsity +for fqn, module in model.named_modules(): + if isinstance(module, nn.Linear) and "layer" in fqn: + module.weight = nn.Parameter(to_sparse_semi_structured(module.weight)) + +with torch.no_grad(): + predictions = trainer.predict(tokenized_squad_dataset["validation"]) +start_logits, end_logits = predictions.predictions +metrics_sparse = compute_metrics( + start_logits, + end_logits, + tokenized_squad_dataset["validation"], + squad_dataset["validation"], +) +print("sparse eval metrics: ", metrics_sparse) +sparse_perf = measure_execution_time( + model, + batch_sizes, + tokenized_squad_dataset["validation"], +) +print("sparse perf metrics: ", sparse_perf) + + +###################################################################### +# Retraining our model after magnitude pruning has recovered nearly all of +# the F1 that was lost when the model was pruned. At the same time we +# have achieved a 1.23x speedup for ``bs=16``. Note that not all shapes are +# amenable to performance improvements. When batch sizes are small and +# limited time is spent in compute, sparse kernels may be slower than their +# dense counterparts. +# +# Because semi-structured sparsity is implemented as a tensor subclass, it +# is compatible with ``torch.compile``. When composed with +# ``to_sparse_semi_structured``, we are able to achieve a total 2x speedup +# on BERT. +# +# ..
table:: +# +# +--------------------+--------+--------------+-----------------+-----------+ +# | Metrics | fp16 | 2:4 sparse | delta / speedup | compiled | +# +====================+========+==============+=================+===========+ +# | Exact Match (%) | 78.53 | 78.44 | -0.09 | | +# +--------------------+--------+--------------+-----------------+-----------+ +# | F1 (%) | 86.93 | 86.49 | -0.44 | | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=4) | 11.10 | 15.54 | 0.71x | no | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=16) | 19.35 | 15.74 | 1.23x | no | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=64) | 72.71 | 59.41 | 1.22x | no | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=256) | 286.65 | 247.63 | 1.14x | no | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=4) | 7.59 | 7.46 | 1.02x | yes | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=16) | 11.47 | 9.68 | 1.18x | yes | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=64) | 41.57 | 36.92 | 1.13x | yes | +# +--------------------+--------+--------------+-----------------+-----------+ +# | Time (bs=256) | 159.22 | 142.23 | 1.12x | yes | +# +--------------------+--------+--------------+-----------------+-----------+ +# +# Conclusion +# ========== +# +# In this tutorial, we have shown how to prune BERT to be 2:4 sparse and +# how to accelerate a 2:4 sparse model for inference. By taking advantage +# of our ``SparseSemiStructuredTensor`` subclass, we were able to achieve a +# 1.3x speedup over the fp16 baseline, and up to 2x with +# ``torch.compile``. We also demonstrated the benefits of 2:4 sparsity by +# fine-tuning BERT to recover any lost F1 (86.92 dense vs 86.48 sparse). 
+# diff --git a/advanced_source/static_quantization_tutorial.rst b/advanced_source/static_quantization_tutorial.rst index 66c2aa4e2fd..efb171c0dfe 100644 --- a/advanced_source/static_quantization_tutorial.rst +++ b/advanced_source/static_quantization_tutorial.rst @@ -59,7 +59,7 @@ to enable quantization: - Replace ReLU6 with ReLU Note: this code is taken from -`here `_. +`here `_. .. code:: python @@ -206,14 +206,15 @@ Note: this code is taken from # Fuse Conv+BN and Conv+BN+Relu modules prior to quantization # This operation does not change the numerics - def fuse_model(self): + def fuse_model(self, is_qat=False): + fuse_modules = torch.ao.quantization.fuse_modules_qat if is_qat else torch.ao.quantization.fuse_modules for m in self.modules(): if type(m) == ConvBNReLU: - torch.ao.quantization.fuse_modules(m, ['0', '1', '2'], inplace=True) + fuse_modules(m, ['0', '1', '2'], inplace=True) if type(m) == InvertedResidual: for idx in range(len(m.conv)): if type(m.conv[idx]) == nn.Conv2d: - torch.ao.quantization.fuse_modules(m.conv, [str(idx), str(idx + 1)], inplace=True) + fuse_modules(m.conv, [str(idx), str(idx + 1)], inplace=True) 2. Helper functions ------------------- @@ -285,7 +286,7 @@ We next define several helper functions to help with model evaluation. These mos def load_model(model_file): model = MobileNetV2() - state_dict = torch.load(model_file) + state_dict = torch.load(model_file, weights_only=True) model.load_state_dict(state_dict) model.to('cpu') return model @@ -434,6 +435,9 @@ values to floats - and then back to ints - between every operation, resulting in # Convert to quantized model torch.ao.quantization.convert(myModel, inplace=True) + # You may see a user warning about needing to calibrate the model. This warning can be safely ignored. + # This warning occurs because not all modules are run in each model runs, so some + # modules may not be calibrated. 
print('Post Training Quantization: Convert done') print('\n Inverted Residual Block: After fusion and quantization, note fused modules: \n\n',myModel.features[1].conv) @@ -533,7 +537,7 @@ We fuse modules as before .. code:: python qat_model = load_model(saved_model_dir + float_model_file) - qat_model.fuse_model() + qat_model.fuse_model(is_qat=True) optimizer = torch.optim.SGD(qat_model.parameters(), lr = 0.0001) # The old 'fbgemm' is still available but 'x86' is the recommended default. diff --git a/advanced_source/super_resolution_with_onnxruntime.py b/advanced_source/super_resolution_with_onnxruntime.py index f0a18948961..264678ee17a 100644 --- a/advanced_source/super_resolution_with_onnxruntime.py +++ b/advanced_source/super_resolution_with_onnxruntime.py @@ -2,14 +2,14 @@ (optional) Exporting a Model from PyTorch to ONNX and Running it using ONNX Runtime =================================================================================== -.. Note:: +.. note:: As of PyTorch 2.1, there are two versions of ONNX Exporter. - * ``torch.onnx.dynamo_export`is the newest (still in beta) exporter based on the TorchDynamo technology released with PyTorch 2.0 - * ``torch.onnx.export`` is based on TorchScript backend and has been available since PyTorch 1.2.0 + * ``torch.onnx.dynamo_export`` is the newest (still in beta) exporter based on the TorchDynamo technology released with PyTorch 2.0. + * ``torch.onnx.export`` is based on TorchScript backend and has been available since PyTorch 1.2.0. In this tutorial, we describe how to convert a model defined -in PyTorch into the ONNX format using the TorchScript ``torch.onnx.export` ONNX exporter. +in PyTorch into the ONNX format using the TorchScript ``torch.onnx.export`` ONNX exporter. The exported model will be executed with ONNX Runtime. ONNX Runtime is a performance-focused engine for ONNX models, @@ -26,7 +26,7 @@ .. 
code-block:: bash %%bash - pip install onnxruntime + pip install onnx onnxruntime ONNX Runtime recommends using the latest stable runtime for PyTorch. @@ -107,7 +107,7 @@ def _initialize_weights(self): # Load pretrained model weights model_url = 'https://s3.amazonaws.com/pytorch/test_data/export/superres_epoch100-44c6958e.pth' -batch_size = 1 # just a random number +batch_size = 64 # just a random number # Initialize model with the pretrained weights map_location = lambda storage, loc: storage @@ -218,6 +218,32 @@ def to_numpy(tensor): # ONNX exporter, so please contact us in that case. # +###################################################################### +# Timing Comparison Between Models +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# + +###################################################################### +# Since ONNX models optimize for inference speed, running the same +# data on an ONNX model instead of a native pytorch model should result in an +# improvement of up to 2x. Improvement is more pronounced with higher batch sizes. 
+ + +import time + +x = torch.randn(batch_size, 1, 224, 224, requires_grad=True) + +start = time.time() +torch_out = torch_model(x) +end = time.time() +print(f"Inference of Pytorch model used {end - start} seconds") + +ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)} +start = time.time() +ort_outs = ort_session.run(None, ort_inputs) +end = time.time() +print(f"Inference of ONNX model used {end - start} seconds") + ###################################################################### # Running the model on an image using ONNX Runtime @@ -301,10 +327,20 @@ def to_numpy(tensor): # Save the image, we will compare this with the output image from mobile device final_img.save("./_static/img/cat_superres_with_ort.jpg") +# Save resized original image (without super-resolution) +img = transforms.Resize([img_out_y.size[0], img_out_y.size[1]])(img) +img.save("cat_resized.jpg") ###################################################################### +# Here is the comparison between the two images: +# +# .. figure:: /_static/img/cat_resized.jpg +# +# Low-resolution image +# # .. figure:: /_static/img/cat_superres_with_ort.jpg -# :alt: output\_cat +# +# Image after super-resolution # # # ONNX Runtime being a cross platform engine, you can run it across @@ -313,7 +349,7 @@ def to_numpy(tensor): # ONNX Runtime can also be deployed to the cloud for model inferencing # using Azure Machine Learning Services. More information `here `__. # -# More information about ONNX Runtime's performance `here `__. +# More information about ONNX Runtime's performance `here `__. # # # For more information about ONNX Runtime `here `__. diff --git a/advanced_source/torch-script-parallelism.rst b/advanced_source/torch-script-parallelism.rst index 5a2fd86e1f6..09900dbf0d3 100644 --- a/advanced_source/torch-script-parallelism.rst +++ b/advanced_source/torch-script-parallelism.rst @@ -1,6 +1,8 @@ Dynamic Parallelism in TorchScript ================================== +.. 
warning:: TorchScript is no longer in active development. + In this tutorial, we introduce the syntax for doing *dynamic inter-op parallelism* in TorchScript. This parallelism has the following properties: diff --git a/advanced_source/torch_script_custom_classes.rst b/advanced_source/torch_script_custom_classes.rst index cccb86ff4ce..8586a032aae 100644 --- a/advanced_source/torch_script_custom_classes.rst +++ b/advanced_source/torch_script_custom_classes.rst @@ -1,6 +1,8 @@ Extending TorchScript with Custom C++ Classes =============================================== +.. warning:: TorchScript is no longer in active development. + This tutorial is a follow-on to the :doc:`custom operator ` tutorial, and introduces the API we've built for binding C++ classes into TorchScript diff --git a/advanced_source/torch_script_custom_ops.rst b/advanced_source/torch_script_custom_ops.rst index 55497d5defa..0a0e6e2bd70 100644 --- a/advanced_source/torch_script_custom_ops.rst +++ b/advanced_source/torch_script_custom_ops.rst @@ -1,6 +1,11 @@ Extending TorchScript with Custom C++ Operators =============================================== +.. warning:: + + This tutorial is deprecated as of PyTorch 2.4. Please see :ref:`custom-ops-landing-page` + for the newest up-to-date guides on PyTorch Custom Operators. + The PyTorch 1.0 release introduced a new programming model to PyTorch called `TorchScript `_. TorchScript is a subset of the Python programming language which can be parsed, compiled and diff --git a/advanced_source/usb_semisup_learn.py b/advanced_source/usb_semisup_learn.py new file mode 100644 index 00000000000..4ea6f621ab7 --- /dev/null +++ b/advanced_source/usb_semisup_learn.py @@ -0,0 +1,253 @@ +""" +Semi-Supervised Learning using USB built upon PyTorch +===================================================== + +**Author**: `Hao Chen `_ + +Unified Semi-supervised learning Benchmark (USB) is a semi-supervised +learning (SSL) framework built upon PyTorch. 
+Based on Datasets and Modules provided by PyTorch, USB becomes a flexible, +modular, and easy-to-use framework for semi-supervised learning. +It supports a variety of semi-supervised learning algorithms, including +``FixMatch``, ``FreeMatch``, ``DeFixMatch``, ``SoftMatch``, and so on. +It also supports a variety of imbalanced semi-supervised learning algorithms. +The benchmark results across different datasets of computer vision, natural +language processing, and speech processing are included in USB. + +This tutorial will walk you through the basics of using the USB lighting +package. +Let's get started by training a ``FreeMatch``/``SoftMatch`` model on +CIFAR-10 using pretrained Vision Transformers (ViT)! +And we will show it is easy to change the semi-supervised algorithm and train +on imbalanced datasets. + + +.. figure:: /_static/img/usb_semisup_learn/code.png + :alt: USB framework illustration +""" + + +###################################################################### +# Introduction to ``FreeMatch`` and ``SoftMatch`` in Semi-Supervised Learning +# --------------------------------------------------------------------------- +# +# Here we provide a brief introduction to ``FreeMatch`` and ``SoftMatch``. +# First, we introduce a famous baseline for semi-supervised learning called ``FixMatch``. +# ``FixMatch`` is a very simple framework for semi-supervised learning, where it +# utilizes a strong augmentation to generate pseudo labels for unlabeled data. +# It adopts a confidence thresholding strategy to filter out the low-confidence +# pseudo labels with a fixed threshold set. +# ``FreeMatch`` and ``SoftMatch`` are two algorithms that improve upon ``FixMatch``. +# ``FreeMatch`` proposes adaptive thresholding strategy to replace the fixed +# thresholding strategy in ``FixMatch``. The adaptive thresholding progressively +# increases the threshold according to the learning status of the model on each +# class. 
``SoftMatch`` absorbs the idea of confidence thresholding as a +# weighting mechanism. It proposes a Gaussian weighting mechanism to overcome +# the quantity-quality trade-off in pseudo-labels. In this tutorial, we will +# use USB to train ``FreeMatch`` and ``SoftMatch``. + + +###################################################################### +# Use USB to Train ``FreeMatch``/``SoftMatch`` on CIFAR-10 with only 40 labels +# ---------------------------------------------------------------------------- +# +# USB is easy to use and extend, affordable to small groups, and comprehensive +# for developing and evaluating SSL algorithms. +# USB provides the implementation of 14 SSL algorithms based on Consistency +# Regularization, and 15 tasks for evaluation from CV, NLP, and Audio domains. +# It has a modular design that allows users to easily extend the package by +# adding new algorithms and tasks. +# It also supports a Python API for easier adaptation to different SSL +# algorithms on new data. +# +# +# Now, let's use USB to train ``FreeMatch`` and ``SoftMatch`` on CIFAR-10. +# First, we need to install the USB package ``semilearn`` and import the necessary API +# functions from USB. +# If you are running this in Google Colab, install ``semilearn`` by running: +# ``!pip install semilearn``. 
+# +# Below is a list of functions we will use from ``semilearn``: +# +# - ``get_dataset`` to load dataset, here we use CIFAR-10 +# - ``get_data_loader`` to create train (labeled and unlabeled) and test data +# loaders, the train unlabeled loaders will provide both strong and weak +# augmentation of unlabeled data +# - ``get_net_builder`` to create a model, here we use pretrained ViT +# - ``get_algorithm`` to create the semi-supervised learning algorithm, +# here we use ``FreeMatch`` and ``SoftMatch`` +# - ``get_config``: to get default configuration of the algorithm +# - ``Trainer``: a Trainer class for training and evaluating the +# algorithm on dataset +# +# Note that a CUDA-enabled backend is required for training with the ``semilearn`` package. +# See `Enabling CUDA in Google Colab `__ for instructions +# on enabling CUDA in Google Colab. +# +import semilearn +from semilearn import get_dataset, get_data_loader, get_net_builder, get_algorithm, get_config, Trainer + +###################################################################### +# After importing necessary functions, we first set the hyper-parameters of the +# algorithm. 
+# +config = { + 'algorithm': 'freematch', + 'net': 'vit_tiny_patch2_32', + 'use_pretrain': True, + 'pretrain_path': 'https://github.com/microsoft/Semi-supervised-learning/releases/download/v.0.0.0/vit_tiny_patch2_32_mlp_im_1k_32.pth', + + # optimization configs + 'epoch': 1, + 'num_train_iter': 500, + 'num_eval_iter': 500, + 'num_log_iter': 50, + 'optim': 'AdamW', + 'lr': 5e-4, + 'layer_decay': 0.5, + 'batch_size': 16, + 'eval_batch_size': 16, + + + # dataset configs + 'dataset': 'cifar10', + 'num_labels': 40, + 'num_classes': 10, + 'img_size': 32, + 'crop_ratio': 0.875, + 'data_dir': './data', + 'ulb_samples_per_class': None, + + # algorithm specific configs + 'hard_label': True, + 'T': 0.5, + 'ema_p': 0.999, + 'ent_loss_ratio': 0.001, + 'uratio': 2, + 'ulb_loss_ratio': 1.0, + + # device configs + 'gpu': 0, + 'world_size': 1, + 'distributed': False, + "num_workers": 4, +} +config = get_config(config) + + +###################################################################### +# Then, we load the dataset and create data loaders for training and testing. +# And we specify the model and algorithm to use. +# +dataset_dict = get_dataset(config, config.algorithm, config.dataset, config.num_labels, config.num_classes, data_dir=config.data_dir, include_lb_to_ulb=config.include_lb_to_ulb) +train_lb_loader = get_data_loader(config, dataset_dict['train_lb'], config.batch_size) +train_ulb_loader = get_data_loader(config, dataset_dict['train_ulb'], int(config.batch_size * config.uratio)) +eval_loader = get_data_loader(config, dataset_dict['eval'], config.eval_batch_size) +algorithm = get_algorithm(config, get_net_builder(config.net, from_name=False), tb_log=None, logger=None) + + +###################################################################### +# We can start training the algorithms on CIFAR-10 with 40 labels now. +# We train for 500 iterations and evaluate every 500 iterations. 
+# +trainer = Trainer(config, algorithm) +trainer.fit(train_lb_loader, train_ulb_loader, eval_loader) + + +###################################################################### +# Finally, let's evaluate the trained model on the validation set. +# After training 500 iterations with ``FreeMatch`` on only 40 labels of +# CIFAR-10, we obtain a classifier that achieves around 87% accuracy on the validation set. +trainer.evaluate(eval_loader) + + + +###################################################################### +# Use USB to Train ``SoftMatch`` with specific imbalanced algorithm on imbalanced CIFAR-10 +# ---------------------------------------------------------------------------------------- +# +# Now let's say we have imbalanced labeled set and unlabeled set of CIFAR-10, +# and we want to train a ``SoftMatch`` model on it. +# We create an imbalanced labeled set and imbalanced unlabeled set of CIFAR-10, +# by setting the ``lb_imb_ratio`` and ``ulb_imb_ratio`` to 10. +# Also, we replace the ``algorithm`` with ``softmatch`` and set the ``imbalanced`` +# to ``True``. 
+# +config = { + 'algorithm': 'softmatch', + 'net': 'vit_tiny_patch2_32', + 'use_pretrain': True, + 'pretrain_path': 'https://github.com/microsoft/Semi-supervised-learning/releases/download/v.0.0.0/vit_tiny_patch2_32_mlp_im_1k_32.pth', + + # optimization configs + 'epoch': 1, + 'num_train_iter': 500, + 'num_eval_iter': 500, + 'num_log_iter': 50, + 'optim': 'AdamW', + 'lr': 5e-4, + 'layer_decay': 0.5, + 'batch_size': 16, + 'eval_batch_size': 16, + + + # dataset configs + 'dataset': 'cifar10', + 'num_labels': 1500, + 'num_classes': 10, + 'img_size': 32, + 'crop_ratio': 0.875, + 'data_dir': './data', + 'ulb_samples_per_class': None, + 'lb_imb_ratio': 10, + 'ulb_imb_ratio': 10, + 'ulb_num_labels': 3000, + + # algorithm specific configs + 'hard_label': True, + 'T': 0.5, + 'ema_p': 0.999, + 'ent_loss_ratio': 0.001, + 'uratio': 2, + 'ulb_loss_ratio': 1.0, + + # device configs + 'gpu': 0, + 'world_size': 1, + 'distributed': False, + "num_workers": 4, +} +config = get_config(config) + +###################################################################### +# Then, we re-load the dataset and create data loaders for training and testing. +# And we specify the model and algorithm to use. +# +dataset_dict = get_dataset(config, config.algorithm, config.dataset, config.num_labels, config.num_classes, data_dir=config.data_dir, include_lb_to_ulb=config.include_lb_to_ulb) +train_lb_loader = get_data_loader(config, dataset_dict['train_lb'], config.batch_size) +train_ulb_loader = get_data_loader(config, dataset_dict['train_ulb'], int(config.batch_size * config.uratio)) +eval_loader = get_data_loader(config, dataset_dict['eval'], config.eval_batch_size) +algorithm = get_algorithm(config, get_net_builder(config.net, from_name=False), tb_log=None, logger=None) + + +###################################################################### +# We can start training the algorithm on the imbalanced CIFAR-10 dataset now. +# We train for 500 iterations and evaluate every 500 iterations. 
+# +trainer = Trainer(config, algorithm) +trainer.fit(train_lb_loader, train_ulb_loader, eval_loader) + + +###################################################################### +# Finally, let's evaluate the trained model on the validation set. +# +trainer.evaluate(eval_loader) + + + +###################################################################### +# References: +# - [1] USB: https://github.com/microsoft/Semi-supervised-learning +# - [2] Kihyuk Sohn et al. FixMatch: Simplifying Semi-Supervised Learning with Consistency and Confidence +# - [3] Yidong Wang et al. FreeMatch: Self-adaptive Thresholding for Semi-supervised Learning +# - [4] Hao Chen et al. SoftMatch: Addressing the Quantity-Quality Trade-off in Semi-supervised Learning diff --git a/beginner_source/Intro_to_TorchScript_tutorial.py b/beginner_source/Intro_to_TorchScript_tutorial.py index 21ee32ff384..54799229342 100644 --- a/beginner_source/Intro_to_TorchScript_tutorial.py +++ b/beginner_source/Intro_to_TorchScript_tutorial.py @@ -4,6 +4,8 @@ **Authors:** James Reed (jamesreed@fb.com), Michael Suo (suo@fb.com), rev2 +.. warning:: TorchScript is no longer in active development. + This tutorial is an introduction to TorchScript, an intermediate representation of a PyTorch model (subclass of ``nn.Module``) that can then be run in a high-performance environment such as C++. 
diff --git a/beginner_source/PyTorch Cheat.md b/beginner_source/PyTorch Cheat.md index db3252b71ed..4f7af63038c 100644 --- a/beginner_source/PyTorch Cheat.md +++ b/beginner_source/PyTorch Cheat.md @@ -102,7 +102,7 @@ See [math operations](https://pytorch.org/docs/stable/torch.html?highlight=mm#ma ### GPU Usage ``` -torch.cuda.is_available # check for cuda +torch.cuda.is_available() # check for cuda x.cuda() # move x's data from CPU to GPU and return new object x.cpu() # move x's data from GPU to CPU and return new object diff --git a/beginner_source/README.txt b/beginner_source/README.txt index 5017c80b86b..14f3b3047e9 100644 --- a/beginner_source/README.txt +++ b/beginner_source/README.txt @@ -20,7 +20,3 @@ Beginner Tutorials 5. nlp/* and deep_learning_nlp_tutorial.rst Deep Learning for NLP with Pytorch https://pytorch.org/tutorials/beginner/deep_learning_nlp_tutorial.html - -6. transformer_translation.py - Language Translation with Transformers - https://pytorch.org/tutorials/beginner/translation_transformer.html diff --git a/beginner_source/basics/autogradqs_tutorial.py b/beginner_source/basics/autogradqs_tutorial.py index d8b53d6175b..8eff127ddee 100644 --- a/beginner_source/basics/autogradqs_tutorial.py +++ b/beginner_source/basics/autogradqs_tutorial.py @@ -10,7 +10,7 @@ `Save & Load Model `_ Automatic Differentiation with ``torch.autograd`` -======================================= +================================================= When training neural networks, the most frequently used algorithm is **back propagation**. In this algorithm, parameters (model weights) are @@ -170,7 +170,7 @@ ###################################################################### # Optional Reading: Tensor Gradients and Jacobian Products -# -------------------------------------- +# -------------------------------------------------------- # # In many cases, we have a scalar loss function, and we need to compute # the gradient with respect to some parameters. 
However, there are cases diff --git a/beginner_source/basics/buildmodel_tutorial.py b/beginner_source/basics/buildmodel_tutorial.py index cae5c99134a..987bc7c44a2 100644 --- a/beginner_source/basics/buildmodel_tutorial.py +++ b/beginner_source/basics/buildmodel_tutorial.py @@ -10,7 +10,7 @@ `Save & Load Model `_ Build the Neural Network -=================== +======================== Neural networks comprise of layers/modules that perform operations on data. The `torch.nn `_ namespace provides all the building blocks you need to @@ -197,5 +197,5 @@ def forward(self, x): ################################################################# # Further Reading -# -------------- +# ----------------- # - `torch.nn API `_ diff --git a/beginner_source/basics/data_tutorial.py b/beginner_source/basics/data_tutorial.py index 0ef1fb6b777..561e9723fde 100644 --- a/beginner_source/basics/data_tutorial.py +++ b/beginner_source/basics/data_tutorial.py @@ -10,7 +10,7 @@ `Save & Load Model `_ Datasets & DataLoaders -=================== +====================== """ @@ -69,7 +69,7 @@ ################################################################# # Iterating and Visualizing the Dataset -# ----------------- +# ------------------------------------- # # We can index ``Datasets`` manually like a list: ``training_data[index]``. # We use ``matplotlib`` to visualize some samples in our training data. @@ -144,7 +144,7 @@ def __getitem__(self, idx): ################################################################# -# __init__ +# ``__init__`` # ^^^^^^^^^^^^^^^^^^^^ # # The __init__ function is run once when instantiating the Dataset object. We initialize @@ -167,7 +167,7 @@ def __init__(self, annotations_file, img_dir, transform=None, target_transform=N ################################################################# -# __len__ +# ``__len__`` # ^^^^^^^^^^^^^^^^^^^^ # # The __len__ function returns the number of samples in our dataset. 
@@ -180,7 +180,7 @@ def __len__(self): ################################################################# -# __getitem__ +# ``__getitem__`` # ^^^^^^^^^^^^^^^^^^^^ # # The __getitem__ function loads and returns a sample from the dataset at the given index ``idx``. @@ -220,7 +220,7 @@ def __getitem__(self, idx): ########################### # Iterate through the DataLoader -# -------------------------- +# ------------------------------- # # We have loaded that dataset into the ``DataLoader`` and can iterate through the dataset as needed. # Each iteration below returns a batch of ``train_features`` and ``train_labels`` (containing ``batch_size=64`` features and labels respectively). @@ -243,5 +243,5 @@ def __getitem__(self, idx): ################################################################# # Further Reading -# -------------- +# ---------------- # - `torch.utils.data API `_ diff --git a/beginner_source/basics/intro.py b/beginner_source/basics/intro.py index b7369938643..6c048dcaecc 100644 --- a/beginner_source/basics/intro.py +++ b/beginner_source/basics/intro.py @@ -13,9 +13,9 @@ =================== Authors: -`Suraj Subramanian `_, +`Suraj Subramanian `_, `Seth Juarez `_, -`Cassie Breviu `_, +`Cassie Breviu `_, `Dmitry Soshnikov `_, `Ari Bornstein `_ @@ -31,7 +31,7 @@ Running the Tutorial Code ------------------- +------------------------- You can run this tutorial in a couple of ways: - **In the cloud**: This is the easiest way to get started! Each section has a "Run in Microsoft Learn" and "Run in Google Colab" link at the top, which opens an integrated notebook in Microsoft Learn or Google Colab, respectively, with the code in a fully-hosted environment. @@ -39,7 +39,7 @@ How to Use this Guide ------------------ +--------------------- If you're familiar with other deep learning frameworks, check out the `0. Quickstart `_ first to quickly familiarize yourself with PyTorch's API. @@ -49,6 +49,16 @@ .. include:: /beginner_source/basics/qs_toc.txt .. 
toctree:: + :maxdepth: 2 :hidden: + quickstart_tutorial + tensorqs_tutorial + data_tutorial + transforms_tutorial + buildmodel_tutorial + autogradqs_tutorial + optimization_tutorial + saveloadrun_tutorial + """ diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py index 93aed46161d..c6c327f8511 100644 --- a/beginner_source/basics/optimization_tutorial.py +++ b/beginner_source/basics/optimization_tutorial.py @@ -163,7 +163,7 @@ def train_loop(dataloader, model, loss_fn, optimizer): optimizer.zero_grad() if batch % 100 == 0: - loss, current = loss.item(), (batch + 1) * len(X) + loss, current = loss.item(), batch * batch_size + len(X) print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") diff --git a/beginner_source/basics/quickstart_tutorial.py b/beginner_source/basics/quickstart_tutorial.py index 07a1be517d1..df7628081ba 100644 --- a/beginner_source/basics/quickstart_tutorial.py +++ b/beginner_source/basics/quickstart_tutorial.py @@ -216,7 +216,7 @@ def test(dataloader, model, loss_fn): # the state dictionary into it. model = NeuralNetwork().to(device) -model.load_state_dict(torch.load("model.pth")) +model.load_state_dict(torch.load("model.pth", weights_only=True)) ############################################################# # This model can now be used to make predictions. diff --git a/beginner_source/basics/saveloadrun_tutorial.py b/beginner_source/basics/saveloadrun_tutorial.py index 8c50249a089..d8636f6a9de 100644 --- a/beginner_source/basics/saveloadrun_tutorial.py +++ b/beginner_source/basics/saveloadrun_tutorial.py @@ -32,9 +32,14 @@ ########################## # To load model weights, you need to create an instance of the same model first, and then load the parameters # using ``load_state_dict()`` method. +# +# In the code below, we set ``weights_only=True`` to limit the +# functions executed during unpickling to only those necessary for +# loading weights. 
Using ``weights_only=True`` is considered +# a best practice when loading weights. model = models.vgg16() # we do not specify ``weights``, i.e. create untrained model -model.load_state_dict(torch.load('model_weights.pth')) +model.load_state_dict(torch.load('model_weights.pth', weights_only=True)) model.eval() ########################### @@ -50,9 +55,14 @@ torch.save(model, 'model.pth') ######################## -# We can then load the model like this: +# We can then load the model as demonstrated below. +# +# As described in `Saving and loading torch.nn.Modules `_, +# saving ``state_dict`` is considered the best practice. However, +# below we use ``weights_only=False`` because this involves loading the +# model, which is a legacy use case for ``torch.save``. -model = torch.load('model.pth') +model = torch.load('model.pth', weights_only=False) ######################## # .. note:: This approach uses Python `pickle `_ module when serializing the model, thus it relies on the actual class definition to be available when loading the model. @@ -60,5 +70,5 @@ ####################### # Related Tutorials # ----------------- -# `Saving and Loading a General Checkpoint in PyTorch `_ -# `Tips for loading an `nn.Module` from a checkpoint `_ +# - `Saving and Loading a General Checkpoint in PyTorch `_ +# - `Tips for loading an nn.Module from a checkpoint `_ diff --git a/beginner_source/basics/tensorqs_tutorial.py b/beginner_source/basics/tensorqs_tutorial.py index 1a086fc5ad8..70a966d9f89 100644 --- a/beginner_source/basics/tensorqs_tutorial.py +++ b/beginner_source/basics/tensorqs_tutorial.py @@ -80,7 +80,7 @@ ###################################################################### # Attributes of a Tensor -# ~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~ # # Tensor attributes describe their shape, datatype, and the device on which they are stored. 
@@ -97,7 +97,7 @@ ###################################################################### # Operations on Tensors -# ~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~ # # Over 100 tensor operations, including arithmetic, linear algebra, matrix manipulation (transposing, # indexing, slicing), sampling and more are diff --git a/beginner_source/bettertransformer_tutorial.rst b/beginner_source/bettertransformer_tutorial.rst index 60ffa52ea83..76aebd839a0 100644 --- a/beginner_source/bettertransformer_tutorial.rst +++ b/beginner_source/bettertransformer_tutorial.rst @@ -1,251 +1,10 @@ Fast Transformer Inference with Better Transformer -=============================================================== +================================================== -**Author**: `Michael Gschwind `__ +This tutorial has been deprecated. -This tutorial introduces Better Transformer (BT) as part of the PyTorch 1.12 release. -In this tutorial, we show how to use Better Transformer for production -inference with torchtext. Better Transformer is a production ready fastpath to -accelerate deployment of Transformer models with high performance on CPU and GPU. -The fastpath feature works transparently for models based either directly on -PyTorch core ``nn.module`` or with torchtext. - -Models which can be accelerated by Better Transformer fastpath execution are those -using the following PyTorch core ``torch.nn.module`` classes ``TransformerEncoder``, -``TransformerEncoderLayer``, and ``MultiHeadAttention``. In addition, torchtext has -been updated to use the core library modules to benefit from fastpath acceleration. -(Additional modules may be enabled with fastpath execution in the future.) - -Better Transformer offers two types of acceleration: - -* Native multihead attention (MHA) implementation for CPU and GPU to improve overall execution efficiency. -* Exploiting sparsity in NLP inference. 
Because of variable input lengths, input - tokens may contain a large number of padding tokens for which processing may be - skipped, delivering significant speedups. - -Fastpath execution is subject to some criteria. Most importantly, the model -must be executed in inference mode and operate on input tensors that do not collect -gradient tape information (e.g., running with torch.no_grad). - -To follow this example in Google Colab, `click here -`__. - -Better Transformer Features in This Tutorial --------------------------------------------- - -* Load pretrained models (created before PyTorch version 1.12 without Better Transformer) -* Run and benchmark inference on CPU with and without BT fastpath (native MHA only) -* Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA only) -* Enable sparsity support -* Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA + sparsity) - -Additional Information ------------------------ -Additional information about Better Transformer may be found in the PyTorch.Org blog -`A Better Transformer for Fast Transformer Inference -`__. - - - -1. Setup - -1.1 Load pretrained models - -We download the XLM-R model from the predefined torchtext models by following the instructions in -`torchtext.models `__. We also set the DEVICE to execute -on-accelerator tests. (Enable GPU execution for your environment as appropriate.) - -.. 
code-block:: python - - import torch - import torch.nn as nn - - print(f"torch version: {torch.__version__}") - - DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - - print(f"torch cuda available: {torch.cuda.is_available()}") - - import torch, torchtext - from torchtext.models import RobertaClassificationHead - from torchtext.functional import to_tensor - xlmr_large = torchtext.models.XLMR_LARGE_ENCODER - classifier_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim = 1024) - model = xlmr_large.get_model(head=classifier_head) - transform = xlmr_large.transform() - -1.2 Dataset Setup - -We set up two types of inputs: a small input batch and a big input batch with sparsity. - -.. code-block:: python - - small_input_batch = [ - "Hello world", - "How are you!" - ] - big_input_batch = [ - "Hello world", - "How are you!", - """`Well, Prince, so Genoa and Lucca are now just family estates of the - Buonapartes. But I warn you, if you don't tell me that this means war, - if you still try to defend the infamies and horrors perpetrated by - that Antichrist- I really believe he is Antichrist- I will have - nothing more to do with you and you are no longer my friend, no longer - my 'faithful slave,' as you call yourself! But how do you do? I see - I have frightened you- sit down and tell me all the news.` - - It was in July, 1805, and the speaker was the well-known Anna - Pavlovna Scherer, maid of honor and favorite of the Empress Marya - Fedorovna. With these words she greeted Prince Vasili Kuragin, a man - of high rank and importance, who was the first to arrive at her - reception. Anna Pavlovna had had a cough for some days. She was, as - she said, suffering from la grippe; grippe being then a new word in - St. Petersburg, used only by the elite.""" - ] - -Next, we select either the small or large input batch, preprocess the inputs and test the model. - -.. 
code-block:: python - - input_batch=big_input_batch - - model_input = to_tensor(transform(input_batch), padding_value=1) - output = model(model_input) - output.shape - -Finally, we set the benchmark iteration count: - -.. code-block:: python - - ITERATIONS=10 - -2. Execution - -2.1 Run and benchmark inference on CPU with and without BT fastpath (native MHA only) - -We run the model on CPU, and collect profile information: - -* The first run uses traditional ("slow path") execution. -* The second run enables BT fastpath execution by putting the model in inference mode using `model.eval()` and disables gradient collection with `torch.no_grad()`. - -You can see an improvement (whose magnitude will depend on the CPU model) when the model is executing on CPU. Notice that the fastpath profile shows most of the execution time -in the native `TransformerEncoderLayer` implementation `aten::_transformer_encoder_layer_fwd`. - -.. code-block:: python - - print("slow path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=False) as prof: - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - model.eval() - - print("fast path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=False) as prof: - with torch.no_grad(): - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - -2.2 Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA only) - -We check the BT sparsity setting: - -.. code-block:: python - - model.encoder.transformer.layers.enable_nested_tensor - - -We disable the BT sparsity: - -.. code-block:: python - - model.encoder.transformer.layers.enable_nested_tensor=False - - -We run the model on DEVICE, and collect profile information for native MHA execution on DEVICE: - -* The first run uses traditional ("slow path") execution. 
-* The second run enables BT fastpath execution by putting the model in inference mode using `model.eval()` - and disables gradient collection with `torch.no_grad()`. - -When executing on a GPU, you should see a significant speedup, in particular for the small input batch setting: - -.. code-block:: python - - model.to(DEVICE) - model_input = model_input.to(DEVICE) - - print("slow path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - model.eval() - - print("fast path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - with torch.no_grad(): - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - -2.3 Run and benchmark inference on (configurable) DEVICE with and without BT fastpath (native MHA + sparsity) - -We enable sparsity support: - -.. code-block:: python - - model.encoder.transformer.layers.enable_nested_tensor = True - -We run the model on DEVICE, and collect profile information for native MHA and sparsity support execution on DEVICE: - -* The first run uses traditional ("slow path") execution. -* The second run enables BT fastpath execution by putting the model in inference mode using `model.eval()` and disables gradient collection with `torch.no_grad()`. - -When executing on a GPU, you should see a significant speedup, in particular for the large input batch setting which includes sparsity: - -.. 
code-block:: python - - model.to(DEVICE) - model_input = model_input.to(DEVICE) - - print("slow path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - model.eval() - - print("fast path:") - print("==========") - with torch.autograd.profiler.profile(use_cuda=True) as prof: - with torch.no_grad(): - for i in range(ITERATIONS): - output = model(model_input) - print(prof) - - -Summary -------- - -In this tutorial, we have introduced fast transformer inference with -Better Transformer fastpath execution in torchtext using PyTorch core -Better Transformer support for Transformer Encoder models. We have -demonstrated the use of Better Transformer with models trained prior to -the availability of BT fastpath execution. We have demonstrated and -benchmarked the use of both BT fastpath execution modes, native MHA execution -and BT sparsity acceleration. +Redirecting in 3 seconds... +.. raw:: html + diff --git a/beginner_source/blitz/autograd_tutorial.py b/beginner_source/blitz/autograd_tutorial.py index 3b33ffca9a8..b736b429eee 100644 --- a/beginner_source/blitz/autograd_tutorial.py +++ b/beginner_source/blitz/autograd_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ A Gentle Introduction to ``torch.autograd`` ---------------------------------- +=========================================== ``torch.autograd`` is PyTorch’s automatic differentiation engine that powers neural network training. In this section, you will get a conceptual @@ -149,7 +149,7 @@ ###################################################################### # Optional Reading - Vector Calculus using ``autograd`` -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # Mathematically, if you have a vector valued function # :math:`\vec{y}=f(\vec{x})`, then the gradient of :math:`\vec{y}` with @@ -191,7 +191,7 @@ # .. 
math:: # # -# J^{T}\cdot \vec{v}=\left(\begin{array}{ccc} +# J^{T}\cdot \vec{v} = \left(\begin{array}{ccc} # \frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{1}}\\ # \vdots & \ddots & \vdots\\ # \frac{\partial y_{1}}{\partial x_{n}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}} @@ -199,7 +199,7 @@ # \frac{\partial l}{\partial y_{1}}\\ # \vdots\\ # \frac{\partial l}{\partial y_{m}} -# \end{array}\right)=\left(\begin{array}{c} +# \end{array}\right) = \left(\begin{array}{c} # \frac{\partial l}{\partial x_{1}}\\ # \vdots\\ # \frac{\partial l}{\partial x_{n}} @@ -207,7 +207,6 @@ # # This characteristic of vector-Jacobian product is what we use in the above example; # ``external_grad`` represents :math:`\vec{v}`. -# @@ -266,7 +265,7 @@ z = torch.rand((5, 5), requires_grad=True) a = x + y -print(f"Does `a` require gradients? : {a.requires_grad}") +print(f"Does `a` require gradients?: {a.requires_grad}") b = x + z print(f"Does `b` require gradients?: {b.requires_grad}") @@ -321,3 +320,4 @@ # # - `In-place operations & Multithreaded Autograd `__ # - `Example implementation of reverse-mode autodiff `__ +# - `Video: PyTorch Autograd Explained - In-depth Tutorial `__ diff --git a/beginner_source/blitz/cifar10_tutorial.py b/beginner_source/blitz/cifar10_tutorial.py index 5a9cde3f105..39dee479626 100644 --- a/beginner_source/blitz/cifar10_tutorial.py +++ b/beginner_source/blitz/cifar10_tutorial.py @@ -115,7 +115,7 @@ def imshow(img): ######################################################################## # 2. Define a Convolutional Neural Network -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # Copy the neural network from the Neural Networks section before and modify it to # take 3-channel images (instead of 1-channel images as it was defined). 
@@ -221,7 +221,7 @@ def forward(self, x): # wasn't necessary here, we only did it to illustrate how to do so): net = Net() -net.load_state_dict(torch.load(PATH)) +net.load_state_dict(torch.load(PATH, weights_only=True)) ######################################################################## # Okay, now let us see what the neural network thinks these examples above are: @@ -252,7 +252,7 @@ def forward(self, x): # calculate outputs by running images through the network outputs = net(images) # the class with the highest energy is what we choose as prediction - _, predicted = torch.max(outputs.data, 1) + _, predicted = torch.max(outputs, 1) total += labels.size(0) correct += (predicted == labels).sum().item() diff --git a/beginner_source/blitz/neural_networks_tutorial.py b/beginner_source/blitz/neural_networks_tutorial.py index 3b3c95fd229..9c04d9af0f3 100644 --- a/beginner_source/blitz/neural_networks_tutorial.py +++ b/beginner_source/blitz/neural_networks_tutorial.py @@ -55,16 +55,33 @@ def __init__(self): self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) - def forward(self, x): - # Max pooling over a (2, 2) window - x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) - # If the size is a square, you can specify with a single number - x = F.max_pool2d(F.relu(self.conv2(x)), 2) - x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x + def forward(self, input): + # Convolution layer C1: 1 input image channel, 6 output channels, + # 5x5 square convolution, it uses RELU activation function, and + # outputs a Tensor with size (N, 6, 28, 28), where N is the size of the batch + c1 = F.relu(self.conv1(input)) + # Subsampling layer S2: 2x2 grid, purely functional, + # this layer does not have any parameter, and outputs a (N, 6, 14, 14) Tensor + s2 = F.max_pool2d(c1, (2, 2)) + # Convolution layer C3: 6 input channels, 16 output channels, + # 5x5 square convolution, it 
uses RELU activation function, and + # outputs a (N, 16, 10, 10) Tensor + c3 = F.relu(self.conv2(s2)) + # Subsampling layer S4: 2x2 grid, purely functional, + # this layer does not have any parameter, and outputs a (N, 16, 5, 5) Tensor + s4 = F.max_pool2d(c3, 2) + # Flatten operation: purely functional, outputs a (N, 400) Tensor + s4 = torch.flatten(s4, 1) + # Fully connected layer F5: (N, 400) Tensor input, + # and outputs a (N, 120) Tensor, it uses RELU activation function + f5 = F.relu(self.fc1(s4)) + # Fully connected layer F6: (N, 120) Tensor input, + # and outputs a (N, 84) Tensor, it uses RELU activation function + f6 = F.relu(self.fc2(f5)) + # Gaussian layer OUTPUT: (N, 84) Tensor input, and + # outputs a (N, 10) Tensor + output = self.fc3(f6) + return output net = Net() @@ -161,7 +178,7 @@ def forward(self, x): # ``.grad_fn`` attribute, you will see a graph of computations that looks # like this: # -# :: +# .. code-block:: sh # # input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d # -> flatten -> linear -> relu -> linear -> relu -> linear @@ -253,7 +270,7 @@ def forward(self, x): ############################################################### -# .. Note:: +# .. note:: # # Observe how gradient buffers had to be manually set to zero using # ``optimizer.zero_grad()``. This is because gradients are accumulated diff --git a/beginner_source/blitz/tensor_tutorial.py b/beginner_source/blitz/tensor_tutorial.py index 5219ad4ee43..ac54945bc3a 100644 --- a/beginner_source/blitz/tensor_tutorial.py +++ b/beginner_source/blitz/tensor_tutorial.py @@ -1,6 +1,6 @@ """ Tensors --------------------------------------------- +======== Tensors are a specialized data structure that are very similar to arrays and matrices. 
In PyTorch, we use tensors to encode the inputs and diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py index 44310cc3620..ee274866895 100644 --- a/beginner_source/chatbot_tutorial.py +++ b/beginner_source/chatbot_tutorial.py @@ -84,8 +84,7 @@ # Preparations # ------------ # -# To start, Download the data ZIP file -# `here `__ +# To get started, `download `__ the Movie-Dialogs Corpus zip file # and put in a ``data/`` directory under the current directory. # @@ -1129,7 +1128,7 @@ def forward(self, input_seq, input_length, max_length): # Forward input through encoder model encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length) # Prepare encoder's final hidden layer to be first hidden input to the decoder - decoder_hidden = encoder_hidden[:decoder.n_layers] + decoder_hidden = encoder_hidden[:self.decoder.n_layers] # Initialize decoder input with SOS_token decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token # Initialize tensors to append decoded words to diff --git a/beginner_source/colab.rst b/beginner_source/colab.rst index 16d1b80e81e..812255704e7 100644 --- a/beginner_source/colab.rst +++ b/beginner_source/colab.rst @@ -93,3 +93,11 @@ Hopefully this example will give you a good starting point for running some of the more complex tutorials in Colab. As we evolve our use of Colab on the PyTorch tutorials site, we'll look at ways to make this easier for users. + +Enabling CUDA +~~~~~~~~~~~~~~~~ +Some tutorials require a CUDA-enabled device (NVIDIA GPU), which involves +changing the Runtime type prior to executing the tutorial. +To change the Runtime in Google Colab, on the top drop-down menu select **Runtime**, +then select **Change runtime type**. Under **Hardware accelerator**, select ``T4 GPU``, +then click ``Save``. 
diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py index 7c1c3487cb6..ab9de0d7d73 100644 --- a/beginner_source/data_loading_tutorial.py +++ b/beginner_source/data_loading_tutorial.py @@ -50,9 +50,9 @@ # estimation `__ # on a few images from imagenet tagged as 'face'. # -# Dataset comes with a csv file with annotations which looks like this: +# Dataset comes with a ``.csv`` file with annotations which looks like this: # -# :: +# .. code-block:: sh # # image_name,part_0_x,part_0_y,part_1_x,part_1_y,part_2_x, ... ,part_67_x,part_67_y # 0805personali01.jpg,27,83,27,98, ... 84,134 @@ -196,7 +196,7 @@ def __getitem__(self, idx): # called. For this, we just need to implement ``__call__`` method and # if required, ``__init__`` method. We can then use a transform like this: # -# :: +# .. code-block:: python # # tsfm = Transform(params) # transformed_sample = tsfm(sample) @@ -421,7 +421,9 @@ def show_landmarks_batch(sample_batched): # and dataloader. ``torchvision`` package provides some common datasets and # transforms. You might not even have to write custom classes. One of the # more generic datasets available in torchvision is ``ImageFolder``. -# It assumes that images are organized in the following way: :: +# It assumes that images are organized in the following way: +# +# .. code-block:: sh # # root/ants/xxx.png # root/ants/xxy.jpeg @@ -435,7 +437,9 @@ def show_landmarks_batch(sample_batched): # # where 'ants', 'bees' etc. are class labels. Similarly generic transforms # which operate on ``PIL.Image`` like ``RandomHorizontalFlip``, ``Scale``, -# are also available. You can use these to write a dataloader like this: :: +# are also available. You can use these to write a dataloader like this: +# +# .. 
code-block:: python # # import torch # from torchvision import transforms, datasets diff --git a/beginner_source/dcgan_faces_tutorial.py b/beginner_source/dcgan_faces_tutorial.py index 1a1f9c38606..e9ac3fdd504 100644 --- a/beginner_source/dcgan_faces_tutorial.py +++ b/beginner_source/dcgan_faces_tutorial.py @@ -226,7 +226,7 @@ # the ``celeba`` directory you just created. The resulting directory # structure should be: # -# :: +# .. code-block:: sh # # /path/to/celeba # -> img_align_celeba @@ -265,7 +265,7 @@ plt.axis("off") plt.title("Training Images") plt.imshow(np.transpose(vutils.make_grid(real_batch[0].to(device)[:64], padding=2, normalize=True).cpu(),(1,2,0))) - +plt.show() ###################################################################### diff --git a/beginner_source/ddp_series_fault_tolerance.rst b/beginner_source/ddp_series_fault_tolerance.rst index 95da10525a8..27fe7e273e7 100644 --- a/beginner_source/ddp_series_fault_tolerance.rst +++ b/beginner_source/ddp_series_fault_tolerance.rst @@ -9,16 +9,17 @@ Fault-tolerant Distributed Training with ``torchrun`` ===================================================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites :margin: 0 - - Launching multi-GPU training jobs with ``torchrun`` - - Saving and loading snapshots of your training job - - Structuring your training script for graceful restarts + - Launching multi-GPU training jobs with ``torchrun`` + - Saving and loading snapshots of your training job + - Structuring your training script for graceful restarts .. grid:: 1 @@ -27,6 +28,7 @@ Authors: `Suraj Subramanian `__ :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub `__ .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites :margin: 0 * High-level `overview `__ of DDP @@ -93,11 +95,7 @@ In elastic training, whenever there are any membership changes (adding or removi on available devices. Having this structure ensures your training job can continue without manual intervention. - - - Diff for `multigpu.py `__ v/s `multigpu_torchrun.py `__ ------------------------------------------------------------ Process group initialization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/beginner_source/ddp_series_intro.rst b/beginner_source/ddp_series_intro.rst index 527a3cc1ce0..9aee5d8a5df 100644 --- a/beginner_source/ddp_series_intro.rst +++ b/beginner_source/ddp_series_intro.rst @@ -7,7 +7,7 @@ Distributed Data Parallel in PyTorch - Video Tutorials ====================================================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ Follow along with the video below or on `youtube `__. diff --git a/beginner_source/ddp_series_multigpu.rst b/beginner_source/ddp_series_multigpu.rst index a5eb60708d2..ef6549d4de0 100644 --- a/beginner_source/ddp_series_multigpu.rst +++ b/beginner_source/ddp_series_multigpu.rst @@ -9,23 +9,25 @@ Multi GPU training with DDP =========================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites - How to migrate a single-GPU training script to multi-GPU via DDP - Setting up the distributed process group - Saving and loading models in a distributed setup - + .. grid:: 1 .. grid-item:: :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub `__ - + .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites * High-level overview of `how DDP works `__ * A machine with multiple GPUs (this tutorial uses an AWS p3.8xlarge instance) @@ -43,42 +45,43 @@ In the `previous tutorial `__, we got a high-level overv In this tutorial, we start with a single-GPU training script and migrate that to running it on 4 GPUs on a single node. Along the way, we will talk through important concepts in distributed training while implementing them in our code. -.. note:: +.. note:: If your model contains any ``BatchNorm`` layers, it needs to be converted to ``SyncBatchNorm`` to sync the running stats of ``BatchNorm`` layers across replicas. - Use the helper function + Use the helper function `torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) `__ to convert all ``BatchNorm`` layers in the model to ``SyncBatchNorm``. Diff for `single_gpu.py `__ v/s `multigpu.py `__ ----------------------------------------------------- These are the changes you typically make to a single-GPU training script to enable DDP. Imports -~~~~~~~ +------- - ``torch.multiprocessing`` is a PyTorch wrapper around Python's native multiprocessing - The distributed process group contains all the processes that can communicate and synchronize with each other. -.. code-block:: diff +.. 
code-block:: python - import torch - import torch.nn.functional as F - from utils import MyTrainDataset + import torch + import torch.nn.functional as F + from utils import MyTrainDataset - + import torch.multiprocessing as mp - + from torch.utils.data.distributed import DistributedSampler - + from torch.nn.parallel import DistributedDataParallel as DDP - + from torch.distributed import init_process_group, destroy_process_group - + import os + import torch.multiprocessing as mp + from torch.utils.data.distributed import DistributedSampler + from torch.nn.parallel import DistributedDataParallel as DDP + from torch.distributed import init_process_group, destroy_process_group + import os Constructing the process group -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------ +- First, before initializing the process group, call `set_device `__, + which sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on ``GPU:0`` - The process group can be initialized by TCP (default) or from a shared file-system. Read more on `process group initialization `__ @@ -86,68 +89,67 @@ Constructing the process group initializes the distributed process group. - Read more about `choosing a DDP backend `__ -- `set_device `__ - sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on `GPU:0` -.. 
code-block:: python + + def ddp_setup(rank: int, world_size: int): + """ + Args: + rank: Unique identifier of each process + world_size: Total number of processes + """ + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + torch.cuda.set_device(rank) + init_process_group(backend="nccl", rank=rank, world_size=world_size) - + def ddp_setup(rank: int, world_size: int): - + """ - + Args: - + rank: Unique identifier of each process - + world_size: Total number of processes - + """ - + os.environ["MASTER_ADDR"] = "localhost" - + os.environ["MASTER_PORT"] = "12355" - + init_process_group(backend="nccl", rank=rank, world_size=world_size) - + torch.cuda.set_device(rank) Constructing the DDP model -~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------------- -.. code-block:: diff +.. code-block:: python - - self.model = model.to(gpu_id) - + self.model = DDP(model, device_ids=[gpu_id]) + self.model = DDP(model, device_ids=[gpu_id]) Distributing input data -~~~~~~~~~~~~~~~~~~~~~~~ +----------------------- - `DistributedSampler `__ chunks the input data across all distributed processes. +- The `DataLoader `__ combines a dataset and a + sampler, and provides an iterable over the given dataset. - Each process will receive an input batch of 32 samples; the effective batch size is ``32 * nprocs``, or 128 when using 4 GPUs. -.. code-block:: diff +.. code-block:: python train_data = torch.utils.data.DataLoader( dataset=train_dataset, batch_size=32, - - shuffle=True, - + shuffle=False, - + sampler=DistributedSampler(train_dataset), + shuffle=False, # We don't shuffle + sampler=DistributedSampler(train_dataset), # Use the Distributed Sampler here. ) -- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work +- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work properly across multiple epochs. 
Otherwise, the same ordering will be used in each epoch. -.. code-block:: diff +.. code-block:: python def _run_epoch(self, epoch): b_sz = len(next(iter(self.train_data))[0]) - + self.train_data.sampler.set_epoch(epoch) + self.train_data.sampler.set_epoch(epoch) # call this additional line at every epoch for source, targets in self.train_data: ... self._run_batch(source, targets) Saving model checkpoints -~~~~~~~~~~~~~~~~~~~~~~~~ -- We only need to save model checkpoints from one process. Without this +------------------------ +- We only need to save model checkpoints from one process. Without this condition, each process would save its copy of the identical mode. Read more on saving and loading models with - DDP `here `__ + DDP `here `__ .. code-block:: diff @@ -162,18 +164,18 @@ Saving model checkpoints .. warning:: `Collective calls `__ are functions that run on all the distributed processes, and they are used to gather certain states or values to a specific process. Collective calls require all ranks to run the collective code. - In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process. + In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process. If you need to make any collective calls, it should be before the ``if self.gpu_id == 0`` check. Running the distributed training job -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ - Include new arguments ``rank`` (replacing ``device``) and ``world_size``. - ``rank`` is auto-allocated by DDP when calling `mp.spawn `__. -- ``world_size`` is the number of processes across the training job. For GPU training, +- ``world_size`` is the number of processes across the training job. For GPU training, this corresponds to the number of GPUs in use, and each process works on a dedicated GPU. .. 
code-block:: diff @@ -187,7 +189,7 @@ Running the distributed training job + trainer = Trainer(model, train_data, optimizer, rank, save_every) trainer.train(total_epochs) + destroy_process_group() - + if __name__ == "__main__": import sys total_epochs = int(sys.argv[1]) @@ -197,6 +199,24 @@ Running the distributed training job + world_size = torch.cuda.device_count() + mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size) +Here's what the code looks like: + +.. code-block:: python + def main(rank, world_size, total_epochs, save_every): + ddp_setup(rank, world_size) + dataset, model, optimizer = load_train_objs() + train_data = prepare_dataloader(dataset, batch_size=32) + trainer = Trainer(model, train_data, optimizer, rank, save_every) + trainer.train(total_epochs) + destroy_process_group() + + if __name__ == "__main__": + import sys + total_epochs = int(sys.argv[1]) + save_every = int(sys.argv[2]) + world_size = torch.cuda.device_count() + mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size) + Further Reading @@ -204,6 +224,6 @@ Further Reading - `Fault Tolerant distributed training `__ (next tutorial in this series) - `Intro to DDP `__ (previous tutorial in this series) -- `Getting Started with DDP `__ +- `Getting Started with DDP `__ - `Process Group - initialization `__ + Initialization `__ diff --git a/beginner_source/ddp_series_theory.rst b/beginner_source/ddp_series_theory.rst index 76083b2e343..ade98d9f01c 100644 --- a/beginner_source/ddp_series_theory.rst +++ b/beginner_source/ddp_series_theory.rst @@ -7,11 +7,12 @@ What is Distributed Data Parallel (DDP) ======================================= -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites * How DDP works under the hood * What is ``DistributedSampler`` @@ -19,6 +20,7 @@ Authors: `Suraj Subramanian `__ .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites * Familiarity with `basic non-distributed training `__ in PyTorch diff --git a/beginner_source/deep_learning_nlp_tutorial.rst b/beginner_source/deep_learning_nlp_tutorial.rst deleted file mode 100644 index 7063bc7d8a5..00000000000 --- a/beginner_source/deep_learning_nlp_tutorial.rst +++ /dev/null @@ -1,56 +0,0 @@ -Deep Learning for NLP with Pytorch -********************************** -**Author**: `Robert Guthrie `_ - -This tutorial will walk you through the key ideas of deep learning -programming using Pytorch. Many of the concepts (such as the computation -graph abstraction and autograd) are not unique to Pytorch and are -relevant to any deep learning toolkit out there. - -I am writing this tutorial to focus specifically on NLP for people who -have never written code in any deep learning framework (e.g, TensorFlow, -Theano, Keras, DyNet). It assumes working knowledge of core NLP -problems: part-of-speech tagging, language modeling, etc. It also -assumes familiarity with neural networks at the level of an intro AI -class (such as one from the Russel and Norvig book). Usually, these -courses cover the basic backpropagation algorithm on feed-forward neural -networks, and make the point that they are chains of compositions of -linearities and non-linearities. This tutorial aims to get you started -writing deep learning code, given you have this prerequisite knowledge. - -Note this is about *models*, not data. For all of the models, I just -create a few test examples with small dimensionality so you can see how -the weights change as it trains. If you have some real data you want to -try, you should be able to rip out any of the models from this notebook -and use them on it. - - -.. 
toctree:: - :hidden: - - /beginner/nlp/pytorch_tutorial - /beginner/nlp/deep_learning_tutorial - /beginner/nlp/word_embeddings_tutorial - /beginner/nlp/sequence_models_tutorial - /beginner/nlp/advanced_tutorial - - -.. galleryitem:: /beginner/nlp/pytorch_tutorial.py - :intro: All of deep learning is computations on tensors, which are generalizations of a matrix that can be - -.. galleryitem:: /beginner/nlp/deep_learning_tutorial.py - :intro: Deep learning consists of composing linearities with non-linearities in clever ways. The introduction of non-linearities allows - -.. galleryitem:: /beginner/nlp/word_embeddings_tutorial.py - :intro: Word embeddings are dense vectors of real numbers, one per word in your vocabulary. In NLP, it is almost always the case that your features are - -.. galleryitem:: /beginner/nlp/sequence_models_tutorial.py - :intro: At this point, we have seen various feed-forward networks. That is, there is no state maintained by the network at all. - -.. galleryitem:: /beginner/nlp/advanced_tutorial.py - :intro: Dynamic versus Static Deep Learning Toolkits. Pytorch is a *dynamic* neural network kit. - - -.. raw:: html - -
    diff --git a/beginner_source/deeplabv3_on_android.rst b/beginner_source/deeplabv3_on_android.rst index f2fe0e48f15..7ec83477373 100644 --- a/beginner_source/deeplabv3_on_android.rst +++ b/beginner_source/deeplabv3_on_android.rst @@ -1,230 +1,10 @@ Image Segmentation DeepLabV3 on Android ================================================= -**Author**: `Jeff Tang `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -**Reviewed by**: `Jeremiah Chung `_ +Redirecting in 3 seconds... -Introduction ------------- +.. raw:: html -Semantic image segmentation is a computer vision task that uses semantic labels to mark specific regions of an input image. The PyTorch semantic image segmentation `DeepLabV3 model `_ can be used to label image regions with `20 semantic classes `_ including, for example, bicycle, bus, car, dog, and person. Image segmentation models can be very useful in applications such as autonomous driving and scene understanding. - -In this tutorial, we will provide a step-by-step guide on how to prepare and run the PyTorch DeepLabV3 model on Android, taking you from the beginning of having a model you may want to use on Android to the end of having a complete Android app using the model. We will also cover practical and general tips on how to check if your next favorable pretrained PyTorch models can run on Android, and how to avoid pitfalls. - -.. note:: Before going through this tutorial, you should check out `PyTorch Mobile for Android `_ and give the PyTorch Android `Hello World `_ example app a quick try. This tutorial will go beyond the image classification model, usually the first kind of model deployed on mobile. The complete code for this tutorial is available `here `_. - -Learning Objectives -------------------- - -In this tutorial, you will learn how to: - -1. Convert the DeepLabV3 model for Android deployment. - -2. 
Get the output of the model for the example input image in Python and compare it to the output from the Android app. - -3. Build a new Android app or reuse an Android example app to load the converted model. - -4. Prepare the input into the format that the model expects and process the model output. - -5. Complete the UI, refactor, build and run the app to see image segmentation in action. - -Prerequisites ---------------- - -* PyTorch 1.6 or 1.7 - -* torchvision 0.7 or 0.8 - -* Android Studio 3.5.1 or above with NDK installed - -Steps ---------- - -1. Convert the DeepLabV3 model for Android deployment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The first step to deploying a model on Android is to convert the model into the `TorchScript `_ format. - -.. note:: - Not all PyTorch models can be converted to TorchScript at this time because a model definition may use language features that are not in TorchScript, which is a subset of Python. See the `Script and Optimize Recipe <../recipes/script_optimized.html>`_ for more details. - -Simply run the script below to generate the scripted model `deeplabv3_scripted.pt`: - -:: - - import torch - - # use deeplabv3_resnet50 instead of resnet101 to reduce the model size - model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scriptedm = torch.jit.script(model) - torch.jit.save(scriptedm, "deeplabv3_scripted.pt") - -The size of the generated `deeplabv3_scripted.pt` model file should be around 168MB. Ideally, a model should also be quantized for significant size reduction and faster inference before being deployed on an Android app. To have a general understanding of quantization, see the `Quantization Recipe <../recipes/quantization.html>`_ and the resource links there. We will cover in detail how to correctly apply a quantization workflow called Post Training `Static Quantization `_ to the DeepLabV3 model in a future tutorial or recipe. - -2. 
Get example input and output of the model in Python -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now that we have a scripted PyTorch model, let's test with some example inputs to make sure the model works correctly on Android. First, let's write a Python script that uses the model to make inferences and examine inputs and outputs. For this example of the DeepLabV3 model, we can reuse the code in Step 1 and in the `DeepLabV3 model hub site `_. Add the following code snippet to the code above: - -:: - - from PIL import Image - from torchvision import transforms - input_image = Image.open("deeplab.jpg") - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) - with torch.no_grad(): - output = model(input_batch)['out'][0] - - print(input_batch.shape) - print(output.shape) - -Download `deeplab.jpg` from `here `_, then run the script above and you will see the shapes of the input and output of the model: - -:: - - torch.Size([1, 3, 400, 400]) - torch.Size([21, 400, 400]) - -So if you provide the same image input `deeplab.jpg` of size 400x400 to the model on Android, the output of the model should have the size [21, 400, 400]. You should also print out at least the beginning parts of the actual data of the input and output, to be used in Step 4 below to compare with the actual input and output of the model when running in the Android app. - -3. Build a new Android app or reuse an example app and load the model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -First, follow Step 3 of the `Model Preparation for Android recipe <../recipes/model_preparation_android.html#add-the-model-and-pytorch-library-on-android>`_ to use our model in an Android Studio project with PyTorch Mobile enabled. 
Because both DeepLabV3 used in this tutorial and MobileNet v2 used in the PyTorch Hello World Android example are computer vision models, you can also get the `Hello World example repo `_ to make it easier to modify the code that loads the model and processes the input and output. The main goal in this step and Step 4 is to make sure the model `deeplabv3_scripted.pt` generated in Step 1 can indeed work correctly on Android. - -Now let's add `deeplabv3_scripted.pt` and `deeplab.jpg` used in Step 2 to the Android Studio project and modify the `onCreate` method in the `MainActivity` to resemble: - -.. code-block:: java - - Module module = null; - try { - module = Module.load(assetFilePath(this, "deeplabv3_scripted.pt")); - } catch (IOException e) { - Log.e("ImageSegmentation", "Error loading model!", e); - finish(); - } - -Then set a breakpoint at the line `finish()` and build and run the app. If the app doesn't stop at the breakpoint, it means that the scripted model in Step 1 has been successfully loaded on Android. - -4. Process the model input and output for model inference -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After the model loads in the previous step, let's verify that it works with expected inputs and can generate expected outputs. As the model input for the DeepLabV3 model is an image the same as that of the MobileNet v2 in the Hello World example, we will reuse some of the code in the `MainActivity.java `_ file from Hello World for input processing. Replace the code snippet between `line 50 `_ and 73 in `MainActivity.java` with the following code: - -.. 
code-block:: java - - final Tensor inputTensor = TensorImageUtils.bitmapToFloat32Tensor(bitmap, - TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, - TensorImageUtils.TORCHVISION_NORM_STD_RGB); - final float[] inputs = inputTensor.getDataAsFloatArray(); - - Map outTensors = - module.forward(IValue.from(inputTensor)).toDictStringKey(); - - // the key "out" of the output tensor contains the semantic masks - // see https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101 - final Tensor outputTensor = outTensors.get("out").toTensor(); - final float[] outputs = outputTensor.getDataAsFloatArray(); - - int width = bitmap.getWidth(); - int height = bitmap.getHeight(); - -.. note:: - The model output is a dictionary for the DeepLabV3 model so we use `toDictStringKey` to correctly extract the result. For other models, the model output may also be a single tensor or a tuple of tensors, among other things. - -With the code changes shown above, you can set breakpoints after `final float[] inputs` and `final float[] outputs`, which populate the input tensor and output tensor data to float arrays for easy debugging. Run the app and when it stops at the breakpoints, compare the numbers in `inputs` and `outputs` with the model input and output data you see in Step 2 to see if they match. For the same inputs to the models running on Android and Python, you should get the same outputs. - -.. warning:: - You may see different model outputs with the same image input when running on an Android emulator due to some Android emulator's floating point implementation issue. So it is best to test the app on a real Android device. - -All we have done so far is to confirm that the model of our interest can be scripted and run correctly in our Android app as in Python. The steps we walked through so far for using a model in an iOS app consumes the bulk, if not most, of our app development time, similar to how data preprocessing is the heaviest lift for a typical machine learning project. - -5. 
Complete the UI, refactor, build and run the app -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now we are ready to complete the app and the UI to actually see the processed result as a new image. The output processing code should be like this, added to the end of the code snippet in Step 4: - -.. code-block:: java - - int[] intValues = new int[width * height]; - // go through each element in the output of size [WIDTH, HEIGHT] and - // set different color for different classnum - for (int j = 0; j < width; j++) { - for (int k = 0; k < height; k++) { - // maxi: the index of the 21 CLASSNUM with the max probability - int maxi = 0, maxj = 0, maxk = 0; - double maxnum = -100000.0; - for (int i=0; i < CLASSNUM; i++) { - if (outputs[i*(width*height) + j*width + k] > maxnum) { - maxnum = outputs[i*(width*height) + j*width + k]; - maxi = i; maxj = j; maxk= k; - } - } - // color coding for person (red), dog (green), sheep (blue) - // black color for background and other classes - if (maxi == PERSON) - intValues[maxj*width + maxk] = 0xFFFF0000; // red - else if (maxi == DOG) - intValues[maxj*width + maxk] = 0xFF00FF00; // green - else if (maxi == SHEEP) - intValues[maxj*width + maxk] = 0xFF0000FF; // blue - else - intValues[maxj*width + maxk] = 0xFF000000; // black - } - } - -The constants used in the code above are defined in the beginning of the class `MainActivity`: - -.. code-block:: java - - private static final int CLASSNUM = 21; - private static final int DOG = 12; - private static final int PERSON = 15; - private static final int SHEEP = 17; - - -The implementation here is based on the understanding of the DeepLabV3 model which outputs a tensor of size [21, width, height] for an input image of width*height. Each element in the width*height output array is a value between 0 and 20 (for a total of 21 semantic labels described in Introduction) and the value is used to set a specific color. 
Color coding of the segmentation here is based on the class with the highest probability, and you can extend the color coding for all classes in your own dataset. - -After the output processing, you will also need to call the code below to render the RGB `intValues` array to a bitmap instance `outputBitmap` before displaying it on an `ImageView`: - -.. code-block:: java - - Bitmap bmpSegmentation = Bitmap.createScaledBitmap(bitmap, width, height, true); - Bitmap outputBitmap = bmpSegmentation.copy(bmpSegmentation.getConfig(), true); - outputBitmap.setPixels(intValues, 0, outputBitmap.getWidth(), 0, 0, - outputBitmap.getWidth(), outputBitmap.getHeight()); - imageView.setImageBitmap(outputBitmap); - -The UI for this app is also similar to that for Hello World, except that you do not need the `TextView` to show the image classification result. You can also add two buttons `Segment` and `Restart` as shown in the code repository to run the model inference and to show back the original image after the segmentation result is shown. - -Now when you run the app on an Android emulator or preferably an actual device, you will see screens like the following: - -.. image:: /_static/img/deeplabv3_android.png - :width: 300 px -.. image:: /_static/img/deeplabv3_android2.png - :width: 300 px - - -Recap --------- - -In this tutorial, we described what it takes to convert a pretrained PyTorch DeepLabV3 model for Android and how to make sure the model can run successfully on Android. Our focus was to help you understand the process of confirming that a model can indeed run on Android. The complete code repository is available `here `_. - -More advanced topics such as quantization and using models via transfer learning or of your own on Android will be covered soon in future demo apps and tutorials. - - -Learn More ------------- - -1. `PyTorch Mobile site `_ -2. `DeepLabV3 model `_ -3. 
`DeepLabV3 paper `_ + diff --git a/beginner_source/deeplabv3_on_ios.rst b/beginner_source/deeplabv3_on_ios.rst index 5a88c703bd8..66c052419fc 100644 --- a/beginner_source/deeplabv3_on_ios.rst +++ b/beginner_source/deeplabv3_on_ios.rst @@ -1,248 +1,10 @@ Image Segmentation DeepLabV3 on iOS ============================================== -**Author**: `Jeff Tang `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -**Reviewed by**: `Jeremiah Chung `_ +Redirecting in 3 seconds... -Introduction ------------- +.. raw:: html -Semantic image segmentation is a computer vision task that uses semantic labels to mark specific regions of an input image. The PyTorch semantic image segmentation `DeepLabV3 model `_ can be used to label image regions with `20 semantic classes `_ including, for example, bicycle, bus, car, dog, and person. Image segmentation models can be very useful in applications such as autonomous driving and scene understanding. - -In this tutorial, we will provide a step-by-step guide on how to prepare and run the PyTorch DeepLabV3 model on iOS, taking you from the beginning of having a model you may want to use on iOS to the end of having a complete iOS app using the model. We will also cover practical and general tips on how to check if your next favorite pretrained PyTorch models can run on iOS, and how to avoid pitfalls. - -.. note:: Before going through this tutorial, you should check out `PyTorch Mobile for iOS `_ and give the PyTorch iOS `HelloWorld `_ example app a quick try. This tutorial will go beyond the image classification model, usually the first kind of model deployed on mobile. The complete code for this tutorial is available `here `_. - -Learning Objectives -------------------- - -In this tutorial, you will learn how to: - -1. Convert the DeepLabV3 model for iOS deployment. - -2. Get the output of the model for the example input image in Python and compare it to the output from the iOS app. - -3. 
Build a new iOS app or reuse an iOS example app to load the converted model. - -4. Prepare the input into the format that the model expects and process the model output. - -5. Complete the UI, refactor, build and run the app to see image segmentation in action. - -Prerequisites ---------------- - -* PyTorch 1.6 or 1.7 - -* torchvision 0.7 or 0.8 - -* Xcode 11 or 12 - -Steps ---------- - - -1. Convert the DeepLabV3 model for iOS deployment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The first step to deploying a model on iOS is to convert the model into the `TorchScript `_ format. - -.. note:: - Not all PyTorch models can be converted to TorchScript at this time because a model definition may use language features that are not in TorchScript, which is a subset of Python. See the `Script and Optimize Recipe <../recipes/script_optimized.html>`_ for more details. - -Simply run the script below to generate the scripted model `deeplabv3_scripted.pt`: - -:: - - import torch - - # use deeplabv3_resnet50 instead of deeplabv3_resnet101 to reduce the model size - model = torch.hub.load('pytorch/vision:v0.8.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scriptedm = torch.jit.script(model) - torch.jit.save(scriptedm, "deeplabv3_scripted.pt") - -The size of the generated `deeplabv3_scripted.pt` model file should be around 168MB. Ideally, a model should also be quantized for significant size reduction and faster inference before being deployed on an iOS app. To have a general understanding of quantization, see the `Quantization Recipe <../recipes/quantization.html>`_ and the resource links there. We will cover in detail how to correctly apply a quantization workflow called Post Training `Static Quantization `_ to the DeepLabV3 model in a future tutorial or recipe. - -2. 
Get example input and output of the model in Python -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now that we have a scripted PyTorch model, let's test with some example inputs to make sure the model works correctly on iOS. First, let's write a Python script that uses the model to make inferences and examine inputs and outputs. For this example of the DeepLabV3 model, we can reuse the code in Step 1 and in the `DeepLabV3 model hub site `_. Add the following code snippet to the code above: - -:: - - from PIL import Image - from torchvision import transforms - input_image = Image.open("deeplab.jpg") - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) - with torch.no_grad(): - output = model(input_batch)['out'][0] - - print(input_batch.shape) - print(output.shape) - -Download `deeplab.jpg` from `here `_ and run the script above to see the shapes of the input and output of the model: - -:: - - torch.Size([1, 3, 400, 400]) - torch.Size([21, 400, 400]) - -So if you provide the same image input `deeplab.jpg` of size 400x400 to the model on iOS, the output of the model should have the size [21, 400, 400]. You should also print out at least the beginning parts of the actual data of the input and output, to be used in Step 4 below to compare with the actual input and output of the model when running in the iOS app. - -3. Build a new iOS app or reuse an example app and load the model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -First, follow Step 3 of the `Model Preparation for iOS recipe <../recipes/model_preparation_ios.html#add-the-model-and-pytorch-library-on-ios>`_ to use our model in an Xcode project with PyTorch Mobile enabled. 
Because both the DeepLabV3 model used in this tutorial and the MobileNet v2 model used in the PyTorch Hello World iOS example are computer vision models, you may choose to start with the `HelloWorld example repo `_ as a template to reuse the code that loads the model and processes the input and output. - -Now let's add `deeplabv3_scripted.pt` and `deeplab.jpg` used in Step 2 to the Xcode project and modify `ViewController.swift` to resemble: - -.. code-block:: swift - - class ViewController: UIViewController { - var image = UIImage(named: "deeplab.jpg")! - - override func viewDidLoad() { - super.viewDidLoad() - } - - private lazy var module: TorchModule = { - if let filePath = Bundle.main.path(forResource: "deeplabv3_scripted", - ofType: "pt"), - let module = TorchModule(fileAtPath: filePath) { - return module - } else { - fatalError("Can't load the model file!") - } - }() - } - -Then set a breakpoint at the line `return module` and build and run the app. The app should stop at the breakpoint, meaning that the scripted model in Step 1 has been successfully loaded on iOS. - -4. Process the model input and output for model inference -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After the model loads in the previous step, let's verify that it works with expected inputs and can generate expected outputs. As the model input for the DeepLabV3 model is an image, the same as that of the MobileNet v2 in the Hello World example, we will reuse some of the code in the `TorchModule.mm `_ file from Hello World for input processing. Replace the `predictImage` method implementation in `TorchModule.mm` with the following code: - -.. code-block:: objective-c - - - (unsigned char*)predictImage:(void*)imageBuffer { - // 1. 
the example deeplab.jpg size is size 400x400 and there are 21 semantic classes - const int WIDTH = 400; - const int HEIGHT = 400; - const int CLASSNUM = 21; - - at::Tensor tensor = torch::from_blob(imageBuffer, {1, 3, WIDTH, HEIGHT}, at::kFloat); - torch::autograd::AutoGradMode guard(false); - at::AutoNonVariableTypeMode non_var_type_mode(true); - - // 2. convert the input tensor to an NSMutableArray for debugging - float* floatInput = tensor.data_ptr(); - if (!floatInput) { - return nil; - } - NSMutableArray* inputs = [[NSMutableArray alloc] init]; - for (int i = 0; i < 3 * WIDTH * HEIGHT; i++) { - [inputs addObject:@(floatInput[i])]; - } - - // 3. the output of the model is a dictionary of string and tensor, as - // specified at https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101 - auto outputDict = _impl.forward({tensor}).toGenericDict(); - - // 4. convert the output to another NSMutableArray for easy debugging - auto outputTensor = outputDict.at("out").toTensor(); - float* floatBuffer = outputTensor.data_ptr(); - if (!floatBuffer) { - return nil; - } - NSMutableArray* results = [[NSMutableArray alloc] init]; - for (int i = 0; i < CLASSNUM * WIDTH * HEIGHT; i++) { - [results addObject:@(floatBuffer[i])]; - } - - return nil; - } - -.. note:: - The model output is a dictionary for the DeepLabV3 model so we use `toGenericDict` to correctly extract the result. For other models, the model output may also be a single tensor or a tuple of tensors, among other things. - -With the code changes shown above, you can set breakpoints after the two for loops that populate `inputs` and `results` and compare them with the model input and output data you saw in Step 2 to see if they match. For the same inputs to the models running on iOS and Python, you should get the same outputs. - -All we have done so far is to confirm that the model of our interest can be scripted and run correctly in our iOS app as in Python. 
The steps we walked through so far for using a model in an iOS app consumes the bulk, if not most, of our app development time, similar to how data preprocessing is the heaviest lift for a typical machine learning project. - -5. Complete the UI, refactor, build and run the app -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Now we are ready to complete the app and the UI to actually see the processed result as a new image. The output processing code should be like this, added to the end of the code snippet in Step 4 in `TorchModule.mm` - remember to first remove the line `return nil;` temporarily put there to make the code build and run: - -.. code-block:: objective-c - - // see the 20 semantic classes link in Introduction - const int DOG = 12; - const int PERSON = 15; - const int SHEEP = 17; - - NSMutableData* data = [NSMutableData dataWithLength: - sizeof(unsigned char) * 3 * WIDTH * HEIGHT]; - unsigned char* buffer = (unsigned char*)[data mutableBytes]; - // go through each element in the output of size [WIDTH, HEIGHT] and - // set different color for different classnum - for (int j = 0; j < WIDTH; j++) { - for (int k = 0; k < HEIGHT; k++) { - // maxi: the index of the 21 CLASSNUM with the max probability - int maxi = 0, maxj = 0, maxk = 0; - float maxnum = -100000.0; - for (int i = 0; i < CLASSNUM; i++) { - if ([results[i * (WIDTH * HEIGHT) + j * WIDTH + k] floatValue] > maxnum) { - maxnum = [results[i * (WIDTH * HEIGHT) + j * WIDTH + k] floatValue]; - maxi = i; maxj = j; maxk = k; - } - } - int n = 3 * (maxj * width + maxk); - // color coding for person (red), dog (green), sheep (blue) - // black color for background and other classes - buffer[n] = 0; buffer[n+1] = 0; buffer[n+2] = 0; - if (maxi == PERSON) buffer[n] = 255; - else if (maxi == DOG) buffer[n+1] = 255; - else if (maxi == SHEEP) buffer[n+2] = 255; - } - } - return buffer; - -The implementation here is based on the understanding of the DeepLabV3 model which outputs a tensor of size 
[21, width, height] for an input image of width*height. Each element in the width*height output array is a value between 0 and 20 (for a total of 21 semantic labels described in Introduction) and the value is used to set a specific color. Color coding of the segmentation here is based on the class with the highest probability, and you can extend the color coding for all classes in your own dataset. - -After the output processing, you will also need to call a helper function to convert the RGB `buffer` to an `UIImage` instance to be shown on `UIImageView`. You can refer to the example code `convertRGBBufferToUIImage` defined in `UIImageHelper.mm` in the code repository. - -The UI for this app is also similar to that for Hello World, except that you do not need the `UITextView` to show the image classification result. You can also add two buttons `Segment` and `Restart` as shown in the code repository to run the model inference and to show back the original image after the segmentation result is shown. - -The last step before we can run the app is to connect all the pieces together. Modify the `ViewController.swift` file to use the `predictImage`, which is refactored and changed to `segmentImage` in the repository, and helper functions you built as shown in the example code in the repository in `ViewController.swift`. Connect the buttons to the actions and you should be good to go. - -Now when you run the app on an iOS simulator or an actual iOS device, you will see the following screens: - -.. image:: /_static/img/deeplabv3_ios.png - :width: 300 px -.. image:: /_static/img/deeplabv3_ios2.png - :width: 300 px - - -Recap --------- - -In this tutorial, we described what it takes to convert a pretrained PyTorch DeepLabV3 model for iOS and how to make sure the model can run successfully on iOS. Our focus was to help you understand the process of confirming that a model can indeed run on iOS. The complete code repository is available `here `_. 
- -More advanced topics such as quantization and using models via transfer learning or of your own on iOS will be covered soon in future demo apps and tutorials. - -Learn More ------------- - -1. `PyTorch Mobile site `_ -2. `DeepLabV3 model `_ -3. `DeepLabV3 paper `_ + diff --git a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py b/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py index 508fa5a057a..1fb0f4d24b4 100644 --- a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py +++ b/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py @@ -3,6 +3,8 @@ Deploying a Seq2Seq Model with TorchScript ================================================== **Author:** `Matthew Inkawhich `_ + +.. warning:: TorchScript is no longer in active development. """ diff --git a/beginner_source/dist_overview.rst b/beginner_source/dist_overview.rst index 12e9bfa0e55..2c74bb51a04 100644 --- a/beginner_source/dist_overview.rst +++ b/beginner_source/dist_overview.rst @@ -1,6 +1,6 @@ PyTorch Distributed Overview ============================ -**Author**: `Shen Li `_ +**Author**: `Will Constable `_ .. note:: |edit| View and edit this tutorial in `github `__. @@ -15,192 +15,80 @@ to the technology that can best serve your use case. Introduction ------------ -As of PyTorch v1.6.0, features in ``torch.distributed`` can be categorized into -three main components: - -* `Distributed Data-Parallel Training `__ - (DDP) is a widely adopted single-program multiple-data training paradigm. With - DDP, the model is replicated on every process, and every model replica will be - fed with a different set of input data samples. DDP takes care of gradient - communication to keep model replicas synchronized and overlaps it with the - gradient computations to speed up training. 
-* `RPC-Based Distributed Training `__ - (RPC) supports general training structures that cannot fit into - data-parallel training such as distributed pipeline parallelism, parameter - server paradigm, and combinations of DDP with other training paradigms. It - helps manage remote object lifetime and extends the - `autograd engine `__ beyond - machine boundaries. -* `Collective Communication `__ - (c10d) library supports sending tensors across processes within a group. It - offers both collective communication APIs (e.g., - `all_reduce `__ +The PyTorch Distributed library includes a collection of parallelism modules, +a communications layer, and infrastructure for launching and +debugging large training jobs. + + +Parallelism APIs +**************** + +These Parallelism Modules offer high-level functionality and compose with existing models: + +- `Distributed Data-Parallel (DDP) `__ +- `Fully Sharded Data-Parallel Training (FSDP) `__ +- `Tensor Parallel (TP) `__ +- `Pipeline Parallel (PP) `__ + +Sharding primitives +******************* + +``DTensor`` and ``DeviceMesh`` are primitives used to build parallelism in terms of sharded or replicated tensors on N-dimensional process groups. + +- `DTensor `__ represents a tensor that is sharded and/or replicated, and communicates automatically to reshard tensors as needed by operations. +- `DeviceMesh `__ abstracts the accelerator device communicators into a multi-dimensional array, which manages the underlying ``ProcessGroup`` instances for collective communications in multi-dimensional parallelisms. Try out our `Device Mesh Recipe `__ to learn more. + +Communications APIs +******************* + +The `PyTorch distributed communication layer (C10D) `__ offers both collective communication APIs (e.g., `all_reduce `__ and `all_gather `__) and P2P communication APIs (e.g., `send `__ - and `isend `__). 
- DDP and RPC (`ProcessGroup Backend `__) - are built on c10d, where the former uses collective communications - and the latter uses P2P communications. Usually, developers do not need to - directly use this raw communication API, as the DDP and RPC APIs can serve - many distributed training scenarios. However, there are use cases where this API - is still helpful. One example would be distributed parameter averaging, where - applications would like to compute the average values of all model parameters - after the backward pass instead of using DDP to communicate gradients. This can - decouple communications from computations and allow finer-grain control over - what to communicate, but on the other hand, it also gives up the performance - optimizations offered by DDP. + and `isend `__), + which are used under the hood in all of the parallelism implementations. `Writing Distributed Applications with PyTorch <../intermediate/dist_tuto.html>`__ shows examples of using c10d communication APIs. +Launcher +******** -Data Parallel Training ----------------------- +`torchrun `__ is a widely-used launcher script, which spawns processes on the local and remote machines for running distributed PyTorch programs. -PyTorch provides several options for data-parallel training. For applications -that gradually grow from simple to complex and from prototype to production, the -common development trajectory would be: -1. Use single-device training if the data and model can fit in one GPU, and - training speed is not a concern. -2. Use single-machine multi-GPU - `DataParallel `__ - to make use of multiple GPUs on a single machine to speed up training with - minimal code changes. -3. Use single-machine multi-GPU - `DistributedDataParallel `__, - if you would like to further speed up training and are willing to write a - little more code to set it up. -4. 
Use multi-machine `DistributedDataParallel `__ - and the `launching script `__, - if the application needs to scale across machine boundaries. -5. Use `torch.distributed.elastic `__ - to launch distributed training if errors (e.g., out-of-memory) are expected or if - resources can join and leave dynamically during training. +Applying Parallelism To Scale Your Model +---------------------------------------- +Data Parallelism is a widely adopted single-program multiple-data training paradigm +where the model is replicated on every process, every model replica computes local gradients for +a different set of input data samples, and gradients are averaged within the data-parallel communicator group before each optimizer step. -.. note:: Data-parallel training also works with `Automatic Mixed Precision (AMP) `__. +Model Parallelism techniques (or Sharded Data Parallelism) are required when a model doesn't fit on a single GPU, and can be combined together to form multi-dimensional (N-D) parallelism techniques. +When deciding what parallelism techniques to choose for your model, use these common guidelines: -``torch.nn.DataParallel`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The `DataParallel `__ -package enables single-machine multi-GPU parallelism with the lowest coding -hurdle. It only requires a one-line change to the application code. The tutorial -`Optional: Data Parallelism <../beginner/blitz/data_parallel_tutorial.html>`__ -shows an example. Although ``DataParallel`` is very easy to -use, it usually does not offer the best performance because it replicates the -model in every forward pass, and its single-process multi-thread parallelism -naturally suffers from -`GIL `__ contention. To get -better performance, consider using -`DistributedDataParallel `__. - - -``torch.nn.parallel.DistributedDataParallel`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Compared to `DataParallel `__, -`DistributedDataParallel `__ -requires one more step to set up, i.e., calling -`init_process_group `__. 
-DDP uses multi-process parallelism, and hence there is no GIL contention across -model replicas. Moreover, the model is broadcast at DDP construction time instead -of in every forward pass, which also helps to speed up training. DDP is shipped -with several performance optimization technologies. For a more in-depth -explanation, refer to this -`paper `__ (VLDB'20). - - -DDP materials are listed below: - -1. `DDP notes `__ - offer a starter example and some brief descriptions of its design and - implementation. If this is your first time using DDP, start from this - document. -2. `Getting Started with Distributed Data Parallel <../intermediate/ddp_tutorial.html>`__ - explains some common problems with DDP training, including unbalanced - workload, checkpointing, and multi-device models. Note that, DDP can be - easily combined with single-machine multi-device model parallelism which is - described in the - `Single-Machine Model Parallel Best Practices <../intermediate/model_parallel_tutorial.html>`__ - tutorial. -3. The `Launching and configuring distributed data parallel applications `__ - document shows how to use the DDP launching script. -4. The `Shard Optimizer States With ZeroRedundancyOptimizer <../recipes/zero_redundancy_optimizer.html>`__ - recipe demonstrates how `ZeroRedundancyOptimizer `__ - helps to reduce optimizer memory footprint. -5. The `Distributed Training with Uneven Inputs Using the Join Context Manager <../advanced/generic_join.html>`__ - tutorial walks through using the generic join context for distributed training with uneven inputs. - -torch.distributed.elastic -~~~~~~~~~~~~~~~~~~~~~~~~~ - -With the growth of the application complexity and scale, failure recovery -becomes a requirement. Sometimes it is inevitable to hit errors -like out-of-memory (OOM) when using DDP, but DDP itself cannot recover from those errors, -and it is not possible to handle them using a standard ``try-except`` construct. 
-This is because DDP requires all processes to operate in a closely synchronized manner -and all ``AllReduce`` communications launched in different processes must match. -If one of the processes in the group -throws an exception, it is likely to lead to desynchronization (mismatched -``AllReduce`` operations) which would then cause a crash or hang. -`torch.distributed.elastic `__ -adds fault tolerance and the ability to make use of a dynamic pool of machines (elasticity). - -RPC-Based Distributed Training ----------------------------- - -Many training paradigms do not fit into data parallelism, e.g., -parameter server paradigm, distributed pipeline parallelism, reinforcement -learning applications with multiple observers or agents, etc. -`torch.distributed.rpc `__ aims at -supporting general distributed training scenarios. - -`torch.distributed.rpc `__ -has four main pillars: - -* `RPC `__ supports running - a given function on a remote worker. -* `RRef `__ helps to manage the - lifetime of a remote object. The reference counting protocol is presented in the - `RRef notes `__. -* `Distributed Autograd `__ - extends the autograd engine beyond machine boundaries. Please refer to - `Distributed Autograd Design `__ - for more details. -* `Distributed Optimizer `__ - automatically reaches out to all participating workers to update - parameters using gradients computed by the distributed autograd engine. - -RPC Tutorials are listed below: - -1. The `Getting Started with Distributed RPC Framework <../intermediate/rpc_tutorial.html>`__ - tutorial first uses a simple Reinforcement Learning (RL) example to - demonstrate RPC and RRef. Then, it applies a basic distributed model - parallelism to an RNN example to show how to use distributed autograd and - distributed optimizer. -2. The `Implementing a Parameter Server Using Distributed RPC Framework <../intermediate/rpc_param_server_tutorial.html>`__ - tutorial borrows the spirit of - `HogWild! 
training `__ - and applies it to an asynchronous parameter server (PS) training application. -3. The `Distributed Pipeline Parallelism Using RPC <../intermediate/dist_pipeline_parallel_tutorial.html>`__ - tutorial extends the single-machine pipeline parallel example (presented in - `Single-Machine Model Parallel Best Practices <../intermediate/model_parallel_tutorial.html>`__) - to a distributed environment and shows how to implement it using RPC. -4. The `Implementing Batch RPC Processing Using Asynchronous Executions <../intermediate/rpc_async_execution.html>`__ - tutorial demonstrates how to implement RPC batch processing using the - `@rpc.functions.async_execution `__ - decorator, which can help speed up inference and training. It uses - RL and PS examples similar to those in the above tutorials 1 and 2. -5. The `Combining Distributed DataParallel with Distributed RPC Framework <../advanced/rpc_ddp_tutorial.html>`__ - tutorial demonstrates how to combine DDP with RPC to train a model using - distributed data parallelism combined with distributed model parallelism. +#. Use `DistributedDataParallel (DDP) `__, + if your model fits in a single GPU but you want to easily scale up training using multiple GPUs. + + * Use `torchrun `__ to launch multiple PyTorch processes if you are using more than one node. + + * See also: `Getting Started with Distributed Data Parallel <../intermediate/ddp_tutorial.html>`__ + +#. Use `FullyShardedDataParallel (FSDP) `__ when your model cannot fit on one GPU. + + * See also: `Getting Started with FSDP `__ + +#. Use `Tensor Parallel (TP) `__ and/or `Pipeline Parallel (PP) `__ if you reach scaling limitations with FSDP. + + * Try our `Tensor Parallelism Tutorial `__ + + * See also: `TorchTitan end to end example of 3D parallelism `__ + +.. note:: Data-parallel training also works with `Automatic Mixed Precision (AMP) `__. 
PyTorch Distributed Developers ------------------------------ -If you'd like to contribute to PyTorch Distributed, please refer to our +If you'd like to contribute to PyTorch Distributed, refer to our `Developer Guide `_. diff --git a/beginner_source/fgsm_tutorial.py b/beginner_source/fgsm_tutorial.py index 007ad3fd956..9bdf52d84b4 100644 --- a/beginner_source/fgsm_tutorial.py +++ b/beginner_source/fgsm_tutorial.py @@ -192,7 +192,7 @@ def forward(self, x): model = Net().to(device) # Load the pretrained model -model.load_state_dict(torch.load(pretrained_model, map_location=device)) +model.load_state_dict(torch.load(pretrained_model, map_location=device, weights_only=True)) # Set the model in evaluation mode. In this case this is for the Dropout layers model.eval() diff --git a/beginner_source/former_torchies/README.txt b/beginner_source/former_torchies/README.txt deleted file mode 100644 index 5bb8c93f00c..00000000000 --- a/beginner_source/former_torchies/README.txt +++ /dev/null @@ -1,18 +0,0 @@ - PyTorch for former Torch users - ------------------------------ - -1. tensor_tutorial_old.py - Tensors - https://pytorch.org/tutorials/beginner/former_torchies/tensor_tutorial_old.html - -2. autograd_tutorial_old.py - Autograd - https://pytorch.org/tutorials/beginner/former_torchies/autograd_tutorial_old.html - -3. nnft_tutorial.py - nn package - https://pytorch.org/tutorials/beginner/former_torchies/nnft_tutorial.html - -4. parallelism_tutorial.py - Multi-GPU examples - https://pytorch.org/tutorials/beginner/former_torchies/parallelism_tutorial.html diff --git a/beginner_source/former_torchies/autograd_tutorial_old.py b/beginner_source/former_torchies/autograd_tutorial_old.py deleted file mode 100644 index 4030831b8ef..00000000000 --- a/beginner_source/former_torchies/autograd_tutorial_old.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Autograd -======== - -Autograd is now a core torch package for automatic differentiation. 
-It uses a tape based system for automatic differentiation. - -In the forward phase, the autograd tape will remember all the operations -it executed, and in the backward phase, it will replay the operations. - -Tensors that track history --------------------------- - -In autograd, if any input ``Tensor`` of an operation has ``requires_grad=True``, -the computation will be tracked. After computing the backward pass, a gradient -w.r.t. this tensor is accumulated into ``.grad`` attribute. - -There’s one more class which is very important for autograd -implementation - a ``Function``. ``Tensor`` and ``Function`` are -interconnected and build up an acyclic graph, that encodes a complete -history of computation. Each variable has a ``.grad_fn`` attribute that -references a function that has created a function (except for Tensors -created by the user - these have ``None`` as ``.grad_fn``). - -If you want to compute the derivatives, you can call ``.backward()`` on -a ``Tensor``. If ``Tensor`` is a scalar (i.e. it holds a one element -tensor), you don’t need to specify any arguments to ``backward()``, -however if it has more elements, you need to specify a ``grad_output`` -argument that is a tensor of matching shape. 
-""" - -import torch - -############################################################### -# Create a tensor and set requires_grad=True to track computation with it -x = torch.ones(2, 2, requires_grad=True) -print(x) - -############################################################### -# -print(x.data) - -############################################################### -# -print(x.grad) - -############################################################### -# - -print(x.grad_fn) # we've created x ourselves - -############################################################### -# Do an operation of x: - -y = x + 2 -print(y) - -############################################################### -# y was created as a result of an operation, -# so it has a grad_fn -print(y.grad_fn) - -############################################################### -# More operations on y: - -z = y * y * 3 -out = z.mean() - -print(z, out) - -################################################################ -# ``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad`` -# flag in-place. The input flag defaults to ``True`` if not given. -a = torch.randn(2, 2) -a = ((a * 3) / (a - 1)) -print(a.requires_grad) -a.requires_grad_(True) -print(a.requires_grad) -b = (a * a).sum() -print(b.grad_fn) - -############################################################### -# Gradients -# --------- -# -# let's backprop now and print gradients d(out)/dx - -out.backward() -print(x.grad) - - -############################################################### -# By default, gradient computation flushes all the internal buffers -# contained in the graph, so if you even want to do the backward on some -# part of the graph twice, you need to pass in ``retain_variables = True`` -# during the first pass. 
- -x = torch.ones(2, 2, requires_grad=True) -y = x + 2 -y.backward(torch.ones(2, 2), retain_graph=True) -# the retain_variables flag will prevent the internal buffers from being freed -print(x.grad) - -############################################################### -# -z = y * y -print(z) - -############################################################### -# -# just backprop random gradients - -gradient = torch.randn(2, 2) - -# this would fail if we didn't specify -# that we want to retain variables -y.backward(gradient) - -print(x.grad) - -############################################################### -# You can also stop autograd from tracking history on Tensors -# with requires_grad=True by wrapping the code block in -# ``with torch.no_grad():`` -print(x.requires_grad) -print((x ** 2).requires_grad) - -with torch.no_grad(): - print((x ** 2).requires_grad) diff --git a/beginner_source/former_torchies/autograd_tutorial_old.rst b/beginner_source/former_torchies/autograd_tutorial_old.rst new file mode 100644 index 00000000000..8c887e00c8a --- /dev/null +++ b/beginner_source/former_torchies/autograd_tutorial_old.rst @@ -0,0 +1,8 @@ +Autograd +============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html + +.. raw:: html + + diff --git a/beginner_source/former_torchies/nnft_tutorial.py b/beginner_source/former_torchies/nnft_tutorial.py deleted file mode 100644 index 316bf03a985..00000000000 --- a/beginner_source/former_torchies/nnft_tutorial.py +++ /dev/null @@ -1,266 +0,0 @@ -# -*- coding: utf-8 -*- -""" -nn package -========== - -We’ve redesigned the nn package, so that it’s fully integrated with -autograd. Let's review the changes. - -**Replace containers with autograd:** - - You no longer have to use Containers like ``ConcatTable``, or modules like - ``CAddTable``, or use and debug with nngraph. 
We will seamlessly use - autograd to define our neural networks. For example, - - * ``output = nn.CAddTable():forward({input1, input2})`` simply becomes - ``output = input1 + input2`` - * ``output = nn.MulConstant(0.5):forward(input)`` simply becomes - ``output = input * 0.5`` - -**State is no longer held in the module, but in the network graph:** - - Using recurrent networks should be simpler because of this reason. If - you want to create a recurrent network, simply use the same Linear layer - multiple times, without having to think about sharing weights. - - .. figure:: /_static/img/torch-nn-vs-pytorch-nn.png - :alt: torch-nn-vs-pytorch-nn - - torch-nn-vs-pytorch-nn - -**Simplified debugging:** - - Debugging is intuitive using Python’s pdb debugger, and **the debugger - and stack traces stop at exactly where an error occurred.** What you see - is what you get. - -Example 1: ConvNet ------------------- - -Let’s see how to create a small ConvNet. - -All of your networks are derived from the base class ``nn.Module``: - -- In the constructor, you declare all the layers you want to use. 
-- In the forward function, you define how your model is going to be - run, from input to output -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class MNISTConvNet(nn.Module): - - def __init__(self): - # this is the place where you instantiate all your modules - # you can later access them using the same names you've given them in - # here - super(MNISTConvNet, self).__init__() - self.conv1 = nn.Conv2d(1, 10, 5) - self.pool1 = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(10, 20, 5) - self.pool2 = nn.MaxPool2d(2, 2) - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - # it's the forward function that defines the network structure - # we're accepting only a single input in here, but if you want, - # feel free to use more - def forward(self, input): - x = self.pool1(F.relu(self.conv1(input))) - x = self.pool2(F.relu(self.conv2(x))) - - # in your model definition you can go full crazy and use arbitrary - # python code to define your model structure - # all these are perfectly legal, and will be handled correctly - # by autograd: - # if x.gt(0) > x.numel() / 2: - # ... - # - # you can even do a loop and reuse the same module inside it - # modules no longer hold ephemeral state, so you can use them - # multiple times during your forward pass - # while x.norm(2) < 10: - # x = self.conv1(x) - - x = x.view(x.size(0), -1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - return x - -############################################################### -# Let's use the defined ConvNet now. -# You create an instance of the class first. - - -net = MNISTConvNet() -print(net) - -######################################################################## -# .. note:: -# -# ``torch.nn`` only supports mini-batches The entire ``torch.nn`` -# package only supports inputs that are a mini-batch of samples, and not -# a single sample. -# -# For example, ``nn.Conv2d`` will take in a 4D Tensor of -# ``nSamples x nChannels x Height x Width``. 
-# -# If you have a single sample, just use ``input.unsqueeze(0)`` to add -# a fake batch dimension. -# -# Create a mini-batch containing a single sample of random data and send the -# sample through the ConvNet. - -input = torch.randn(1, 1, 28, 28) -out = net(input) -print(out.size()) - -######################################################################## -# Define a dummy target label and compute error using a loss function. - -target = torch.tensor([3], dtype=torch.long) -loss_fn = nn.CrossEntropyLoss() # LogSoftmax + ClassNLL Loss -err = loss_fn(out, target) -err.backward() - -print(err) - -######################################################################## -# The output of the ConvNet ``out`` is a ``Tensor``. We compute the loss -# using that, and that results in ``err`` which is also a ``Tensor``. -# Calling ``.backward`` on ``err`` hence will propagate gradients all the -# way through the ConvNet to it’s weights -# -# Let's access individual layer weights and gradients: - -print(net.conv1.weight.grad.size()) - -######################################################################## -print(net.conv1.weight.data.norm()) # norm of the weight -print(net.conv1.weight.grad.data.norm()) # norm of the gradients - -######################################################################## -# Forward and Backward Function Hooks -# ----------------------------------- -# -# We’ve inspected the weights and the gradients. But how about inspecting -# / modifying the output and grad\_output of a layer? -# -# We introduce **hooks** for this purpose. -# -# You can register a function on a ``Module`` or a ``Tensor``. -# The hook can be a forward hook or a backward hook. -# The forward hook will be executed when a forward call is executed. -# The backward hook will be executed in the backward phase. -# Let’s look at an example. 
-# -# We register a forward hook on conv2 and print some information - - -def printnorm(self, input, output): - # input is a tuple of packed inputs - # output is a Tensor. output.data is the Tensor we are interested - print('Inside ' + self.__class__.__name__ + ' forward') - print('') - print('input: ', type(input)) - print('input[0]: ', type(input[0])) - print('output: ', type(output)) - print('') - print('input size:', input[0].size()) - print('output size:', output.data.size()) - print('output norm:', output.data.norm()) - - -net.conv2.register_forward_hook(printnorm) - -out = net(input) - -######################################################################## -# -# We register a backward hook on conv2 and print some information - - -def printgradnorm(self, grad_input, grad_output): - print('Inside ' + self.__class__.__name__ + ' backward') - print('Inside class:' + self.__class__.__name__) - print('') - print('grad_input: ', type(grad_input)) - print('grad_input[0]: ', type(grad_input[0])) - print('grad_output: ', type(grad_output)) - print('grad_output[0]: ', type(grad_output[0])) - print('') - print('grad_input size:', grad_input[0].size()) - print('grad_output size:', grad_output[0].size()) - print('grad_input norm:', grad_input[0].norm()) - - -net.conv2.register_backward_hook(printgradnorm) - -out = net(input) -err = loss_fn(out, target) -err.backward() - -######################################################################## -# A full and working MNIST example is located here -# https://github.com/pytorch/examples/tree/master/mnist -# -# Example 2: Recurrent Net -# ------------------------ -# -# Next, let’s look at building recurrent nets with PyTorch. -# -# Since the state of the network is held in the graph and not in the -# layers, you can simply create an nn.Linear and reuse it over and over -# again for the recurrence. 
- - -class RNN(nn.Module): - - # you can also accept arguments in your model constructor - def __init__(self, data_size, hidden_size, output_size): - super(RNN, self).__init__() - - self.hidden_size = hidden_size - input_size = data_size + hidden_size - - self.i2h = nn.Linear(input_size, hidden_size) - self.h2o = nn.Linear(hidden_size, output_size) - - def forward(self, data, last_hidden): - input = torch.cat((data, last_hidden), 1) - hidden = self.i2h(input) - output = self.h2o(hidden) - return hidden, output - - -rnn = RNN(50, 20, 10) - -######################################################################## -# -# A more complete Language Modeling example using LSTMs and Penn Tree-bank -# is located -# `here `_ -# -# PyTorch by default has seamless CuDNN integration for ConvNets and -# Recurrent Nets - -loss_fn = nn.MSELoss() - -batch_size = 10 -TIMESTEPS = 5 - -# Create some fake data -batch = torch.randn(batch_size, 50) -hidden = torch.zeros(batch_size, 20) -target = torch.zeros(batch_size, 10) - -loss = 0 -for t in range(TIMESTEPS): - # yes! you can reuse the same network several times, - # sum up the losses, and call backward! - hidden, output = rnn(batch, hidden) - loss += loss_fn(output, target) -loss.backward() diff --git a/beginner_source/former_torchies/nnft_tutorial.rst b/beginner_source/former_torchies/nnft_tutorial.rst new file mode 100644 index 00000000000..db378a7162b --- /dev/null +++ b/beginner_source/former_torchies/nnft_tutorial.rst @@ -0,0 +1,8 @@ +nn Package +=============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/nn_tutorial.html + +.. 
raw:: html + + diff --git a/beginner_source/former_torchies/parallelism_tutorial.py b/beginner_source/former_torchies/parallelism_tutorial.py deleted file mode 100644 index a11d844e1bd..00000000000 --- a/beginner_source/former_torchies/parallelism_tutorial.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Multi-GPU Examples -================== - -Data Parallelism is when we split the mini-batch of samples into -multiple smaller mini-batches and run the computation for each of the -smaller mini-batches in parallel. - -Data Parallelism is implemented using ``torch.nn.DataParallel``. -One can wrap a Module in ``DataParallel`` and it will be parallelized -over multiple GPUs in the batch dimension. - - -DataParallel -------------- -""" -import torch -import torch.nn as nn - - -class DataParallelModel(nn.Module): - - def __init__(self): - super().__init__() - self.block1 = nn.Linear(10, 20) - - # wrap block2 in DataParallel - self.block2 = nn.Linear(20, 20) - self.block2 = nn.DataParallel(self.block2) - - self.block3 = nn.Linear(20, 20) - - def forward(self, x): - x = self.block1(x) - x = self.block2(x) - x = self.block3(x) - return x - -######################################################################## -# The code does not need to be changed in CPU-mode. -# -# The documentation for DataParallel can be found -# `here `_. -# -# **Attributes of the wrapped module** -# -# After wrapping a Module with ``DataParallel``, the attributes of the module -# (e.g. custom methods) became inaccessible. This is because ``DataParallel`` -# defines a few new members, and allowing other attributes might lead to -# clashes in their names. For those who still want to access the attributes, -# a workaround is to use a subclass of ``DataParallel`` as below. 
- -class MyDataParallel(nn.DataParallel): - def __getattr__(self, name): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.module, name) - -######################################################################## -# **Primitives on which DataParallel is implemented upon:** -# -# -# In general, pytorch’s `nn.parallel` primitives can be used independently. -# We have implemented simple MPI-like primitives: -# -# - replicate: replicate a Module on multiple devices -# - scatter: distribute the input in the first-dimension -# - gather: gather and concatenate the input in the first-dimension -# - parallel\_apply: apply a set of already-distributed inputs to a set of -# already-distributed models. -# -# To give a better clarity, here function ``data_parallel`` composed using -# these collectives - - -def data_parallel(module, input, device_ids, output_device=None): - if not device_ids: - return module(input) - - if output_device is None: - output_device = device_ids[0] - - replicas = nn.parallel.replicate(module, device_ids) - inputs = nn.parallel.scatter(input, device_ids) - replicas = replicas[:len(inputs)] - outputs = nn.parallel.parallel_apply(replicas, inputs) - return nn.parallel.gather(outputs, output_device) - -######################################################################## -# Part of the model on CPU and part on the GPU -# -------------------------------------------- -# -# Let’s look at a small example of implementing a network where part of it -# is on the CPU and part on the GPU - -device = torch.device("cuda:0") - -class DistributedModel(nn.Module): - - def __init__(self): - super().__init__( - embedding=nn.Embedding(1000, 10), - rnn=nn.Linear(10, 10).to(device), - ) - - def forward(self, x): - # Compute embedding on CPU - x = self.embedding(x) - - # Transfer to GPU - x = x.to(device) - - # Compute RNN on GPU - x = self.rnn(x) - return x - -######################################################################## 
-# -# This was a small introduction to PyTorch for former Torch users. -# There’s a lot more to learn. -# -# Look at our more comprehensive introductory tutorial which introduces -# the ``optim`` package, data loaders etc.: :doc:`/beginner/deep_learning_60min_blitz`. -# -# Also look at -# -# - :doc:`Train neural nets to play video games ` -# - `Train a state-of-the-art ResNet network on imagenet`_ -# - `Train a face generator using Generative Adversarial Networks`_ -# - `Train a word-level language model using Recurrent LSTM networks`_ -# - `More examples`_ -# - `More tutorials`_ -# - `Discuss PyTorch on the Forums`_ -# - `Chat with other users on Slack`_ -# -# .. _`Deep Learning with PyTorch: a 60-minute blitz`: https://github.com/pytorch/tutorials/blob/main/Deep%20Learning%20with%20PyTorch.ipynb -# .. _Train a state-of-the-art ResNet network on imagenet: https://github.com/pytorch/examples/tree/master/imagenet -# .. _Train a face generator using Generative Adversarial Networks: https://github.com/pytorch/examples/tree/master/dcgan -# .. _Train a word-level language model using Recurrent LSTM networks: https://github.com/pytorch/examples/tree/master/word_language_model -# .. _More examples: https://github.com/pytorch/examples -# .. _More tutorials: https://github.com/pytorch/tutorials -# .. _Discuss PyTorch on the Forums: https://discuss.pytorch.org/ -# .. _Chat with other users on Slack: https://pytorch.slack.com/messages/beginner/ diff --git a/beginner_source/former_torchies/parallelism_tutorial.rst b/beginner_source/former_torchies/parallelism_tutorial.rst new file mode 100644 index 00000000000..04bb1d69e57 --- /dev/null +++ b/beginner_source/former_torchies/parallelism_tutorial.rst @@ -0,0 +1,8 @@ +Multi-GPU Examples +================== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html + +.. 
raw:: html + + diff --git a/beginner_source/former_torchies/tensor_tutorial_old.py b/beginner_source/former_torchies/tensor_tutorial_old.py deleted file mode 100644 index 10a9d81fadb..00000000000 --- a/beginner_source/former_torchies/tensor_tutorial_old.py +++ /dev/null @@ -1,143 +0,0 @@ -""" -Tensors -======= - -Tensors behave almost exactly the same way in PyTorch as they do in -Torch. - -Create a tensor of size (5 x 7) with uninitialized memory: - -""" - -import torch -a = torch.empty(5, 7, dtype=torch.float) - -############################################################### -# Initialize a double tensor randomized with a normal distribution with mean=0, -# var=1: - -a = torch.randn(5, 7, dtype=torch.double) -print(a) -print(a.size()) - -############################################################### -# .. note:: -# ``torch.Size`` is in fact a tuple, so it supports the same operations -# -# Inplace / Out-of-place -# ---------------------- -# -# The first difference is that ALL operations on the tensor that operate -# in-place on it will have an ``_`` postfix. For example, ``add`` is the -# out-of-place version, and ``add_`` is the in-place version. - -a.fill_(3.5) -# a has now been filled with the value 3.5 - -b = a.add(4.0) -# a is still filled with 3.5 -# new tensor b is returned with values 3.5 + 4.0 = 7.5 - -print(a, b) - -############################################################### -# Some operations like ``narrow`` do not have in-place versions, and -# hence, ``.narrow_`` does not exist. Similarly, some operations like -# ``fill_`` do not have an out-of-place version, so ``.fill`` does not -# exist. -# -# Zero Indexing -# ------------- -# -# Another difference is that Tensors are zero-indexed. 
(In lua, tensors are -# one-indexed) - -b = a[0, 3] # select 1st row, 4th column from a - -############################################################### -# Tensors can be also indexed with Python's slicing - -b = a[:, 3:5] # selects all rows, 4th column and 5th column from a - -############################################################### -# No camel casing -# --------------- -# -# The next small difference is that all functions are now NOT camelCase -# anymore. For example ``indexAdd`` is now called ``index_add_`` - - -x = torch.ones(5, 5) -print(x) - -############################################################### -# - -z = torch.empty(5, 2) -z[:, 0] = 10 -z[:, 1] = 100 -print(z) - -############################################################### -# -x.index_add_(1, torch.tensor([4, 0], dtype=torch.long), z) -print(x) - -############################################################### -# Numpy Bridge -# ------------ -# -# Converting a torch Tensor to a numpy array and vice versa is a breeze. -# The torch Tensor and numpy array will share their underlying memory -# locations, and changing one will change the other. -# -# Converting torch Tensor to numpy Array -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -a = torch.ones(5) -print(a) - -############################################################### -# - -b = a.numpy() -print(b) - -############################################################### -# -a.add_(1) -print(a) -print(b) # see how the numpy array changed in value - - -############################################################### -# Converting numpy Array to torch Tensor -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -import numpy as np -a = np.ones(5) -b = torch.from_numpy(a) -np.add(a, 1, out=a) -print(a) -print(b) # see how changing the np array changed the torch Tensor automatically - -############################################################### -# All the Tensors on the CPU except a CharTensor support converting to -# NumPy and back. 
-# -# CUDA Tensors -# ------------ -# -# CUDA Tensors are nice and easy in pytorch, and transfering a CUDA tensor -# from the CPU to GPU will retain its underlying type. - -# let us run this cell only if CUDA is available -if torch.cuda.is_available(): - - # creates a LongTensor and transfers it - # to GPU as torch.cuda.LongTensor - a = torch.full((10,), 3, device=torch.device("cuda")) - print(type(a)) - b = a.to(torch.device("cpu")) - # transfers it to CPU, back to - # being a torch.LongTensor diff --git a/beginner_source/former_torchies/tensor_tutorial_old.rst b/beginner_source/former_torchies/tensor_tutorial_old.rst new file mode 100644 index 00000000000..939a6855c27 --- /dev/null +++ b/beginner_source/former_torchies/tensor_tutorial_old.rst @@ -0,0 +1,8 @@ +Tensors +============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html + +.. raw:: html + + diff --git a/beginner_source/former_torchies_tutorial.rst b/beginner_source/former_torchies_tutorial.rst index e6ae59b7082..5071a62e73c 100644 --- a/beginner_source/former_torchies_tutorial.rst +++ b/beginner_source/former_torchies_tutorial.rst @@ -1,37 +1,9 @@ PyTorch for Former Torch Users ------------------------------- -**Author**: `Soumith Chintala `_ +============================== +This tutorial is out of date. Please check out the PyTorch tutorials here: https://pytorch.org/tutorials/ -In this tutorial, you will learn the following: - -1. Using torch Tensors, and important difference against (Lua)Torch -2. Using the autograd package -3. Building neural networks - - - Building a ConvNet - - Building a Recurrent Net - -4. Use multiple GPUs - - -.. toctree:: - :hidden: - - /beginner/former_torchies/tensor_tutorial_old - /beginner/former_torchies/autograd_tutorial_old - /beginner/former_torchies/nnft_tutorial - /beginner/former_torchies/parallelism_tutorial - -.. 
galleryitem:: /beginner/former_torchies/tensor_tutorial_old.py - :figure: /_static/img/tensor_illustration_flat.png - -.. galleryitem:: /beginner/former_torchies/autograd_tutorial_old.py - -.. galleryitem:: /beginner/former_torchies/nnft_tutorial.py - :figure: /_static/img/torch-nn-vs-pytorch-nn.png - -.. galleryitem:: /beginner/former_torchies/parallelism_tutorial.py +You will be redirected in 3 seconds. .. raw:: html -
    + diff --git a/beginner_source/hta_intro_tutorial.rst b/beginner_source/hta_intro_tutorial.rst new file mode 100644 index 00000000000..dc7c8cedf9e --- /dev/null +++ b/beginner_source/hta_intro_tutorial.rst @@ -0,0 +1,390 @@ +Introduction to Holistic Trace Analysis +======================================= + +**Author:** `Anupam Bhatnagar `_ + +In this tutorial, we demonstrate how to use Holistic Trace Analysis (HTA) to +analyze traces from a distributed training job. To get started follow the steps +below. + +Installing HTA +~~~~~~~~~~~~~~ + +We recommend using a Conda environment to install HTA. To install Anaconda, see +`the official Anaconda documentation `_. + +1. Install HTA using pip: + + .. code-block:: python + + pip install HolisticTraceAnalysis + +2. (Optional and recommended) Set up a Conda environment: + + .. code-block:: python + + # create the environment env_name + conda create -n env_name + + # activate the environment + conda activate env_name + + # When you are done, deactivate the environment by running ``conda deactivate`` + +Getting Started +~~~~~~~~~~~~~~~ + +Launch a Jupyter notebook and set the ``trace_dir`` variable to the location of the traces. + +.. code-block:: python + + from hta.trace_analysis import TraceAnalysis + trace_dir = "/path/to/folder/with/traces" + analyzer = TraceAnalysis(trace_dir=trace_dir) + + +Temporal Breakdown +------------------ + +To effectively utilize the GPUs, it is crucial to understand how they are spending +time for a specific job. Are they primarily engaged in computation, communication, +memory events, or are they idle? The temporal breakdown feature provides a detailed +analysis of the time spent in these three categories. + +* Idle time - GPU is idle. +* Compute time - GPU is being used for matrix multiplications or vector operations. +* Non-compute time - GPU is being used for communication or memory events. 
+ +To achieve high training efficiency, the code should maximize compute time and +minimize idle time and non-compute time. The following function generates a +dataframe that provides a detailed breakdown of the temporal usage for each rank. + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + time_spent_df = analyzer.get_temporal_breakdown() + + +.. image:: ../_static/img/hta/temporal_breakdown_df.png + +When the ``visualize`` argument is set to ``True`` in the `get_temporal_breakdown +`_ +function it also generates a bar graph representing the breakdown by rank. + +.. image:: ../_static/img/hta/temporal_breakdown_plot.png + + +Idle Time Breakdown +------------------- + +Gaining insight into the amount of time the GPU spends idle and the +reasons behind it can help guide optimization strategies. A GPU is +considered idle when no kernel is running on it. We have developed an +algorithm to categorize the `Idle` time into three distinct categories: + +* **Host wait:** refers to the idle time on the GPU that is caused by + the CPU not enqueuing kernels quickly enough to keep the GPU fully utilized. + These types of inefficiencies can be addressed by examining the CPU + operators that are contributing to the slowdown, increasing the batch + size and applying operator fusion. + +* **Kernel wait:** This refers to brief overhead associated with launching + consecutive kernels on the GPU. The idle time attributed to this category + can be minimized by using CUDA Graph optimizations. + +* **Other wait:** This category includes idle time that cannot currently + be attributed due to insufficient information. The likely causes include + synchronization among CUDA streams using CUDA events and delays in launching + kernels. + +The host wait time can be interpreted as the time when the GPU is stalling due +to the CPU. 
To attribute the idle time as kernel wait we use the following +heuristic: + + | **gap between consecutive kernels < threshold** + +The default threshold value is 30 nanoseconds and can be configured using the +``consecutive_kernel_delay`` argument. By default, the idle time breakdown is +computed for rank 0 only. In order to calculate the breakdown for other ranks, +use the ``ranks`` argument in the `get_idle_time_breakdown +`_ +function. The idle time breakdown can be generated as follows: + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + idle_time_df = analyzer.get_idle_time_breakdown() + +.. image:: ../_static/img/hta/idle_time_breakdown_percentage.png + +The function returns a tuple of dataframes. The first dataframe contains the +idle time by category on each stream for each rank. + +.. image:: ../_static/img/hta/idle_time.png + :scale: 100% + :align: center + +The second dataframe is generated when ``show_idle_interval_stats`` is set to +``True``. It contains the summary statistics of the idle time for each stream +on each rank. + +.. image:: ../_static/img/hta/idle_time_summary.png + :scale: 100% + +.. tip:: + + By default, the idle time breakdown presents the percentage of each of the + idle time categories. When the ``visualize_pctg`` argument is set to ``False``, + the function renders the absolute time on the y-axis instead. + + +Kernel Breakdown +---------------- + +The kernel breakdown feature breaks down the time spent for each kernel type, +such as communication (COMM), computation (COMP), and memory (MEM), across all +ranks and presents the proportion of time spent in each category. Here is the +percentage of time spent in each category as a pie chart: + +.. image:: ../_static/img/hta/kernel_type_breakdown.png + :align: center + +The kernel breakdown can be calculated as follows: + +.. 
code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown() + +The first dataframe returned by the function contains the raw values used to +generate the pie chart. + +Kernel Duration Distribution +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The second dataframe returned by `get_gpu_kernel_breakdown +`_ +contains duration summary statistics for each kernel. In particular, this +includes the count, min, max, average, standard deviation, sum, and kernel type +for each kernel on each rank. + +.. image:: ../_static/img/hta/kernel_metrics_df.png + :align: center + +Using this data HTA creates many visualizations to identify performance +bottlenecks. + +#. Pie charts of the top kernels for each kernel type for each rank. + +#. Bar graphs of the average duration across all ranks for each of the top + kernels and for each kernel type. + +.. image:: ../_static/img/hta/pie_charts.png + +.. tip:: + + All images are generated using plotly. Hovering on the graph shows the + mode bar on the top right which allows the user to zoom, pan, select, and + download the graph. + +The pie charts above show the top 5 computation, communication, and memory +kernels. Similar pie charts are generated for each rank. The pie charts can be +configured to show the top k kernels using the ``num_kernels`` argument passed +to the `get_gpu_kernel_breakdown` function. Additionally, the +``duration_ratio`` argument can be used to tune the percentage of time that +needs to be analyzed. If both ``num_kernels`` and ``duration_ratio`` are +specified, then ``num_kernels`` takes precedence. + +.. image:: ../_static/img/hta/comm_across_ranks.png + +The bar graph above shows the average duration of the NCCL AllReduce kernel +across all the ranks. The black lines indicate the minimum and maximum time +taken on each rank. + +.. 
warning:: + When using jupyter-lab set the "image_renderer" argument value to + "jupyterlab" otherwise the graphs will not render in the notebook. + +For a detailed walkthrough of this feature see the `gpu_kernel_breakdown +notebook +`_ +in the examples folder of the repo. + + +Communication Computation Overlap +--------------------------------- + +In distributed training, a significant amount of time is spent in communication +and synchronization events between GPUs. To achieve high GPU efficiency (such as +TFLOPS/GPU), it is crucial to keep the GPU oversubscribed with computation +kernels. In other words, the GPU should not be blocked due to unresolved data +dependencies. One way to measure the extent to which computation is blocked by +data dependencies is to calculate the communication computation overlap. Higher +GPU efficiency is observed if communication events overlap computation events. +Lack of communication and computation overlap will lead to the GPU being idle, +resulting in low efficiency. +To sum up, a higher communication computation overlap is desirable. To calculate +the overlap percentage for each rank, we measure the following ratio: + + | **(time spent in computation while communicating) / (time spent in communication)** + +The communication computation overlap can be calculated as follows: + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + overlap_df = analyzer.get_comm_comp_overlap() + +The function returns a dataframe containing the overlap percentage +for each rank. + +.. image:: ../_static/img/hta/overlap_df.png + :align: center + :scale: 50% + +When the ``visualize`` argument is set to True, the `get_comm_comp_overlap +`_ +function also generates a bar graph representing the overlap by rank. + +.. 
image:: ../_static/img/hta/overlap_plot.png + + +Augmented Counters +------------------ + +Memory Bandwidth & Queue Length Counters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Memory bandwidth counters measure the memory copy bandwidth used while copying +the data from H2D, D2H and D2D by memory copy (memcpy) and memory set (memset) +events. HTA also computes the number of outstanding operations on each CUDA +stream. We refer to this as **queue length**. When the queue length on a stream +is 1024 or larger, new events cannot be scheduled on that stream and the CPU +will stall until the events on the GPU stream have been processed. + +The `generate_trace_with_counters +`_ +API outputs a new trace file with the memory bandwidth and queue length +counters. The new trace file contains tracks which indicate the memory +bandwidth used by memcpy/memset operations and tracks for the queue length on +each stream. By default, these counters are generated using the rank 0 +trace file, and the new file contains the suffix ``_with_counters`` in its name. +Users have the option to generate the counters for multiple ranks by using the +``ranks`` argument in the ``generate_trace_with_counters`` API. + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer.generate_trace_with_counters() + +A screenshot of the generated trace file with augmented counters. + +.. image:: ../_static/img/hta/mem_bandwidth_queue_length.png + :scale: 100% + +HTA also provides a summary of the memory copy bandwidth and queue length +counters as well as the time series of the counters for the profiled portion of +the code using the following API: + +* `get_memory_bw_summary `_ + +* `get_queue_length_summary `_ + +* `get_memory_bw_time_series `_ + +* `get_queue_length_time_series `_ + +To view the summary and time series, use: + +.. 
code-block:: python + + # generate summary + mem_bw_summary = analyzer.get_memory_bw_summary() + queue_len_summary = analyzer.get_queue_length_summary() + + # get time series + mem_bw_series = analyzer.get_memory_bw_time_series() + queue_len_series = analyzer.get_queue_length_time_series() + +The summary contains the count, min, max, mean, standard deviation, 25th, 50th, +and 75th percentile. + +.. image:: ../_static/img/hta/queue_length_summary.png + :scale: 100% + :align: center + +The time series only contains the points when a value changes. Once a value is +observed the time series stays constant until the next update. The memory +bandwidth and queue length time series functions return a dictionary whose key +is the rank and the value is the time series for that rank. By default, the +time series is computed for rank 0 only. + +CUDA Kernel Launch Statistics +----------------------------- + +.. image:: ../_static/img/hta/cuda_kernel_launch.png + +For each event launched on the GPU, there is a corresponding scheduling event on +the CPU, such as ``CudaLaunchKernel``, ``CudaMemcpyAsync``, ``CudaMemsetAsync``. +These events are linked by a common correlation ID in the trace - see the figure +above. This feature computes the duration of the CPU runtime event, its corresponding GPU +kernel and the launch delay, that is, the difference between GPU kernel starting and +CPU operator ending. The kernel launch info can be generated as follows: + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir="/path/to/trace/dir") + kernel_info_df = analyzer.get_cuda_kernel_launch_stats() + +A screenshot of the generated dataframe is given below. + +.. image:: ../_static/img/hta/cuda_kernel_launch_stats.png + :scale: 100% + :align: center + +The duration of the CPU op, GPU kernel, and the launch delay allow us to find +the following: + +* **Short GPU kernels** - GPU kernels with duration less than the corresponding + CPU runtime event. 
+ +* **Runtime event outliers** - CPU runtime events with excessive duration. + +* **Launch delay outliers** - GPU kernels which take too long to be scheduled. + +HTA generates distribution plots for each of the aforementioned three categories. + +**Short GPU kernels** + +Typically, the launch time on the CPU side ranges from 5-20 microseconds. In some +cases, the GPU execution time is lower than the launch time itself. The graph +below helps us to find how frequently such instances occur in the code. + +.. image:: ../_static/img/hta/short_gpu_kernels.png + + +**Runtime event outliers** + +The runtime outliers depend on the cutoff used to classify the outliers, hence +the `get_cuda_kernel_launch_stats +`_ +API provides the ``runtime_cutoff`` argument to configure the value. + +.. image:: ../_static/img/hta/runtime_outliers.png + +**Launch delay outliers** + +The launch delay outliers depend on the cutoff used to classify the outliers, +hence the ``get_cuda_kernel_launch_stats`` API provides the +``launch_delay_cutoff`` argument to configure the value. + +.. image:: ../_static/img/hta/launch_delay_outliers.png + + +Conclusion +~~~~~~~~~~ + +In this tutorial, you have learned how to install and use HTA, +a performance tool that enables you to analyze bottlenecks in your distributed +training workflows. To learn how you can use the HTA tool to perform trace +diff analysis, see `Trace Diff using Holistic Trace Analysis `__. diff --git a/beginner_source/hta_trace_diff_tutorial.rst b/beginner_source/hta_trace_diff_tutorial.rst new file mode 100644 index 00000000000..608d29ea358 --- /dev/null +++ b/beginner_source/hta_trace_diff_tutorial.rst @@ -0,0 +1,66 @@ +Trace Diff using Holistic Trace Analysis +======================================== + +**Author:** `Anupam Bhatnagar `_ + +Occasionally, users need to identify the changes in PyTorch operators and CUDA +kernels resulting from a code change. To support this requirement, HTA +provides a trace comparison feature. 
This feature allows the user to input two +sets of trace files where the first can be thought of as the *control group* +and the second as the *test group*, similar to an A/B test. The ``TraceDiff`` class +provides functions to compare the differences between traces and functionality +to visualize these differences. In particular, users can find operators and +kernels that were added and removed from each group, along with the frequency +of each operator/kernel and the cumulative time taken by the operator/kernel. + +The `TraceDiff `_ class +has the following methods: + +* `compare_traces `_: + Compare the frequency and total duration of CPU operators and GPU kernels from + two sets of traces. + +* `ops_diff `_: + Get the operators and kernels which have been: + + #. **added** to the test trace and are absent in the control trace + #. **deleted** from the test trace and are present in the control trace + #. **increased** in frequency in the test trace and exist in the control trace + #. **decreased** in frequency in the test trace and exist in the control trace + #. **unchanged** between the two sets of traces + +* `visualize_counts_diff `_ + +* `visualize_duration_diff `_ + +The last two methods can be used to visualize various changes in frequency and +duration of CPU operators and GPU kernels, using the output of the +``compare_traces`` method. + +For example, the top ten operators with increase in frequency can be computed as +follows: + +.. code-block:: python + + df = compare_traces_output.sort_values(by="diff_counts", ascending=False).head(10) + TraceDiff.visualize_counts_diff(df) + +.. image:: ../_static/img/hta/counts_diff.png + +Similarly, the top ten operators with the largest change in duration can be computed as +follows: + +.. 
code-block:: python + + df = compare_traces_output.sort_values(by="diff_duration", ascending=False) + # The duration difference can be overshadowed by the "ProfilerStep", + # so we can filter it out to show the trend of other operators. + df = df.loc[~df.index.str.startswith("ProfilerStep")].head(10) + TraceDiff.visualize_duration_diff(df) + +.. image:: ../_static/img/hta/duration_diff.png + +For a detailed example of this feature see the `trace_diff_demo notebook +`_ +in the examples folder of the repository. + diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index 228879fa5f2..aa84069f7be 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -10,7 +10,7 @@ Fortunately, there are tools that help with finding the best combination of parameters. `Ray Tune `_ is an industry standard tool for distributed hyperparameter tuning. Ray Tune includes the latest hyperparameter search -algorithms, integrates with TensorBoard and other analysis libraries, and natively +algorithms, integrates with various analysis libraries, and natively supports distributed training through `Ray's distributed machine learning engine `_. @@ -41,6 +41,8 @@ """ from functools import partial import os +import tempfile +from pathlib import Path import torch import torch.nn as nn import torch.nn.functional as F @@ -48,12 +50,22 @@ from torch.utils.data import random_split import torchvision import torchvision.transforms as transforms +# sphinx_gallery_start_ignore +# Fixes ``AttributeError: '_LoggingTee' object has no attribute 'fileno'``. +# This is only needed to run with sphinx-build. 
+import sys +if not hasattr(sys.stdout, "encoding"): + sys.stdout.encoding = "latin1" + sys.stdout.fileno = lambda: 0 +# sphinx_gallery_end_ignore from ray import tune -from ray.air import Checkpoint, session +from ray import train +from ray.train import Checkpoint, get_checkpoint from ray.tune.schedulers import ASHAScheduler +import ray.cloudpickle as pickle ###################################################################### -# Most of the imports are needed for building the PyTorch model. Only the last three +# Most of the imports are needed for building the PyTorch model. Only the last # imports are for Ray Tune. # # Data loaders @@ -124,13 +136,15 @@ def forward(self, x): # # net = Net(config["l1"], config["l2"]) # -# checkpoint = session.get_checkpoint() -# +# checkpoint = get_checkpoint() # if checkpoint: -# checkpoint_state = checkpoint.to_dict() -# start_epoch = checkpoint_state["epoch"] -# net.load_state_dict(checkpoint_state["net_state_dict"]) -# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) +# with checkpoint.as_directory() as checkpoint_dir: +# data_path = Path(checkpoint_dir) / "data.pkl" +# with open(data_path, "rb") as fp: +# checkpoint_state = pickle.load(fp) +# start_epoch = checkpoint_state["epoch"] +# net.load_state_dict(checkpoint_state["net_state_dict"]) +# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) # else: # start_epoch = 0 # @@ -186,12 +200,16 @@ def forward(self, x): # "net_state_dict": net.state_dict(), # "optimizer_state_dict": optimizer.state_dict(), # } -# checkpoint = Checkpoint.from_dict(checkpoint_data) +# with tempfile.TemporaryDirectory() as checkpoint_dir: +# data_path = Path(checkpoint_dir) / "data.pkl" +# with open(data_path, "wb") as fp: +# pickle.dump(checkpoint_data, fp) # -# session.report( -# {"loss": val_loss / val_steps, "accuracy": correct / total}, -# checkpoint=checkpoint, -# ) +# checkpoint = Checkpoint.from_directory(checkpoint_dir) +# train.report( +# {"loss": 
val_loss / val_steps, "accuracy": correct / total}, +# checkpoint=checkpoint, +# ) # # Here we first save a checkpoint and then report some metrics back to Ray Tune. Specifically, # we send the validation loss and accuracy back to Ray Tune. Ray Tune can then use these metrics @@ -225,13 +243,15 @@ def train_cifar(config, data_dir=None): criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) - checkpoint = session.get_checkpoint() - + checkpoint = get_checkpoint() if checkpoint: - checkpoint_state = checkpoint.to_dict() - start_epoch = checkpoint_state["epoch"] - net.load_state_dict(checkpoint_state["net_state_dict"]) - optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) + with checkpoint.as_directory() as checkpoint_dir: + data_path = Path(checkpoint_dir) / "data.pkl" + with open(data_path, "rb") as fp: + checkpoint_state = pickle.load(fp) + start_epoch = checkpoint_state["epoch"] + net.load_state_dict(checkpoint_state["net_state_dict"]) + optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) else: start_epoch = 0 @@ -300,12 +320,17 @@ def train_cifar(config, data_dir=None): "net_state_dict": net.state_dict(), "optimizer_state_dict": optimizer.state_dict(), } - checkpoint = Checkpoint.from_dict(checkpoint_data) - - session.report( - {"loss": val_loss / val_steps, "accuracy": correct / total}, - checkpoint=checkpoint, - ) + with tempfile.TemporaryDirectory() as checkpoint_dir: + data_path = Path(checkpoint_dir) / "data.pkl" + with open(data_path, "wb") as fp: + pickle.dump(checkpoint_data, fp) + + checkpoint = Checkpoint.from_directory(checkpoint_dir) + train.report( + {"loss": val_loss / val_steps, "accuracy": correct / total}, + checkpoint=checkpoint, + ) + print("Finished Training") @@ -438,23 +463,18 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) - best_checkpoint = 
best_trial.checkpoint.to_air_checkpoint() - best_checkpoint_data = best_checkpoint.to_dict() + best_checkpoint = result.get_best_checkpoint(trial=best_trial, metric="accuracy", mode="max") + with best_checkpoint.as_directory() as checkpoint_dir: + data_path = Path(checkpoint_dir) / "data.pkl" + with open(data_path, "rb") as fp: + best_checkpoint_data = pickle.load(fp) - best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"]) - - test_acc = test_accuracy(best_trained_model, device) - print("Best trial test set accuracy: {}".format(test_acc)) + best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"]) + test_acc = test_accuracy(best_trained_model, device) + print("Best trial test set accuracy: {}".format(test_acc)) if __name__ == "__main__": - # sphinx_gallery_start_ignore - # Fixes ``AttributeError: '_LoggingTee' object has no attribute 'fileno'``. - # This is only needed to run with sphinx-build. - import sys - - sys.stdout.fileno = lambda: False - # sphinx_gallery_end_ignore # You can change the number of GPUs per trial here: main(num_samples=10, max_num_epochs=10, gpus_per_trial=0) @@ -462,7 +482,7 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): ###################################################################### # If you run the code, an example output could look like this: # -# :: +# .. 
code-block:: sh # # Number of trials: 10/10 (10 TERMINATED) # +-----+--------------+------+------+-------------+--------+---------+------------+ diff --git a/beginner_source/introyt.rst b/beginner_source/introyt.rst index 841cc3cd28d..9b2a630c245 100644 --- a/beginner_source/introyt.rst +++ b/beginner_source/introyt.rst @@ -1,29 +1,10 @@ -`Introduction `_ || -`Tensors `_ || -`Autograd `_ || -`Building Models `_ || -`TensorBoard Support `_ || -`Training Models `_ || -`Model Understanding `_ - Introduction to PyTorch - YouTube Series ======================================== -Authors: -`Brad Heintz `_ - -This tutorial follows along with the `PyTorch Beginner Series `_ on YouTube. - -`This tutorial assumes a basic familiarity with Python and Deep Learning concepts.` - -Running the Tutorial Code -------------------------- -You can run this tutorial in a couple of ways: +This page has been moved. -- **In the cloud**: This is the easiest way to get started! Each section has a Colab link at the top, which opens a notebook with the code in a fully-hosted environment. Pro tip: Use Colab with a GPU runtime to speed up operations *Runtime > Change runtime type > GPU* -- **Locally**: This option requires you to setup PyTorch and torchvision first on your local machine (`installation instructions `_). Download the notebook or copy the code into your favorite IDE. +Redirecting now... -.. include:: /beginner_source/introyt/tocyt.txt +.. raw:: html -.. toctree:: - :hidden: + diff --git a/beginner_source/introyt/README.txt b/beginner_source/introyt/README.txt index ebe8f2e9c21..b90d269cfab 100644 --- a/beginner_source/introyt/README.txt +++ b/beginner_source/introyt/README.txt @@ -1,7 +1,7 @@ Introduction to PyTorch on YouTube ---------------------------------- -1. introyt.rst +1. 
introyt.py Introduction to PyTorch - Youtube Series https://pytorch.org/tutorials/beginner/introyt/introyt.html diff --git a/beginner_source/introyt/autogradyt_tutorial.py b/beginner_source/introyt/autogradyt_tutorial.py index a2ed238e52b..abf75a7d266 100644 --- a/beginner_source/introyt/autogradyt_tutorial.py +++ b/beginner_source/introyt/autogradyt_tutorial.py @@ -213,7 +213,7 @@ ######################################################################### # Recall the computation steps we took to get here: # -# :: +# .. code-block:: python # # a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) # b = torch.sin(a) @@ -250,9 +250,9 @@ class TinyModel(torch.nn.Module): def __init__(self): super(TinyModel, self).__init__() - self.layer1 = torch.nn.Linear(1000, 100) + self.layer1 = torch.nn.Linear(DIM_IN, HIDDEN_SIZE) self.relu = torch.nn.ReLU() - self.layer2 = torch.nn.Linear(100, 10) + self.layer2 = torch.nn.Linear(HIDDEN_SIZE, DIM_OUT) def forward(self, x): x = self.layer1(x) @@ -456,10 +456,10 @@ def add_tensors2(x, y): # .. note:: # The following code cell throws a runtime error. This is expected. # -# :: +# .. code-block:: python # -# a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) -# torch.sin_(a) +# a = torch.linspace(0., 2. * math.pi, steps=25, requires_grad=True) +# torch.sin_(a) # ######################################################################### diff --git a/beginner_source/introyt/captumyt.py b/beginner_source/introyt/captumyt.py index cf63b6109b6..abf2391d254 100644 --- a/beginner_source/introyt/captumyt.py +++ b/beginner_source/introyt/captumyt.py @@ -109,11 +109,15 @@ To install Captum in an Anaconda or pip virtual environment, use the appropriate command for your environment below: -With ``conda``:: +With ``conda``: + +.. code-block:: sh conda install pytorch torchvision captum flask-compress matplotlib=3.3.4 -c pytorch -With ``pip``:: +With ``pip``: + +.. 
code-block:: sh pip install torch torchvision captum matplotlib==3.3.4 Flask-Compress diff --git a/beginner_source/introyt/introyt1_tutorial.py b/beginner_source/introyt/introyt1_tutorial.py index a5d65bcab16..74675070708 100644 --- a/beginner_source/introyt/introyt1_tutorial.py +++ b/beginner_source/introyt/introyt1_tutorial.py @@ -580,7 +580,7 @@ def forward(self, x): # # **When you run the cell above,** you should see something like this: # -# :: +# .. code-block:: sh # # [1, 2000] loss: 2.235 # [1, 4000] loss: 1.940 diff --git a/beginner_source/introyt/introyt_index.py b/beginner_source/introyt/introyt_index.py new file mode 100644 index 00000000000..63b9c053a3c --- /dev/null +++ b/beginner_source/introyt/introyt_index.py @@ -0,0 +1,38 @@ +""" +`Introduction `_ || +`Tensors `_ || +`Autograd `_ || +`Building Models `_ || +`TensorBoard Support `_ || +`Training Models `_ || +`Model Understanding `_ + +Introduction to PyTorch - YouTube Series +======================================== + +Authors: +`Brad Heintz `_ + +This tutorial follows along with the `PyTorch Beginner Series `_ on YouTube. + +`This tutorial assumes a basic familiarity with Python and Deep Learning concepts.` + +Running the Tutorial Code +------------------------- +You can run this tutorial in a couple of ways: + +- **On the cloud**: This is the easiest way to get started! Each section has a Colab link at the top, which opens a notebook with the code in a fully-hosted environment. Pro tip: Use Colab with a GPU runtime to speed up operations *Runtime > Change runtime type > GPU* +- **Locally**: This option requires you to set up PyTorch and torchvision on your local machine (`installation instructions `_). Download the notebook or copy the code into your favorite IDE. + +.. 
toctree:: + :maxdepth: 2 + :hidden: + + introyt1_tutorial + tensors_deeper_tutorial + autogradyt_tutorial + modelsyt_tutorial + tensorboardyt_tutorial + trainingyt + captumyt +""" diff --git a/beginner_source/introyt/modelsyt_tutorial.py b/beginner_source/introyt/modelsyt_tutorial.py index 884fcbdb105..61c27d5c543 100644 --- a/beginner_source/introyt/modelsyt_tutorial.py +++ b/beginner_source/introyt/modelsyt_tutorial.py @@ -311,9 +311,7 @@ def forward(self, sentence): # ``TransformerDecoder``) and subcomponents (``TransformerEncoderLayer``, # ``TransformerDecoderLayer``). For details, check out the # `documentation `__ -# on transformer classes, and the relevant -# `tutorial `__ -# on pytorch.org. +# on transformer classes. # # Other Layers and Functions # -------------------------- @@ -342,7 +340,7 @@ def forward(self, sentence): # the 6x6 input. # # **Normalization layers** re-center and normalize the output of one layer -# before feeding it to another. Centering the and scaling the intermediate +# before feeding it to another. Centering and scaling the intermediate # tensors has a number of beneficial effects, such as letting you use # higher learning rates without exploding/vanishing gradients. # diff --git a/beginner_source/introyt/tensorboardyt_tutorial.py b/beginner_source/introyt/tensorboardyt_tutorial.py index 29e83066726..49d321bd6df 100644 --- a/beginner_source/introyt/tensorboardyt_tutorial.py +++ b/beginner_source/introyt/tensorboardyt_tutorial.py @@ -24,12 +24,16 @@ To run this tutorial, you’ll need to install PyTorch, TorchVision, Matplotlib, and TensorBoard. -With ``conda``:: +With ``conda``: + +.. code-block:: sh conda install pytorch torchvision -c pytorch conda install matplotlib tensorboard -With ``pip``:: +With ``pip``: + +.. 
code-block:: sh pip install torch torchvision matplotlib tensorboard @@ -214,13 +218,14 @@ def forward(self, x): # Check against the validation set running_vloss = 0.0 - net.train(False) # Don't need to track gradents for validation + # In evaluation mode some model specific operations can be omitted eg. dropout layer + net.train(False) # Switching to evaluation mode, eg. turning off regularisation for j, vdata in enumerate(validation_loader, 0): vinputs, vlabels = vdata voutputs = net(vinputs) vloss = criterion(voutputs, vlabels) running_vloss += vloss.item() - net.train(True) # Turn gradients back on for training + net.train(True) # Switching back to training mode, eg. turning on regularisation avg_loss = running_loss / 1000 avg_vloss = running_vloss / len(validation_loader) @@ -244,7 +249,7 @@ def forward(self, x): # # TensorBoard can also be used to examine the data flow within your model. # To do this, call the ``add_graph()`` method with a model and sample -# input. When you open +# input: # # Again, grab a single mini-batch of images diff --git a/beginner_source/introyt/tensors_deeper_tutorial.py b/beginner_source/introyt/tensors_deeper_tutorial.py index 8b2c1630aff..d7293dfe295 100644 --- a/beginner_source/introyt/tensors_deeper_tutorial.py +++ b/beginner_source/introyt/tensors_deeper_tutorial.py @@ -228,18 +228,7 @@ # integer with the ``.to()`` method. Note that ``c`` contains all the same # values as ``b``, but truncated to integers. # -# Available data types include: -# -# - ``torch.bool`` -# - ``torch.int8`` -# - ``torch.uint8`` -# - ``torch.int16`` -# - ``torch.int32`` -# - ``torch.int64`` -# - ``torch.half`` -# - ``torch.float`` -# - ``torch.double`` -# - ``torch.bfloat`` +# For more information, see the `data types documentation `__. # # Math & Logic with PyTorch Tensors # --------------------------------- @@ -292,14 +281,14 @@ # binary operation on tensors if dissimilar shape? # # .. note:: -# The following cell throws a run-time error. 
This is intentional. +# The following cell throws a run-time error. This is intentional. # -# :: +# .. code-block:: sh # -# a = torch.rand(2, 3) -# b = torch.rand(3, 2) +# a = torch.rand(2, 3) +# b = torch.rand(3, 2) # -# print(a * b) +# print(a * b) # @@ -378,7 +367,7 @@ # # - The multiplication operation that created ``b`` was # broadcast over every “layer” of ``a``. -# - For ``c``, the operation was broadcast over ever layer and row of +# - For ``c``, the operation was broadcast over every layer and row of # ``a`` - every 3-element column is identical. # - For ``d``, we switched it around - now every *row* is identical, # across layers and columns. @@ -390,17 +379,17 @@ # Here are some examples of attempts at broadcasting that will fail: # # .. note:: -# The following cell throws a run-time error. This is intentional. +# The following cell throws a run-time error. This is intentional. # -# :: +# .. code-block:: python # -# a = torch.ones(4, 3, 2) +# a = torch.ones(4, 3, 2) # -# b = a * torch.rand(4, 3) # dimensions must match last-to-first +# b = a * torch.rand(4, 3) # dimensions must match last-to-first # -# c = a * torch.rand( 2, 3) # both 3rd & 2nd dims different +# c = a * torch.rand( 2, 3) # both 3rd & 2nd dims different # -# d = a * torch.rand((0, )) # can't broadcast with an empty tensor +# d = a * torch.rand((0, )) # can't broadcast with an empty tensor # @@ -459,17 +448,19 @@ m2 = torch.tensor([[3., 0.], [0., 3.]]) # three times identity matrix print('\nVectors & Matrices:') -print(torch.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1) +print(torch.linalg.cross(v2, v1)) # negative of z unit vector (v1 x v2 == -v2 x v1) print(m1) -m3 = torch.matmul(m1, m2) +m3 = torch.linalg.matmul(m1, m2) print(m3) # 3 times m1 -print(torch.svd(m3)) # singular value decomposition +print(torch.linalg.svd(m3)) # singular value decomposition ################################################################################## # This is a small sample of 
operations. For more details and the full inventory of # math functions, have a look at the # `documentation `__. +# For more details and the full inventory of linear algebra operations, have a +# look at this `documentation `__. # # Altering Tensors in Place # ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -729,7 +720,7 @@ # following code will throw a runtime error, regardless of whether you # have a GPU device available: # -# :: +# .. code-block:: python # # x = torch.rand(2, 2) # y = torch.rand(2, 2, device='gpu') @@ -820,9 +811,9 @@ # Another place you might use ``unsqueeze()`` is to ease broadcasting. # Recall the example above where we had the following code: # -# :: +# .. code-block:: python # -# a = torch.ones(4, 3, 2) +# a = torch.ones(4, 3, 2) # # c = a * torch.rand( 3, 1) # 3rd dim = 1, 2nd dim identical to a # print(c) diff --git a/beginner_source/introyt/tocyt.txt b/beginner_source/introyt/tocyt.txt index f956671c11b..24b47e48913 100644 --- a/beginner_source/introyt/tocyt.txt +++ b/beginner_source/introyt/tocyt.txt @@ -5,4 +5,3 @@ 5. `PyTorch TensorBoard Support `_ 6. `Training with PyTorch `_ 7. `Model Understanding with Captum `_ -8. `Production Inference Deployment with PyTorch `_ (video only) diff --git a/beginner_source/knowledge_distillation_tutorial.py b/beginner_source/knowledge_distillation_tutorial.py index 304ac661d4e..49ab9a134dc 100644 --- a/beginner_source/knowledge_distillation_tutorial.py +++ b/beginner_source/knowledge_distillation_tutorial.py @@ -25,7 +25,7 @@ # - How to improve the performance of lightweight models by using more complex models as teachers # # Prerequisites -# ~~~~~~~~~~~ +# ~~~~~~~~~~~~~ # # * 1 GPU, 4GB of memory # * PyTorch v2.0 or later @@ -324,7 +324,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1) # Calculate the soft targets loss. 
Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network" - soft_targets_loss = -torch.sum(soft_targets * soft_prob) / soft_prob.size()[0] * (T**2) + soft_targets_loss = torch.sum(soft_targets * (soft_targets.log() - soft_prob)) / soft_prob.size()[0] * (T**2) # Calculate the true label loss label_loss = ce_loss(student_logits, labels) @@ -352,7 +352,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin # Cosine loss minimization run # ---------------------------- # Feel free to play around with the temperature parameter that controls the softness of the softmax function and the loss coefficients. -# In neural networks, it is easy to include to include additional loss functions to the main objectives to achieve goals like better generalization. +# In neural networks, it is easy to include additional loss functions to the main objectives to achieve goals like better generalization. # Let's try including an objective for the student, but now let's focus on their hidden states rather than their output layers. # Our goal is to convey information from the teacher's representation to the student by including a naive loss function, # whose minimization implies that the flattened vectors that are subsequently passed to the classifiers have become more *similar* as the loss decreases. diff --git a/beginner_source/nlp/README.txt b/beginner_source/nlp/README.txt index 707ee0cb17e..b2f7b19145e 100644 --- a/beginner_source/nlp/README.txt +++ b/beginner_source/nlp/README.txt @@ -1,6 +1,28 @@ Deep Learning for NLP with Pytorch ---------------------------------- +These tutorials will walk you through the key ideas of deep learning +programming using Pytorch. Many of the concepts (such as the computation +graph abstraction and autograd) are not unique to Pytorch and are +relevant to any deep learning toolkit out there. 
+ +They are focused specifically on NLP for people who have never written +code in any deep learning framework (e.g., TensorFlow, Theano, Keras, DyNet). +The tutorials assume working knowledge of core NLP problems: part-of-speech +tagging, language modeling, etc. They also assume familiarity with neural +networks at the level of an intro AI class (such as one from the Russell and +Norvig book). Usually, these courses cover the basic backpropagation algorithm +on feed-forward neural networks, and make the point that they are chains of +compositions of linearities and non-linearities. This tutorial aims to get +you started writing deep learning code, given you have this prerequisite +knowledge. + +Note these tutorials are about *models*, not data. For all of the models, +a few test examples are created with small dimensionality so you can see how +the weights change as it trains. If you have some real data you want to +try, you should be able to rip out any of the models from this notebook +and use them on it. + 1. pytorch_tutorial.py Introduction to PyTorch https://pytorch.org/tutorials/beginner/nlp/pytorch_tutorial.html diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py index 183aca1748b..9a1ce8218e0 100644 --- a/beginner_source/nn_tutorial.py +++ b/beginner_source/nn_tutorial.py @@ -31,7 +31,7 @@ # MNIST data setup # ---------------- # -# We will use the classic `MNIST `_ dataset, +# We will use the classic `MNIST `_ dataset, # which consists of black-and-white images of hand-drawn digits (between 0 and 9). # # We will use `pathlib `_ @@ -98,7 +98,7 @@ ############################################################################### # Neural net from scratch (without ``torch.nn``) -# --------------------------------------------- +# ----------------------------------------------- # # Let's first create a model using nothing but PyTorch tensor operations. We're assuming # you're already familiar with the basics of neural networks. 
(If you're not, you can @@ -328,7 +328,7 @@ def forward(self, xb): # Previously for our training loop we had to update the values for each parameter # by name, and manually zero out the grads for each parameter separately, like this: # -# :: +# .. code-block:: python # # with torch.no_grad(): # weights -= weights.grad * lr @@ -342,7 +342,7 @@ def forward(self, xb): # and less prone to the error of forgetting some of our parameters, particularly # if we had a more complicated model: # -# :: +# .. code-block:: python # # with torch.no_grad(): # for p in model.parameters(): p -= p.grad * lr @@ -418,7 +418,7 @@ def forward(self, xb): # # This will let us replace our previous manually coded optimization step: # -# :: +# .. code-block:: python # # with torch.no_grad(): # for p in model.parameters(): p -= p.grad * lr @@ -426,7 +426,7 @@ def forward(self, xb): # # and instead use just: # -# :: +# .. code-block:: python # # opt.step() # opt.zero_grad() @@ -490,7 +490,7 @@ def get_model(): ############################################################################### # Previously, we had to iterate through minibatches of ``x`` and ``y`` values separately: # -# :: +# .. code-block:: python # # xb = x_train[start_i:end_i] # yb = y_train[start_i:end_i] @@ -498,7 +498,7 @@ def get_model(): # # Now, we can do these two steps together: # -# :: +# .. code-block:: python # # xb,yb = train_ds[i*bs : i*bs+bs] # @@ -534,7 +534,7 @@ def get_model(): ############################################################################### # Previously, our loop iterated over batches ``(xb, yb)`` like this: # -# :: +# .. code-block:: python # # for i in range((n-1)//bs + 1): # xb,yb = train_ds[i*bs : i*bs+bs] @@ -542,7 +542,7 @@ def get_model(): # # Now, our loop is much cleaner, as ``(xb, yb)`` are loaded automatically from the data loader: # -# :: +# .. 
code-block:: python # # for xb,yb in train_dl: # pred = model(xb) diff --git a/beginner_source/onnx/README.txt b/beginner_source/onnx/README.txt index f73ed11bc8f..277dcabbd58 100644 --- a/beginner_source/onnx/README.txt +++ b/beginner_source/onnx/README.txt @@ -3,8 +3,16 @@ ONNX 1. intro_onnx.py Introduction to ONNX - https://pytorch.org/tutorials/onnx/intro_onnx.html + https://pytorch.org/tutorials/beginner/onnx/intro_onnx.html 2. export_simple_model_to_onnx_tutorial.py - Export a PyTorch model to ONNX + Exporting a PyTorch model to ONNX https://pytorch.org/tutorials/beginner/onnx/export_simple_model_to_onnx_tutorial.html + +3. onnx_registry_tutorial.py + Extending the ONNX Registry + https://pytorch.org/tutorials/beginner/onnx/onnx_registry_tutorial.html + +4. export_simple_model_to_onnx_with_a_test.py + Export a Pytorch model with a test to ONNX + https://pytorch.org/tutorials/beginner/onnx/export_simple_model_to_onnx_with_a_test.html diff --git a/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py b/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py index fa09dc86abc..760c40ab43c 100644 --- a/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py +++ b/beginner_source/onnx/export_simple_model_to_onnx_tutorial.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- """ `Introduction to ONNX `_ || -**Export a PyTorch model to ONNX** +**Exporting a PyTorch model to ONNX** || +`Extending the ONNX Registry `_ Export a PyTorch model to ONNX ============================== @@ -89,11 +90,11 @@ def forward(self, x): torch_model = MyModel() torch_input = torch.randn(1, 1, 32, 32) -export_output = torch.onnx.dynamo_export(torch_model, torch_input) +onnx_program = torch.onnx.dynamo_export(torch_model, torch_input) ###################################################################### # As we can see, we didn't need any code change to the model. -# The resulting ONNX model is stored within ``torch.onnx.ExportOutput`` as a binary protobuf file. 
+# The resulting ONNX model is stored within ``torch.onnx.ONNXProgram`` as a binary protobuf file. # # 4. Save the ONNX model in a file # -------------------------------- @@ -101,10 +102,10 @@ def forward(self, x): # Although having the exported model loaded in memory is useful in many applications, # we can save it to disk with the following code: -export_output.save("my_image_classifier.onnx") +onnx_program.save("my_image_classifier.onnx") ###################################################################### -# The ONNX file can be loaded back into memory and checked if it is well formed with the following code: +# You can load the ONNX file back into memory and check if it is well formed with the following code: import onnx onnx_model = onnx.load("my_image_classifier.onnx") @@ -154,7 +155,7 @@ def forward(self, x): import onnxruntime -onnx_input = export_output.adapt_torch_inputs_to_onnx(torch_input) +onnx_input = onnx_program.adapt_torch_inputs_to_onnx(torch_input) print(f"Input length: {len(onnx_input)}") print(f"Sample input: {onnx_input}") @@ -167,9 +168,9 @@ def to_numpy(tensor): onnxruntime_outputs = ort_session.run(None, onnxruntime_input) -###################################################################### +#################################################################### # 7. Compare the PyTorch results with the ones from the ONNX Runtime -# ----------------------------------------------------------------- +# ------------------------------------------------------------------ # # The best way to determine whether the exported model is looking good is through numerical evaluation # against PyTorch, which is our source of truth. @@ -178,7 +179,7 @@ def to_numpy(tensor): # Before comparing the results, we need to convert the PyTorch's output to match ONNX's format. 
torch_outputs = torch_model(torch_input) -torch_outputs = export_output.adapt_torch_outputs_to_onnx(torch_outputs) +torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) assert len(torch_outputs) == len(onnxruntime_outputs) for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): diff --git a/beginner_source/onnx/export_simple_model_to_onnx_with_a_test.py b/beginner_source/onnx/export_simple_model_to_onnx_with_a_test.py new file mode 100644 index 00000000000..21f79f15b42 --- /dev/null +++ b/beginner_source/onnx/export_simple_model_to_onnx_with_a_test.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +""" +**Introduction to ONNX** || +`Exporting a PyTorch model to ONNX `_ || +`Extending the ONNX Registry `_ + +Export a Pytorch model with a test to ONNX +========================================== + +Tests cannot be exported into ONNX unless they are refactored +to use :func:`torch.cond`. Let's start with a simple model +implementing a test. +""" + +from onnx.printer import to_text +import torch + +class ForwardWithControlFlowTest(torch.nn.Module): + def forward(self, x): + if x.sum(): + return x * 2 + return -x + + +class ModelWithControlFlowTest(torch.nn.Module): + def __init__(self): + super().__init__() + self.mlp = torch.nn.Sequential( + torch.nn.Linear(3, 2), + torch.nn.Linear(2, 1), + ForwardWithControlFlowTest(), + ) + + def forward(self, x): + out = self.mlp(x) + return out + + +model = ModelWithControlFlowTest() + +# %% +# Let's check it runs. +x = torch.randn(3) +model(x) + +# %% +# As expected, it does not export. +try: + torch.export.export(model, (x,)) + raise AssertionError("This export should failed unless pytorch now supports this model.") +except Exception as e: + print(e) + +# %% +# It does export with :func:`torch.onnx.export` because +# it uses JIT to trace the execution. +# But the model is not exactly the same as the initial model. 
+ep = torch.onnx.export(model, (x,), dynamo=True) +print(to_text(ep.model_proto)) + + +# %% +# Suggested Patch +# +++++++++++++++ +# +# Let's avoid the graph break by replacing the forward. + + +def new_forward(x): + def identity2(x): + return x * 2 + + def neg(x): + return -x + + return torch.cond(x.sum() > 0, identity2, neg, (x,)) + + +print("the list of submodules") +for name, mod in model.named_modules(): + print(name, type(mod)) + if isinstance(mod, ForwardWithControlFlowTest): + mod.forward = new_forward + +# %% +# Let's see what the fx graph looks like. + +print(torch.export.export(model, (x,)).graph) + +# %% +# Let's export again. + +ep = torch.onnx.export(model, (x,), dynamo=True) +print(to_text(ep.model_proto)) + + +# %% +# Let's optimize to see a small model. + +ep = torch.onnx.export(model, (x,), dynamo=True) +ep.optimize() +print(to_text(ep.model_proto)) diff --git a/beginner_source/onnx/intro_onnx.py b/beginner_source/onnx/intro_onnx.py index 05ad3090cc8..ec625ec78ff 100644 --- a/beginner_source/onnx/intro_onnx.py +++ b/beginner_source/onnx/intro_onnx.py @@ -1,6 +1,7 @@ """ **Introduction to ONNX** || -`Export a PyTorch model to ONNX `_ +`Exporting a PyTorch model to ONNX `_ || +`Extending the ONNX Registry `_ Introduction to ONNX ==================== @@ -32,16 +33,37 @@ Dependencies ------------ +PyTorch 2.1.0 or newer is required. + The ONNX exporter depends on extra Python packages: - - `ONNX `_ - - `ONNX Script `_ + - `ONNX `_ standard library + - `ONNX Script `_ library that enables developers to author ONNX operators, + functions and models using a subset of Python in an expressive, and yet simple fashion + - `ONNX Runtime `_ accelerated machine learning library. They can be installed through `pip `_: .. code-block:: bash - pip install --upgrade onnx onnxscript + pip install --upgrade onnx onnxscript onnxruntime + +To validate the installation, run the following commands: + +.. 
code-block:: python + + import torch + print(torch.__version__) + + import onnxscript + print(onnxscript.__version__) + + from onnxscript import opset18 # opset 18 is the latest (and only) supported version for now + + import onnxruntime + print(onnxruntime.__version__) + +Each `import` must succeed without any errors and the library versions must be printed out. Further reading --------------- diff --git a/beginner_source/onnx/onnx_registry_tutorial.py b/beginner_source/onnx/onnx_registry_tutorial.py new file mode 100644 index 00000000000..0f64ba9c4d4 --- /dev/null +++ b/beginner_source/onnx/onnx_registry_tutorial.py @@ -0,0 +1,459 @@ +# -*- coding: utf-8 -*- + +""" +`Introduction to ONNX `_ || +`Exporting a PyTorch model to ONNX `_ || +**Extending the ONNX Registry** + +Extending the ONNX Registry +=========================== + +**Authors:** Ti-Tai Wang (titaiwang@microsoft.com) +""" + + +############################################################################### +# Overview +# -------- +# +# This tutorial is an introduction to ONNX registry, which empowers users to implement new ONNX operators +# or even replace existing operators with a new implementation. +# +# During the model export to ONNX, the PyTorch model is lowered to an intermediate +# representation composed of `ATen operators `_. +# While ATen operators are maintained by PyTorch core team, it is the responsibility of the ONNX exporter team +# to independently implement each of these operators to ONNX through `ONNX Script `_. +# The users can also replace the behavior implemented by the ONNX exporter team with their own implementation +# to fix bugs or improve performance for a specific ONNX runtime. +# +# The ONNX Registry manages the mapping between PyTorch operators and the ONNX operators counterparts and provides +# APIs to extend the registry. 
+# +# In this tutorial, we will cover three scenarios that require extending the ONNX registry with custom operators: +# +# * Unsupported ATen operators +# * Custom operators with existing ONNX Runtime support +# * Custom operators without ONNX Runtime support +# +# Unsupported ATen operators +# -------------------------- +# +# Although the ONNX exporter team does their best efforts to support all ATen operators, some of them +# might not be supported yet. In this section, we will demonstrate how you can add +# unsupported ATen operators to the ONNX Registry. +# +# .. note:: +# The steps to implement unsupported ATen operators are the same to replace the implementation of an existing +# ATen operator with a custom implementation. +# Because we don't actually have an unsupported ATen operator to use in this tutorial, we are going to leverage +# this and replace the implementation of ``aten::add.Tensor`` with a custom implementation the same way we would +# if the operator was not present in the ONNX Registry. +# +# When a model cannot be exported to ONNX due to an unsupported operator, the ONNX exporter will show an error message +# similar to: +# +# .. code-block:: python +# +# RuntimeErrorWithDiagnostic: Unsupported FX nodes: {'call_function': ['aten.add.Tensor']}. +# +# The error message indicates that the fully qualified name of unsupported ATen operator is ``aten::add.Tensor``. +# The fully qualified name of an operator is composed of the namespace, operator name, and overload following +# the format ``namespace::operator_name.overload``. +# +# To add support for an unsupported ATen operator or to replace the implementation for an existing one, we need: +# +# * The fully qualified name of the ATen operator (e.g. ``aten::add.Tensor``). +# This information is always present in the error message as show above. +# * The implementation of the operator using `ONNX Script `__. +# ONNX Script is a prerequisite for this tutorial. 
Please make sure you have read the +# `ONNX Script tutorial `_ +# before proceeding. +# +# Because ``aten::add.Tensor`` is already supported by the ONNX Registry, we will demonstrate how to replace it with a +# custom implementation, but keep in mind that the same steps apply to support new unsupported ATen operators. +# +# This is possible because the :class:`OnnxRegistry` allows users to override an operator registration. +# We will override the registration of ``aten::add.Tensor`` with our custom implementation and verify it exists. +# + +import torch +import onnxruntime +import onnxscript +from onnxscript import opset18 # opset 18 is the latest (and only) supported version for now + +class Model(torch.nn.Module): + def forward(self, input_x, input_y): + return torch.ops.aten.add(input_x, input_y) # generates a aten::add.Tensor node + +input_add_x = torch.randn(3, 4) +input_add_y = torch.randn(3, 4) +aten_add_model = Model() + + +# Now we create a ONNX Script function that implements ``aten::add.Tensor``. +# The function name (e.g. ``custom_aten_add``) is displayed in the ONNX graph, so we recommend to use intuitive names. +custom_aten = onnxscript.values.Opset(domain="custom.aten", version=1) + +# NOTE: The function signature must match the signature of the unsupported ATen operator. +# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml +# NOTE: All attributes must be annotated with type hints. +@onnxscript.script(custom_aten) +def custom_aten_add(input_x, input_y, alpha: float = 1.0): + input_y = opset18.Mul(input_y, alpha) + return opset18.Add(input_x, input_y) + + +# Now we have everything we need to support unsupported ATen operators. +# Let's register the ``custom_aten_add`` function to ONNX registry, and export the model to ONNX again. 
+onnx_registry = torch.onnx.OnnxRegistry() +onnx_registry.register_op( + namespace="aten", op_name="add", overload="Tensor", function=custom_aten_add + ) +print(f"aten::add.Tensor is supported by ONNX registry: \ + {onnx_registry.is_registered_op(namespace='aten', op_name='add', overload='Tensor')}" + ) +export_options = torch.onnx.ExportOptions(onnx_registry=onnx_registry) +onnx_program = torch.onnx.dynamo_export( + aten_add_model, input_add_x, input_add_y, export_options=export_options + ) + +###################################################################### +# Now let's inspect the model and verify the model has a ``custom_aten_add`` instead of ``aten::add.Tensor``. +# The graph has one graph node for ``custom_aten_add``, and inside of it there are four function nodes, one for each +# operator, and one for constant attribute. +# + +# graph node domain is the custom domain we registered +assert onnx_program.model_proto.graph.node[0].domain == "custom.aten" +assert len(onnx_program.model_proto.graph.node) == 1 +# graph node name is the function name +assert onnx_program.model_proto.graph.node[0].op_type == "custom_aten_add" +# function node domain is empty because we use standard ONNX operators +assert {node.domain for node in onnx_program.model_proto.functions[0].node} == {""} +# function node name is the standard ONNX operator name +assert {node.op_type for node in onnx_program.model_proto.functions[0].node} == {"Add", "Mul", "Constant"} + + +###################################################################### +# This is how ``custom_aten_add_model`` looks in the ONNX graph using Netron: +# +# .. image:: /_static/img/onnx/custom_aten_add_model.png +# :width: 70% +# :align: center +# +# Inside the ``custom_aten_add`` function, we can see the three ONNX nodes we +# used in the function (``CastLike``, ``Add``, and ``Mul``), and one ``Constant`` attribute: +# +# .. 
image:: /_static/img/onnx/custom_aten_add_function.png +# :width: 70% +# :align: center +# +# This was all that we needed to register the new ATen operator into the ONNX Registry. +# As an additional step, we can use ONNX Runtime to run the model, and compare the results with PyTorch. +# + + +# Use ONNX Runtime to run the model, and compare the results with PyTorch +onnx_program.save("./custom_add_model.onnx") +ort_session = onnxruntime.InferenceSession( + "./custom_add_model.onnx", providers=['CPUExecutionProvider'] + ) + +def to_numpy(tensor): + return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() + +onnx_input = onnx_program.adapt_torch_inputs_to_onnx(input_add_x, input_add_y) +onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)} +onnxruntime_outputs = ort_session.run(None, onnxruntime_input) + +torch_outputs = aten_add_model(input_add_x, input_add_y) +torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) + +assert len(torch_outputs) == len(onnxruntime_outputs) +for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): + torch.testing.assert_close(torch_output, torch.tensor(onnxruntime_output)) + + +###################################################################### +# Custom operators with existing ONNX Runtime support +# --------------------------------------------------- +# +# In this case, the user creates a model with standard PyTorch operators, but the ONNX runtime +# (e.g. Microsoft's ONNX Runtime) can provide a custom implementation for that kernel, effectively replacing the +# existing implementation in the ONNX Registry. Another use case is when the user wants to use a custom implementation +# of an existing ONNX operator to fix a bug or improve performance of a specific operator. +# To achieve this, we only need to register the new implementation with the existing ATen fully qualified name. 
+# +# In the following example, we use the ``com.microsoft.Gelu`` from ONNX Runtime, +# which is not the same as the ``Gelu`` from the ONNX spec. Thus, we register the Gelu with +# the namespace ``com.microsoft`` and operator name ``Gelu``. +# +# Before we begin, let's check whether ``aten::gelu.default`` is really supported by the ONNX registry. + +onnx_registry = torch.onnx.OnnxRegistry() +print(f"aten::gelu.default is supported by ONNX registry: \ + {onnx_registry.is_registered_op(namespace='aten', op_name='gelu', overload='default')}") + + +###################################################################### +# In our example, ``aten::gelu.default`` operator is supported by the ONNX registry, +# so :meth:`onnx_registry.is_registered_op` returns ``True``. + +class CustomGelu(torch.nn.Module): + def forward(self, input_x): + return torch.ops.aten.gelu(input_x) + +# com.microsoft is an official ONNX Runtime namespace +custom_ort = onnxscript.values.Opset(domain="com.microsoft", version=1) + +# NOTE: The function signature must match the signature of the unsupported ATen operator. +# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml +# NOTE: All attributes must be annotated with type hints. 
+@onnxscript.script(custom_ort) +def custom_aten_gelu(input_x, approximate: str = "none"): + # We know com.microsoft::Gelu is supported by ONNX Runtime + # It's only not supported by ONNX + return custom_ort.Gelu(input_x) + + +onnx_registry = torch.onnx.OnnxRegistry() +onnx_registry.register_op( + namespace="aten", op_name="gelu", overload="default", function=custom_aten_gelu) +export_options = torch.onnx.ExportOptions(onnx_registry=onnx_registry) + +aten_gelu_model = CustomGelu() +input_gelu_x = torch.randn(3, 3) + +onnx_program = torch.onnx.dynamo_export( + aten_gelu_model, input_gelu_x, export_options=export_options + ) + + +###################################################################### +# Let's inspect the model and verify the model uses op_type ``Gelu`` +# from namespace ``com.microsoft``. +# +# .. note:: +# :func:`custom_aten_gelu` does not exist in the graph because +# functions with fewer than three operators are inlined automatically. +# + +# graph node domain is the custom domain we registered +assert onnx_program.model_proto.graph.node[0].domain == "com.microsoft" +# graph node name is the function name +assert onnx_program.model_proto.graph.node[0].op_type == "Gelu" + + +###################################################################### +# The following diagram shows ``custom_aten_gelu_model`` ONNX graph using Netron, +# we can see the ``Gelu`` node from module ``com.microsoft`` used in the function: +# +# .. image:: /_static/img/onnx/custom_aten_gelu_model.png +# +# That is all we need to do. As an additional step, we can use ONNX Runtime to run the model, +# and compare the results with PyTorch. 
+# + +onnx_program.save("./custom_gelu_model.onnx") +ort_session = onnxruntime.InferenceSession( + "./custom_gelu_model.onnx", providers=['CPUExecutionProvider'] + ) + +def to_numpy(tensor): + return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() + +onnx_input = onnx_program.adapt_torch_inputs_to_onnx(input_gelu_x) +onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)} +onnxruntime_outputs = ort_session.run(None, onnxruntime_input) + +torch_outputs = aten_gelu_model(input_gelu_x) +torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) + +assert len(torch_outputs) == len(onnxruntime_outputs) +for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): + torch.testing.assert_close(torch_output, torch.tensor(onnxruntime_output)) + +###################################################################### +# Custom operators without ONNX Runtime support +# --------------------------------------------- +# +# In this case, the operator is not supported by any ONNX runtime, but we +# would like to use it as custom operator in ONNX graph. Therefore, we need to implement +# the operator in three places: +# +# 1. PyTorch FX graph +# 2. ONNX Registry +# 3. ONNX Runtime +# +# In the following example, we would like to use a custom operator +# that takes one tensor input, and returns one output. The operator adds +# the input to itself, and returns the rounded result. +# +# +# Custom Ops Registration in PyTorch FX Graph (Beta) +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Firstly, we need to implement the operator in PyTorch FX graph. +# This can be done by using ``torch._custom_op``. +# + +# NOTE: This is a beta feature in PyTorch, and is subject to change. +from torch._custom_op import impl as custom_op + +@custom_op.custom_op("mylibrary::addandround_op") +def addandround_op(tensor_x: torch.Tensor) -> torch.Tensor: + ... 
+ +@addandround_op.impl_abstract() +def addandround_op_impl_abstract(tensor_x): + return torch.empty_like(tensor_x) + +@addandround_op.impl("cpu") +def addandround_op_impl(tensor_x): + return torch.round(tensor_x + tensor_x) # add x to itself, and round the result + +torch._dynamo.allow_in_graph(addandround_op) + +class CustomFoo(torch.nn.Module): + def forward(self, tensor_x): + return addandround_op(tensor_x) + +input_addandround_x = torch.randn(3) +custom_addandround_model = CustomFoo() + + +###################################################################### +# +# Custom Ops Registration in ONNX Registry +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# For steps 2 and 3, we need to implement the operator in ONNX registry. +# In this example, we will implement the operator in ONNX registry +# with the namespace ``test.customop`` and operator names ``CustomOpOne``, +# and ``CustomOpTwo``. These two ops are registered and built in +# `cpu_ops.cc `__. +# + + +custom_opset = onnxscript.values.Opset(domain="test.customop", version=1) + +# NOTE: The function signature must match the signature of the unsupported ATen operator. +# https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml +# NOTE: All attributes must be annotated with type hints. 
+@onnxscript.script(custom_opset) +def custom_addandround(input_x): + # The same as opset18.Add(x, x) + add_x = custom_opset.CustomOpOne(input_x, input_x) + # The same as opset18.Round(x) + round_x = custom_opset.CustomOpTwo(add_x) + # Cast to FLOAT to match the ONNX type + return opset18.Cast(round_x, to=1) + + +onnx_registry = torch.onnx.OnnxRegistry() +onnx_registry.register_op( + namespace="mylibrary", op_name="addandround_op", overload="default", function=custom_addandround + ) + +export_options = torch.onnx.ExportOptions(onnx_registry=onnx_registry) +onnx_program = torch.onnx.dynamo_export( + custom_addandround_model, input_addandround_x, export_options=export_options + ) +onnx_program.save("./custom_addandround_model.onnx") + + +###################################################################### +# The ``onnx_program`` exposes the exported model as protobuf through ``onnx_program.model_proto``. +# The graph has one graph node for ``custom_addandround``, and inside ``custom_addandround``, +# there are two function nodes, one for each operator. +# + +assert onnx_program.model_proto.graph.node[0].domain == "test.customop" +assert onnx_program.model_proto.graph.node[0].op_type == "custom_addandround" +assert onnx_program.model_proto.functions[0].node[0].domain == "test.customop" +assert onnx_program.model_proto.functions[0].node[0].op_type == "CustomOpOne" +assert onnx_program.model_proto.functions[0].node[1].domain == "test.customop" +assert onnx_program.model_proto.functions[0].node[1].op_type == "CustomOpTwo" + + +###################################################################### +# This is how ``custom_addandround_model`` ONNX graph looks using Netron: +# +# .. 
image:: /_static/img/onnx/custom_addandround_model.png +# :width: 70% +# :align: center +# +# Inside the ``custom_addandround`` function, we can see the two custom operators we +# used in the function (``CustomOpOne``, and ``CustomOpTwo``), and they are from module +# ``test.customop``: +# +# .. image:: /_static/img/onnx/custom_addandround_function.png +# +# Custom Ops Registration in ONNX Runtime +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# To link your custom op library to ONNX Runtime, you need to +# compile your C++ code into a shared library and link it to ONNX Runtime. +# Follow the instructions below: +# +# 1. Implement your custom op in C++ by following +# `ONNX Runtime instructions <https://github.com/microsoft/onnxruntime/blob/gh-pages/docs/reference/operators/add-custom-op.md>`__. +# 2. Download ONNX Runtime source distribution from +# `ONNX Runtime releases `__. +# 3. Compile and link your custom op library to ONNX Runtime, for example: +# +# .. code-block:: bash +# +# $ gcc -shared -o libcustom_op_library.so custom_op_library.cc -L /path/to/downloaded/ort/lib/ -lonnxruntime -fPIC +# +# 4. Run the model with ONNX Runtime Python API and compare the results with PyTorch. +# +# .. 
code-block:: python +# +# ort_session_options = onnxruntime.SessionOptions() +# +# # NOTE: Link the custom op library to ONNX Runtime and replace the path +# # with the path to your custom op library +# ort_session_options.register_custom_ops_library( +# "/path/to/libcustom_op_library.so" +# ) +# ort_session = onnxruntime.InferenceSession( +# "./custom_addandround_model.onnx", providers=['CPUExecutionProvider'], sess_options=ort_session_options) +# +# def to_numpy(tensor): +# return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() +# +# onnx_input = onnx_program.adapt_torch_inputs_to_onnx(input_addandround_x) +# onnxruntime_input = {k.name: to_numpy(v) for k, v in zip(ort_session.get_inputs(), onnx_input)} +# onnxruntime_outputs = ort_session.run(None, onnxruntime_input) +# +# torch_outputs = custom_addandround_model(input_addandround_x) +# torch_outputs = onnx_program.adapt_torch_outputs_to_onnx(torch_outputs) +# +# assert len(torch_outputs) == len(onnxruntime_outputs) +# for torch_output, onnxruntime_output in zip(torch_outputs, onnxruntime_outputs): +# torch.testing.assert_close(torch_output, torch.tensor(onnxruntime_output)) +# +# Conclusion +# ---------- +# +# Congratulations! In this tutorial, we explored the :class:`OnnxRegistry` API and +# discovered how to create custom implementations for unsupported or existing ATen operators +# using ONNX Script. +# Finally, we leveraged ONNX Runtime to execute the model and compare the results with PyTorch, +# providing us with a comprehensive understanding of handling unsupported +# operators in the ONNX ecosystem. +# +# Further reading +# --------------- +# +# The list below refers to tutorials that range from basic examples to advanced scenarios, +# not necessarily in the order they are listed. +# Feel free to jump directly to specific topics of your interest or +# sit tight and have fun going through all of them to learn all there is about the ONNX exporter. +# +# .. 
include:: /beginner_source/onnx/onnx_toc.txt +# +# .. toctree:: +# :hidden: +# diff --git a/beginner_source/onnx/onnx_toc.txt b/beginner_source/onnx/onnx_toc.txt index 2386430ba7b..a11bbe890af 100644 --- a/beginner_source/onnx/onnx_toc.txt +++ b/beginner_source/onnx/onnx_toc.txt @@ -1 +1,3 @@ -| 1. `Export a PyTorch model to ONNX `_ \ No newline at end of file +| 1. `Exporting a PyTorch model to ONNX `_ +| 2. `Extending the ONNX registry `_ +| 3. `Export a Pytorch model with a test to ONNX `_ diff --git a/beginner_source/profiler.py b/beginner_source/profiler.py index 95d077f7ba3..b395edbaca6 100644 --- a/beginner_source/profiler.py +++ b/beginner_source/profiler.py @@ -1,6 +1,7 @@ """ Profiling your PyTorch Module ------------- +----------------------------- + **Author:** `Suraj Subramanian `_ PyTorch includes a profiler API that is useful to identify the time and @@ -81,7 +82,7 @@ def forward(self, input, mask): # ``profiler.profile`` context manager. The ``with_stack=True`` parameter appends the # file and line number of the operation in the trace. # -# .. WARNING:: +# .. warning:: # ``with_stack=True`` incurs an additional overhead, and is better suited for investigating code. # Remember to remove it if you are benchmarking performance. # @@ -114,7 +115,7 @@ def forward(self, input, mask): # `docs `__ for # valid sorting keys). # -# .. Note:: +# .. note:: # When running profiler in a notebook, you might see entries like ``(13): forward`` # instead of filenames in the stacktrace. These correspond to ``(line number): calling-function``. diff --git a/beginner_source/ptcheat.rst b/beginner_source/ptcheat.rst index 91a05866181..49f9c9f5951 100644 --- a/beginner_source/ptcheat.rst +++ b/beginner_source/ptcheat.rst @@ -22,27 +22,12 @@ Neural Network API import torch.nn as nn # neural networks import torch.nn.functional as F # layers, activations and more import torch.optim as optim # optimizers e.g. gradient descent, ADAM, etc. 
- from torch.jit import script, trace # hybrid frontend decorator and tracing jit See `autograd `__, `nn `__, `functional `__ and `optim `__ -TorchScript and JIT -------------------- - -.. code-block:: python - - torch.jit.trace() # takes your module or function and an example - # data input, and traces the computational steps - # that the data encounters as it progresses through the model - - @script # decorator used to indicate data-dependent - # control flow within the code being traced - -See `Torchscript `__ - ONNX ---- @@ -225,8 +210,10 @@ Optimizers opt = optim.x(model.parameters(), ...) # create optimizer opt.step() # update weights - optim.X # where X is SGD, Adadelta, Adagrad, Adam, - # AdamW, SparseAdam, Adamax, ASGD, + opt.zero_grad() # clear the gradients + optim.X # where X is SGD, AdamW, Adam, + # Adafactor, NAdam, RAdam, Adadelta, + # Adagrad, SparseAdam, Adamax, ASGD, # LBFGS, RMSprop or Rprop See `optimizers `__ diff --git a/beginner_source/pytorch_with_examples.rst b/beginner_source/pytorch_with_examples.rst index c43b90a4c44..e5642dfa139 100644 --- a/beginner_source/pytorch_with_examples.rst +++ b/beginner_source/pytorch_with_examples.rst @@ -1,5 +1,6 @@ Learning PyTorch with Examples -****************************** +============================== + **Author**: `Justin Johnson `_ .. note:: @@ -29,7 +30,7 @@ between the network output and the true output. :local: Tensors -======= +~~~~~~~ Warm-up: numpy -------------- @@ -74,7 +75,7 @@ and backward passes through the network: Autograd -======== +~~~~~~~~ PyTorch: Tensors and autograd ------------------------------- @@ -133,7 +134,7 @@ our model: .. includenodoc:: /beginner/examples_autograd/polynomial_custom_function.py ``nn`` module -=========== +~~~~~~~~~~~~~ PyTorch: ``nn`` --------------- @@ -148,7 +149,7 @@ which will be optimized during learning. 
In TensorFlow, packages like `Keras `__, -`TensorFlow-Slim `__, +`TensorFlow-Slim `__, and `TFLearn `__ provide higher-level abstractions over raw computational graphs that are useful for building neural networks. @@ -219,7 +220,7 @@ We can easily implement this model as a Module subclass: .. _examples-download: Examples -======== +~~~~~~~~ You can browse the above examples here. @@ -261,7 +262,7 @@ Autograd
    ``nn`` module ------------ +-------------- .. toctree:: :maxdepth: 2 diff --git a/beginner_source/saving_loading_models.py b/beginner_source/saving_loading_models.py index d4b328156ce..13bca8ca3de 100644 --- a/beginner_source/saving_loading_models.py +++ b/beginner_source/saving_loading_models.py @@ -115,7 +115,7 @@ # # **Output:** # -# :: +# .. code-block:: sh # # Model's state_dict: # conv1.weight torch.Size([6, 3, 5, 5]) @@ -153,14 +153,14 @@ # .. code:: python # # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH)) +# model.load_state_dict(torch.load(PATH, weights_only=True)) # model.eval() # # .. note:: # The 1.6 release of PyTorch switched ``torch.save`` to use a new # zip file-based format. ``torch.load`` still retains the ability to # load files in the old format. If for any reason you want ``torch.save`` -# to use the old format, pass the ``kwarg``parameter ``_use_new_zipfile_serialization=False``. +# to use the old format, pass the ``kwarg`` parameter ``_use_new_zipfile_serialization=False``. # # When saving a model for inference, it is only necessary to save the # trained model’s learned parameters. Saving the model’s *state_dict* with @@ -175,7 +175,7 @@ # normalization layers to evaluation mode before running inference. # Failing to do this will yield inconsistent inference results. # -# .. Note :: +# .. note:: # # Notice that the ``load_state_dict()`` function takes a dictionary # object, NOT a path to a saved object. This means that you must @@ -183,7 +183,7 @@ # ``load_state_dict()`` function. For example, you CANNOT load using # ``model.load_state_dict(PATH)``. # -# .. Note :: +# .. note:: # # If you only plan to keep the best performing model (according to the # acquired validation loss), don't forget that ``best_model_state = model.state_dict()`` @@ -206,7 +206,7 @@ # .. 
code:: python # # # Model class must be defined somewhere -# model = torch.load(PATH) +# model = torch.load(PATH, weights_only=False) # model.eval() # # This save/load process uses the most intuitive syntax and involves the @@ -290,7 +290,7 @@ # model = TheModelClass(*args, **kwargs) # optimizer = TheOptimizerClass(*args, **kwargs) # -# checkpoint = torch.load(PATH) +# checkpoint = torch.load(PATH, weights_only=True) # model.load_state_dict(checkpoint['model_state_dict']) # optimizer.load_state_dict(checkpoint['optimizer_state_dict']) # epoch = checkpoint['epoch'] @@ -354,7 +354,7 @@ # optimizerA = TheOptimizerAClass(*args, **kwargs) # optimizerB = TheOptimizerBClass(*args, **kwargs) # -# checkpoint = torch.load(PATH) +# checkpoint = torch.load(PATH, weights_only=True) # modelA.load_state_dict(checkpoint['modelA_state_dict']) # modelB.load_state_dict(checkpoint['modelB_state_dict']) # optimizerA.load_state_dict(checkpoint['optimizerA_state_dict']) @@ -407,7 +407,7 @@ # .. code:: python # # modelB = TheModelBClass(*args, **kwargs) -# modelB.load_state_dict(torch.load(PATH), strict=False) +# modelB.load_state_dict(torch.load(PATH, weights_only=True), strict=False) # # Partially loading a model or loading a partial model are common # scenarios when transfer learning or training a new complex model. 
@@ -446,7 +446,7 @@ # # device = torch.device('cpu') # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH, map_location=device)) +# model.load_state_dict(torch.load(PATH, map_location=device, weights_only=True)) # # When loading a model on a CPU that was trained with a GPU, pass # ``torch.device('cpu')`` to the ``map_location`` argument in the @@ -469,7 +469,7 @@ # # device = torch.device("cuda") # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH)) +# model.load_state_dict(torch.load(PATH, weights_only=True)) # model.to(device) # # Make sure to call input = input.to(device) on any input tensors that you feed to the model # @@ -497,7 +497,7 @@ # # device = torch.device("cuda") # model = TheModelClass(*args, **kwargs) -# model.load_state_dict(torch.load(PATH, map_location="cuda:0")) # Choose whatever GPU device number you want +# model.load_state_dict(torch.load(PATH, weights_only=True, map_location="cuda:0")) # Choose whatever GPU device number you want # model.to(device) # # Make sure to call input = input.to(device) on any input tensors that you feed to the model # diff --git a/beginner_source/t5_tutoria.rst b/beginner_source/t5_tutoria.rst new file mode 100644 index 00000000000..65de42b9320 --- /dev/null +++ b/beginner_source/t5_tutoria.rst @@ -0,0 +1,10 @@ +T5-Base Model for Summarization, Sentiment Classification, and Translation +========================================================================== + +This tutorial has been deprecated. + +Redirecting in 3 seconds... + +.. 
raw:: html + + diff --git a/beginner_source/t5_tutorial.py b/beginner_source/t5_tutorial.py deleted file mode 100644 index 8f77cd278ea..00000000000 --- a/beginner_source/t5_tutorial.py +++ /dev/null @@ -1,456 +0,0 @@ -""" -T5-Base Model for Summarization, Sentiment Classification, and Translation -========================================================================== - -**Authors**: `Pendo Abbo `__, `Joe Cummings `__ - -""" - -###################################################################### -# Overview -# -------- -# -# This tutorial demonstrates how to use a pretrained T5 Model for summarization, sentiment classification, and -# translation tasks. We will demonstrate how to use the torchtext library to: -# -# 1. Build a text preprocessing pipeline for a T5 model -# 2. Instantiate a pretrained T5 model with base configuration -# 3. Read in the CNNDM, IMDB, and Multi30k datasets and preprocess their texts in preparation for the model -# 4. Perform text summarization, sentiment classification, and translation -# -# .. note:: -# This tutorial requires PyTorch 2.0.0 or later. -# -####################################################################### -# Data Transformation -# ------------------- -# -# The T5 model does not work with raw text. Instead, it requires the text to be transformed into numerical form -# in order to perform training and inference. The following transformations are required for the T5 model: -# -# 1. Tokenize text -# 2. Convert tokens into (integer) IDs -# 3. Truncate the sequences to a specified maximum length -# 4. Add end-of-sequence (EOS) and padding token IDs -# -# T5 uses a ``SentencePiece`` model for text tokenization. Below, we use a pretrained ``SentencePiece`` model to build -# the text preprocessing pipeline using torchtext's T5Transform. 
Note that the transform supports both -# batched and non-batched text input (for example, one can either pass a single sentence or a list of sentences), however the T5 model expects the input to be batched. -# - -from torchtext.models import T5Transform - -padding_idx = 0 -eos_idx = 1 -max_seq_len = 512 -t5_sp_model_path = "https://download.pytorch.org/models/text/t5_tokenizer_base.model" - -transform = T5Transform( - sp_model_path=t5_sp_model_path, - max_seq_len=max_seq_len, - eos_idx=eos_idx, - padding_idx=padding_idx, -) - -####################################################################### -# Alternatively, we can also use the transform shipped with the pretrained models that does all of the above out-of-the-box -# -# .. code-block:: -# -# from torchtext.models import T5_BASE_GENERATION -# transform = T5_BASE_GENERATION.transform() -# - - -###################################################################### -# Model Preparation -# ----------------- -# -# torchtext provides SOTA pretrained models that can be used directly for NLP tasks or fine-tuned on downstream tasks. Below -# we use the pretrained T5 model with standard base configuration to perform text summarization, sentiment classification, and -# translation. For additional details on available pretrained models, see `the torchtext documentation `__ -# -# -from torchtext.models import T5_BASE_GENERATION - - -t5_base = T5_BASE_GENERATION -transform = t5_base.transform() -model = t5_base.get_model() -model.eval() - - -####################################################################### -# Using ``GenerationUtils`` -# ------------------------- -# -# We can use torchtext's ``GenerationUtils`` to produce an output sequence based on the input sequence provided. This calls on the -# model's encoder and decoder, and iteratively expands the decoded sequences until the end-of-sequence token is generated -# for all sequences in the batch. 
The ``generate`` method shown below uses greedy search to generate the sequences. Beam search and -# other decoding strategies are also supported. -# -# -from torchtext.prototype.generate import GenerationUtils - -sequence_generator = GenerationUtils(model) - - -####################################################################### -# Datasets -# -------- -# torchtext provides several standard NLP datasets. For a complete list, refer to the documentation -# at https://pytorch.org/text/stable/datasets.html. These datasets are built using composable torchdata -# datapipes and hence support standard flow-control and mapping/transformation using user defined -# functions and transforms. -# -# Below we demonstrate how to preprocess the CNNDM dataset to include the prefix necessary for the -# model to identify the task it is performing. The CNNDM dataset has a train, validation, and test -# split. Below we demo on the test split. -# -# The T5 model uses the prefix "summarize" for text summarization. For more information on task -# prefixes, please visit Appendix D of the `T5 Paper `__ -# -# .. note:: -# Using datapipes is still currently subject to a few caveats. If you wish -# to extend this example to include shuffling, multi-processing, or -# distributed learning, please see :ref:`this note ` -# for further instructions. 
- -from functools import partial - -from torch.utils.data import DataLoader -from torchtext.datasets import CNNDM - -cnndm_batch_size = 5 -cnndm_datapipe = CNNDM(split="test") -task = "summarize" - - -def apply_prefix(task, x): - return f"{task}: " + x[0], x[1] - - -cnndm_datapipe = cnndm_datapipe.map(partial(apply_prefix, task)) -cnndm_datapipe = cnndm_datapipe.batch(cnndm_batch_size) -cnndm_datapipe = cnndm_datapipe.rows2columnar(["article", "abstract"]) -cnndm_dataloader = DataLoader(cnndm_datapipe, shuffle=True, batch_size=None) - -####################################################################### -# Alternately, we can also use batched API, for example, apply the prefix on the whole batch: -# -# .. code-block:: -# -# def batch_prefix(task, x): -# return { -# "article": [f'{task}: ' + y for y in x["article"]], -# "abstract": x["abstract"] -# } -# -# cnndm_batch_size = 5 -# cnndm_datapipe = CNNDM(split="test") -# task = 'summarize' -# -# cnndm_datapipe = cnndm_datapipe.batch(cnndm_batch_size).rows2columnar(["article", "abstract"]) -# cnndm_datapipe = cnndm_datapipe.map(partial(batch_prefix, task)) -# cnndm_dataloader = DataLoader(cnndm_datapipe, batch_size=None) -# - -####################################################################### -# We can also load the IMDB dataset, which will be used to demonstrate sentiment classification using the T5 model. -# This dataset has a train and test split. Below we demo on the test split. -# -# The T5 model was trained on the SST2 dataset (also available in torchtext) for sentiment classification using the -# prefix ``sst2 sentence``. Therefore, we will use this prefix to perform sentiment classification on the IMDB dataset. 
-# - -from torchtext.datasets import IMDB - -imdb_batch_size = 3 -imdb_datapipe = IMDB(split="test") -task = "sst2 sentence" -labels = {"1": "negative", "2": "positive"} - - -def process_labels(labels, x): - return x[1], labels[str(x[0])] - - -imdb_datapipe = imdb_datapipe.map(partial(process_labels, labels)) -imdb_datapipe = imdb_datapipe.map(partial(apply_prefix, task)) -imdb_datapipe = imdb_datapipe.batch(imdb_batch_size) -imdb_datapipe = imdb_datapipe.rows2columnar(["text", "label"]) -imdb_dataloader = DataLoader(imdb_datapipe, batch_size=None) - -####################################################################### -# Finally, we can also load the Multi30k dataset to demonstrate English to German translation using the T5 model. -# This dataset has a train, validation, and test split. Below we demo on the test split. -# -# The T5 model uses the prefix "translate English to German" for this task. - -from torchtext.datasets import Multi30k - -multi_batch_size = 5 -language_pair = ("en", "de") -multi_datapipe = Multi30k(split="test", language_pair=language_pair) -task = "translate English to German" - -multi_datapipe = multi_datapipe.map(partial(apply_prefix, task)) -multi_datapipe = multi_datapipe.batch(multi_batch_size) -multi_datapipe = multi_datapipe.rows2columnar(["english", "german"]) -multi_dataloader = DataLoader(multi_datapipe, batch_size=None) - -####################################################################### -# Generate Summaries -# ------------------ -# -# We can put all of the components together to generate summaries on the first batch of articles in the CNNDM test set -# using a beam size of 1. 
-# - -batch = next(iter(cnndm_dataloader)) -input_text = batch["article"] -target = batch["abstract"] -beam_size = 1 - -model_input = transform(input_text) -model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size) -output_text = transform.decode(model_output.tolist()) - -for i in range(cnndm_batch_size): - print(f"Example {i+1}:\n") - print(f"prediction: {output_text[i]}\n") - print(f"target: {target[i]}\n\n") - - -####################################################################### -# Summarization Output (Might vary since we shuffle the dataloader) -# -------------------- -# -# .. code-block:: -# -# Example 1: -# -# prediction: the 24-year-old has been tattooed for over a decade . he has landed in australia -# to start work on a new campaign . he says he is 'taking it in your stride' to be honest . -# -# target: London-based model Stephen James Hendry famed for his full body tattoo . The supermodel -# is in Sydney for a new modelling campaign . Australian fans understood to have already located -# him at his hotel . The 24-year-old heartthrob is recently single . -# -# -# Example 2: -# -# prediction: a stray pooch has used up at least three of her own after being hit by a -# car and buried in a field . the dog managed to stagger to a nearby farm, dirt-covered -# and emaciated, where she was found . she suffered a dislocated jaw, leg injuries and a -# caved-in sinus cavity -- and still requires surgery to help her breathe . -# -# target: Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer -# and buried in a field . "She's a true miracle dog and she deserves a good life," says -# Sara Mellado, who is looking for a home for Theia . -# -# -# Example 3: -# -# prediction: mohammad Javad Zarif arrived in Iran on a sunny friday morning . he has gone -# a long way to bring Iran in from the cold and allow it to rejoin the international -# community . 
but there are some facts about him that are less well-known . -# -# target: Mohammad Javad Zarif has spent more time with John Kerry than any other -# foreign minister . He once participated in a takeover of the Iranian Consulate in San -# Francisco . The Iranian foreign minister tweets in English . -# -# -# Example 4: -# -# prediction: five americans were monitored for three weeks after being exposed to Ebola in -# west africa . one of the five had a heart-related issue and has been discharged but hasn't -# left the area . they are clinicians for Partners in Health, a Boston-based aid group . -# -# target: 17 Americans were exposed to the Ebola virus while in Sierra Leone in March . -# Another person was diagnosed with the disease and taken to hospital in Maryland . -# National Institutes of Health says the patient is in fair condition after weeks of -# treatment . -# -# -# Example 5: -# -# prediction: the student was identified during an investigation by campus police and -# the office of student affairs . he admitted to placing the noose on the tree early -# Wednesday morning . the incident is one of several recent racist events to affect -# college students . -# -# target: Student is no longer on Duke University campus and will face disciplinary -# review . School officials identified student during investigation and the person -# admitted to hanging the noose, Duke says . The noose, made of rope, was discovered on -# campus about 2 a.m. -# - - -####################################################################### -# Generate Sentiment Classifications -# ---------------------------------- -# -# Similarly, we can use the model to generate sentiment classifications on the first batch of reviews from the IMDB test set -# using a beam size of 1. 
-# - -batch = next(iter(imdb_dataloader)) -input_text = batch["text"] -target = batch["label"] -beam_size = 1 - -model_input = transform(input_text) -model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size) -output_text = transform.decode(model_output.tolist()) - -for i in range(imdb_batch_size): - print(f"Example {i+1}:\n") - print(f"input_text: {input_text[i]}\n") - print(f"prediction: {output_text[i]}\n") - print(f"target: {target[i]}\n\n") - - -####################################################################### -# Sentiment Output -# ---------------- -# -# :: -# -# Example 1: -# -# input_text: sst2 sentence: I love sci-fi and am willing to put up with a lot. Sci-fi -# movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like -# this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). -# Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the -# background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' -# setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. -# It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character -# development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may -# treat important issues, yet not as a serious philosophy. It's really difficult to care about -# the characters here as they are not simply foolish, just missing a spark of life. Their -# actions and reactions are wooden and predictable, often painful to watch. The makers of Earth -# KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people -# would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, -# cheap, poorly edited (watching it without advert breaks really brings this home) trudging -# Trabant of a show lumbers into space. Spoiler. 
So, kill off a main character. And then bring -# him back as another actor. Jeeez. Dallas all over again. -# -# prediction: negative -# -# target: negative -# -# -# Example 2: -# -# input_text: sst2 sentence: Worth the entertainment value of a rental, especially if you like -# action movies. This one features the usual car chases, fights with the great Van Damme kick -# style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All -# of this is entertaining and competently handled but there is nothing that really blows you -# away if you've seen your share before.

    The plot is made interesting by the -# inclusion of a rabbit, which is clever but hardly profound. Many of the characters are -# heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, -# the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat -# federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. -# All passably acted but again nothing special.

    I thought the main villains were -# pretty well done and fairly well acted. By the end of the movie you certainly knew who the -# good guys were and weren't. There was an emotional lift as the really bad ones got their just -# deserts. Very simplistic, but then you weren't expecting Hamlet, right? The only thing I found -# really annoying was the constant cuts to VDs daughter during the last fight scene.

    -# Not bad. Not good. Passable 4. -# -# prediction: positive -# -# target: negative -# -# -# Example 3: -# -# input_text: sst2 sentence: its a totally average film with a few semi-alright action sequences -# that make the plot seem a little better and remind the viewer of the classic van dam films. -# parts of the plot don't make sense and seem to be added in to use up time. the end plot is that -# of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the -# beginning. the end scene with the flask backs don't make sense as they are added in and seem to -# have little relevance to the history of van dam's character. not really worth watching again, -# bit disappointed in the end production, even though it is apparent it was shot on a low budget -# certain shots and sections in the film are of poor directed quality. -# -# prediction: negative -# -# target: negative -# - - -####################################################################### -# Generate Translations -# --------------------- -# -# Finally, we can also use the model to generate English to German translations on the first batch of examples from the Multi30k -# test set. -# - -batch = next(iter(multi_dataloader)) -input_text = batch["english"] -target = batch["german"] - -model_input = transform(input_text) -model_output = sequence_generator.generate(model_input, eos_idx=eos_idx, num_beams=beam_size) -output_text = transform.decode(model_output.tolist()) - -for i in range(multi_batch_size): - print(f"Example {i+1}:\n") - print(f"input_text: {input_text[i]}\n") - print(f"prediction: {output_text[i]}\n") - print(f"target: {target[i]}\n\n") - - -####################################################################### -# Translation Output -# ------------------ -# -# :: -# -# Example 1: -# -# input_text: translate English to German: A man in an orange hat starring at something. -# -# prediction: Ein Mann in einem orangen Hut, der an etwas schaut. 
-# -# target: Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt. -# -# -# Example 2: -# -# input_text: translate English to German: A Boston Terrier is running on lush green grass in front of a white fence. -# -# prediction: Ein Boston Terrier läuft auf üppigem grünem Gras vor einem weißen Zaun. -# -# target: Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun. -# -# -# Example 3: -# -# input_text: translate English to German: A girl in karate uniform breaking a stick with a front kick. -# -# prediction: Ein Mädchen in Karate-Uniform bricht einen Stöck mit einem Frontkick. -# -# target: Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt. -# -# -# Example 4: -# -# input_text: translate English to German: Five people wearing winter jackets and helmets stand in the snow, with snowmobiles in the background. -# -# prediction: Fünf Menschen mit Winterjacken und Helmen stehen im Schnee, mit Schneemobilen im Hintergrund. -# -# target: Fünf Leute in Winterjacken und mit Helmen stehen im Schnee mit Schneemobilen im Hintergrund. -# -# -# Example 5: -# -# input_text: translate English to German: People are fixing the roof of a house. -# -# prediction: Die Leute fixieren das Dach eines Hauses. -# -# target: Leute Reparieren das Dach eines Hauses. -# diff --git a/beginner_source/template_tutorial.py b/beginner_source/template_tutorial.py index 520bd40eb03..d7fae7c4c5e 100644 --- a/beginner_source/template_tutorial.py +++ b/beginner_source/template_tutorial.py @@ -9,16 +9,18 @@ .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites - * Item 1 - * Item 2 - * Item 3 + * Item 1 + * Item 2 + * Item 3 .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites - * PyTorch v2.0.0 - * GPU ??? - * Other items 3 + * PyTorch v2.0.0 + * GPU ??? 
+ * Other items 3 If you have a video, add it here like this: diff --git a/beginner_source/text_sentiment_ngrams_tutorial.py b/beginner_source/text_sentiment_ngrams_tutorial.py deleted file mode 100644 index 021befdb972..00000000000 --- a/beginner_source/text_sentiment_ngrams_tutorial.py +++ /dev/null @@ -1,372 +0,0 @@ -""" -Text classification with the torchtext library -============================================== - -In this tutorial, we will show how to use the torchtext library to build the dataset for the text classification analysis. Users will have the flexibility to - - - Access to the raw data as an iterator - - Build data processing pipeline to convert the raw text strings into ``torch.Tensor`` that can be used to train the model - - Shuffle and iterate the data with `torch.utils.data.DataLoader `__ - - -Prerequisites -~~~~~~~~~~~~~~~~ - -A recent 2.x version of the ``portalocker`` package needs to be installed prior to running the tutorial. -For example, in the Colab environment, this can be done by adding the following line at the top of the script: - -.. code-block:: bash - - !pip install -U portalocker>=2.0.0` - -""" - - -###################################################################### -# Access to the raw dataset iterators -# ----------------------------------- -# -# The torchtext library provides a few raw dataset iterators, which yield the raw text strings. For example, the ``AG_NEWS`` dataset iterators yield the raw data as a tuple of label and text. -# -# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. 
-# - -import torch -from torchtext.datasets import AG_NEWS - -train_iter = iter(AG_NEWS(split="train")) - -###################################################################### -# :: -# -# next(train_iter) -# >>> (3, "Fears for T N pension after talks Unions representing workers at Turner -# Newall say they are 'disappointed' after talks with stricken parent firm Federal -# Mogul.") -# -# next(train_iter) -# >>> (4, "The Race is On: Second Private Team Sets Launch Date for Human -# Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\\team of -# rocketeers competing for the #36;10 million Ansari X Prize, a contest -# for\\privately funded suborbital space flight, has officially announced -# the first\\launch date for its manned rocket.") -# -# next(train_iter) -# >>> (4, 'Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded -# by a chemistry researcher at the University of Louisville won a grant to develop -# a method of producing better peptides, which are short chains of amino acids, the -# building blocks of proteins.') -# - - -###################################################################### -# Prepare data processing pipelines -# --------------------------------- -# -# We have revisited the very basic components of the torchtext library, including vocab, word vectors, tokenizer. Those are the basic data processing building blocks for raw text string. -# -# Here is an example for typical NLP data processing with tokenizer and vocabulary. The first step is to build a vocabulary with the raw training dataset. Here we use built in -# factory function `build_vocab_from_iterator` which accepts iterator that yield list or iterator of tokens. Users can also pass any special symbols to be added to the -# vocabulary. 
- - -from torchtext.data.utils import get_tokenizer -from torchtext.vocab import build_vocab_from_iterator - -tokenizer = get_tokenizer("basic_english") -train_iter = AG_NEWS(split="train") - - -def yield_tokens(data_iter): - for _, text in data_iter: - yield tokenizer(text) - - -vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=[""]) -vocab.set_default_index(vocab[""]) - -###################################################################### -# The vocabulary block converts a list of tokens into integers. -# -# :: -# -# vocab(['here', 'is', 'an', 'example']) -# >>> [475, 21, 30, 5297] -# -# Prepare the text processing pipeline with the tokenizer and vocabulary. The text and label pipelines will be used to process the raw data strings from the dataset iterators. - -text_pipeline = lambda x: vocab(tokenizer(x)) -label_pipeline = lambda x: int(x) - 1 - - -###################################################################### -# The text pipeline converts a text string into a list of integers based on the lookup table defined in the vocabulary. The label pipeline converts the label into integers. For example, -# -# :: -# -# text_pipeline('here is the an example') -# >>> [475, 21, 2, 30, 5297] -# label_pipeline('10') -# >>> 9 -# - - -###################################################################### -# Generate data batch and iterator -# -------------------------------- -# -# `torch.utils.data.DataLoader `__ -# is recommended for PyTorch users (a tutorial is `here `__). -# It works with a map-style dataset that implements the ``getitem()`` and ``len()`` protocols, and represents a map from indices/keys to data samples. It also works with an iterable dataset with the shuffle argument of ``False``. -# -# Before sending to the model, ``collate_fn`` function works on a batch of samples generated from ``DataLoader``. 
The input to ``collate_fn`` is a batch of data with the batch size in ``DataLoader``, and ``collate_fn`` processes them according to the data processing pipelines declared previously. Pay attention here and make sure that ``collate_fn`` is declared as a top level def. This ensures that the function is available in each worker. -# -# In this example, the text entries in the original data batch input are packed into a list and concatenated as a single tensor for the input of ``nn.EmbeddingBag``. The offset is a tensor of delimiters to represent the beginning index of the individual sequence in the text tensor. Label is a tensor saving the labels of individual text entries. - - -from torch.utils.data import DataLoader - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -def collate_batch(batch): - label_list, text_list, offsets = [], [], [0] - for _label, _text in batch: - label_list.append(label_pipeline(_label)) - processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64) - text_list.append(processed_text) - offsets.append(processed_text.size(0)) - label_list = torch.tensor(label_list, dtype=torch.int64) - offsets = torch.tensor(offsets[:-1]).cumsum(dim=0) - text_list = torch.cat(text_list) - return label_list.to(device), text_list.to(device), offsets.to(device) - - -train_iter = AG_NEWS(split="train") -dataloader = DataLoader( - train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch -) - - -###################################################################### -# Define the model -# ---------------- -# -# The model is composed of the `nn.EmbeddingBag `__ layer plus a linear layer for the classification purpose. ``nn.EmbeddingBag`` with the default mode of "mean" computes the mean value of a “bag” of embeddings. Although the text entries here have different lengths, ``nn.EmbeddingBag`` module requires no padding here since the text lengths are saved in offsets. 
-# -# Additionally, since ``nn.EmbeddingBag`` accumulates the average across -# the embeddings on the fly, ``nn.EmbeddingBag`` can enhance the -# performance and memory efficiency to process a sequence of tensors. -# -# .. image:: ../_static/img/text_sentiment_ngrams_model.png -# - -from torch import nn - - -class TextClassificationModel(nn.Module): - def __init__(self, vocab_size, embed_dim, num_class): - super(TextClassificationModel, self).__init__() - self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False) - self.fc = nn.Linear(embed_dim, num_class) - self.init_weights() - - def init_weights(self): - initrange = 0.5 - self.embedding.weight.data.uniform_(-initrange, initrange) - self.fc.weight.data.uniform_(-initrange, initrange) - self.fc.bias.data.zero_() - - def forward(self, text, offsets): - embedded = self.embedding(text, offsets) - return self.fc(embedded) - - -###################################################################### -# Initiate an instance -# -------------------- -# -# The ``AG_NEWS`` dataset has four labels and therefore the number of classes is four. -# -# :: -# -# 1 : World -# 2 : Sports -# 3 : Business -# 4 : Sci/Tec -# -# We build a model with the embedding dimension of 64. The vocab size is equal to the length of the vocabulary instance. The number of classes is equal to the number of labels, -# - -train_iter = AG_NEWS(split="train") -num_class = len(set([label for (label, text) in train_iter])) -vocab_size = len(vocab) -emsize = 64 -model = TextClassificationModel(vocab_size, emsize, num_class).to(device) - - -###################################################################### -# Define functions to train the model and evaluate results. 
-# --------------------------------------------------------- -# - - -import time - - -def train(dataloader): - model.train() - total_acc, total_count = 0, 0 - log_interval = 500 - start_time = time.time() - - for idx, (label, text, offsets) in enumerate(dataloader): - optimizer.zero_grad() - predicted_label = model(text, offsets) - loss = criterion(predicted_label, label) - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) - optimizer.step() - total_acc += (predicted_label.argmax(1) == label).sum().item() - total_count += label.size(0) - if idx % log_interval == 0 and idx > 0: - elapsed = time.time() - start_time - print( - "| epoch {:3d} | {:5d}/{:5d} batches " - "| accuracy {:8.3f}".format( - epoch, idx, len(dataloader), total_acc / total_count - ) - ) - total_acc, total_count = 0, 0 - start_time = time.time() - - -def evaluate(dataloader): - model.eval() - total_acc, total_count = 0, 0 - - with torch.no_grad(): - for idx, (label, text, offsets) in enumerate(dataloader): - predicted_label = model(text, offsets) - loss = criterion(predicted_label, label) - total_acc += (predicted_label.argmax(1) == label).sum().item() - total_count += label.size(0) - return total_acc / total_count - - -###################################################################### -# Split the dataset and run the model -# ----------------------------------- -# -# Since the original ``AG_NEWS`` has no valid dataset, we split the training -# dataset into train/valid sets with a split ratio of 0.95 (train) and -# 0.05 (valid). Here we use -# `torch.utils.data.dataset.random_split `__ -# function in PyTorch core library. -# -# `CrossEntropyLoss `__ -# criterion combines ``nn.LogSoftmax()`` and ``nn.NLLLoss()`` in a single class. -# It is useful when training a classification problem with C classes. -# `SGD `__ -# implements stochastic gradient descent method as the optimizer. The initial -# learning rate is set to 5.0. 
-# `StepLR `__ -# is used here to adjust the learning rate through epochs. -# - - -from torch.utils.data.dataset import random_split -from torchtext.data.functional import to_map_style_dataset - -# Hyperparameters -EPOCHS = 10 # epoch -LR = 5 # learning rate -BATCH_SIZE = 64 # batch size for training - -criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr=LR) -scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) -total_accu = None -train_iter, test_iter = AG_NEWS() -train_dataset = to_map_style_dataset(train_iter) -test_dataset = to_map_style_dataset(test_iter) -num_train = int(len(train_dataset) * 0.95) -split_train_, split_valid_ = random_split( - train_dataset, [num_train, len(train_dataset) - num_train] -) - -train_dataloader = DataLoader( - split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch -) -valid_dataloader = DataLoader( - split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch -) -test_dataloader = DataLoader( - test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch -) - -for epoch in range(1, EPOCHS + 1): - epoch_start_time = time.time() - train(train_dataloader) - accu_val = evaluate(valid_dataloader) - if total_accu is not None and total_accu > accu_val: - scheduler.step() - else: - total_accu = accu_val - print("-" * 59) - print( - "| end of epoch {:3d} | time: {:5.2f}s | " - "valid accuracy {:8.3f} ".format( - epoch, time.time() - epoch_start_time, accu_val - ) - ) - print("-" * 59) - - -###################################################################### -# Evaluate the model with test dataset -# ------------------------------------ -# - - -###################################################################### -# Checking the results of the test dataset… - -print("Checking the results of test dataset.") -accu_test = evaluate(test_dataloader) -print("test accuracy {:8.3f}".format(accu_test)) - - 
-###################################################################### -# Test on a random news -# --------------------- -# -# Use the best model so far and test a golf news. -# - - -ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"} - - -def predict(text, text_pipeline): - with torch.no_grad(): - text = torch.tensor(text_pipeline(text)) - output = model(text, torch.tensor([0])) - return output.argmax(1).item() + 1 - - -ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \ - enduring the season’s worst weather conditions on Sunday at The \ - Open on his way to a closing 75 at Royal Portrush, which \ - considering the wind and the rain was a respectable showing. \ - Thursday’s first round at the WGC-FedEx St. Jude Invitational \ - was another story. With temperatures in the mid-80s and hardly any \ - wind, the Spaniard was 13 strokes better in a flawless round. \ - Thanks to his best putting performance on the PGA Tour, Rahm \ - finished with an 8-under 62 for a three-stroke lead, which \ - was even more impressive considering he’d never played the \ - front nine at TPC Southwind." - -model = model.to("cpu") - -print("This is a %s news" % ag_news_label[predict(ex_text_str, text_pipeline)]) diff --git a/beginner_source/text_sentiment_ngrams_tutorial.rst b/beginner_source/text_sentiment_ngrams_tutorial.rst new file mode 100644 index 00000000000..024d04056c5 --- /dev/null +++ b/beginner_source/text_sentiment_ngrams_tutorial.rst @@ -0,0 +1,12 @@ +:orphan: + +Text classification with the torchtext library +============================================== + +This tutorial has been deprecated. + +Redirecting in 3 seconds... + +.. 
raw:: html + + diff --git a/beginner_source/torchtext_custom_dataset_tutorial.py b/beginner_source/torchtext_custom_dataset_tutorial.py deleted file mode 100644 index 9875d8aa43a..00000000000 --- a/beginner_source/torchtext_custom_dataset_tutorial.py +++ /dev/null @@ -1,384 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Preprocess custom text dataset using Torchtext -=============================================== - -**Author**: `Anupam Sharma `_ - -This tutorial illustrates the usage of torchtext on a dataset that is not built-in. In the tutorial, -we will preprocess a dataset that can be further utilized to train a sequence-to-sequence -model for machine translation (something like, in this tutorial: `Sequence to Sequence Learning -with Neural Networks `_) but without using legacy version -of torchtext. - -In this tutorial, we will learn how to: - -* Read a dataset -* Tokenize sentence -* Apply transforms to sentence -* Perform bucket batching - -Let us assume that we need to prepare a dataset to train a model that can perform English to -German translation. We will use a tab-delimited German - English sentence pairs provided by -the `Tatoeba Project `_ which can be downloaded from -`this link `__. - -Sentence pairs for other languages can be found in `this link `\ -__. -""" - -# %% -# Setup -# ----- -# -# First, download the dataset, extract the zip, and note the path to the file `deu.txt`. -# -# Ensure that following packages are installed: -# -# * `Torchdata 0.6.0 `_ (`Installation instructions \ -# `__) -# * `Torchtext 0.15.0 `_ (`Installation instructions \ -# `__) -# * `Spacy `__ -# -# Here, we are using `Spacy` to tokenize text. In simple words tokenization means to -# convert a sentence to list of words. Spacy is a python package used for various Natural -# Language Processing (NLP) tasks. -# -# Download the English and German models from Spacy as shown below: -# -# .. 
code-block:: shell -# -# python -m spacy download en_core_web_sm -# python -m spacy download de_core_news_sm -# - - -# %% -# Let us start by importing required modules: - -import torchdata.datapipes as dp -import torchtext.transforms as T -import spacy -from torchtext.vocab import build_vocab_from_iterator -eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text -de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text - -# %% -# Now we will load the dataset - -FILE_PATH = 'data/deu.txt' -data_pipe = dp.iter.IterableWrapper([FILE_PATH]) -data_pipe = dp.iter.FileOpener(data_pipe, mode='rb') -data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True) - -# %% -# In the above code block, we are doing following things: -# -# 1. At line 2, we are creating an iterable of filenames -# 2. At line 3, we pass the iterable to `FileOpener` which then -# opens the file in read mode -# 3. At line 4, we call a function to parse the file, which -# again returns an iterable of tuples representing each rows -# of the tab-delimited file -# -# DataPipes can be thought of something like a dataset object, on which -# we can perform various operations. -# Check `this tutorial `_ for more details on -# DataPipes. -# -# We can verify if the iterable has the pair of sentences as shown -# below: - -for sample in data_pipe: - print(sample) - break - -# %% -# Note that we also have attribution details along with pair of sentences. We will -# write a small function to remove the attribution details: - -def removeAttribution(row): - """ - Function to keep the first two elements in a tuple - """ - return row[:2] -data_pipe = data_pipe.map(removeAttribution) - -# %% -# The `map` function at line 6 in above code block can be used to apply some function -# on each elements of `data_pipe`. Now, we can verify that the `data_pipe` only contains -# pair of sentences. 
- - -for sample in data_pipe: - print(sample) - break - -# %% -# Now, let us define few functions to perform tokenization: - -def engTokenize(text): - """ - Tokenize an English text and return a list of tokens - """ - return [token.text for token in eng.tokenizer(text)] - -def deTokenize(text): - """ - Tokenize a German text and return a list of tokens - """ - return [token.text for token in de.tokenizer(text)] - -# %% -# Above function accepts a text and returns a list of words -# as shown below: - -print(engTokenize("Have a good day!!!")) -print(deTokenize("Haben Sie einen guten Tag!!!")) - -# %% -# Building the vocabulary -# ----------------------- -# Let us consider an English sentence as the source and a German sentence as the target. -# -# Vocabulary can be considered as the set of unique words we have in the dataset. -# We will build vocabulary for both our source and target now. -# -# Let us define a function to get tokens from elements of tuples in the iterator. - - -def getTokens(data_iter, place): - """ - Function to yield tokens from an iterator. Since, our iterator contains - tuple of sentences (source and target), `place` parameters defines for which - index to return the tokens for. `place=0` for source and `place=1` for target - """ - for english, german in data_iter: - if place == 0: - yield engTokenize(english) - else: - yield deTokenize(german) - -# %% -# Now, we will build vocabulary for source: - -source_vocab = build_vocab_from_iterator( - getTokens(data_pipe,0), - min_freq=2, - specials= ['', '', '', ''], - special_first=True -) -source_vocab.set_default_index(source_vocab['']) - -# %% -# The code above, builds the vocabulary from the iterator. In the above code block: -# -# * At line 2, we call the `getTokens()` function with `place=0` as we need vocabulary for -# source sentences. -# * At line 3, we set `min_freq=2`. This means, the function will skip those words that occurs -# less than 2 times. 
-# * At line 4, we specify some special tokens: -# -# * `` for start of sentence -# * `` for end of sentence -# * `` for unknown words. An example of unknown word is the one skipped because of -# `min_freq=2`. -# * `` is the padding token. While training, a model we mostly train in batches. In a -# batch, there can be sentences of different length. So, we pad the shorter sentences with -# `` token to make length of all sequences in the batch equal. -# -# * At line 5, we set `special_first=True`. Which means `` will get index 0, `` index 1, -# `` index 2, and will get index 3 in the vocabulary. -# * At line 7, we set default index as index of ``. That means if some word is not in -# vocabulary, we will use `` instead of that unknown word. -# -# Similarly, we will build vocabulary for target sentences: - -target_vocab = build_vocab_from_iterator( - getTokens(data_pipe,1), - min_freq=2, - specials= ['', '', '', ''], - special_first=True -) -target_vocab.set_default_index(target_vocab['']) - -# %% -# Note that the example above shows how can we add special tokens to our vocabulary. The -# special tokens may change based on the requirements. -# -# Now, we can verify that special tokens are placed at the beginning and then other words. -# In the below code, `source_vocab.get_itos()` returns a list with tokens at index based on -# vocabulary. - -print(source_vocab.get_itos()[:9]) - -# %% -# Numericalize sentences using vocabulary -# --------------------------------------- -# After building the vocabulary, we need to convert our sentences to corresponding indices. -# Let us define some functions for this: - -def getTransform(vocab): - """ - Create transforms based on given vocabulary. The returned transform is applied to sequence - of tokens. - """ - text_tranform = T.Sequential( - ## converts the sentences to indices based on given vocabulary - T.VocabTransform(vocab=vocab), - ## Add at beginning of each sentence. 
1 because the index for in vocabulary is - # 1 as seen in previous section - T.AddToken(1, begin=True), - ## Add at beginning of each sentence. 2 because the index for in vocabulary is - # 2 as seen in previous section - T.AddToken(2, begin=False) - ) - return text_tranform - -# %% -# Now, let us see how to use the above function. The function returns an object of `Transforms` -# which we will use on our sentence. Let us take a random sentence and check how the transform -# works. - -temp_list = list(data_pipe) -some_sentence = temp_list[798][0] -print("Some sentence=", end="") -print(some_sentence) -transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence)) -print("Transformed sentence=", end="") -print(transformed_sentence) -index_to_string = source_vocab.get_itos() -for index in transformed_sentence: - print(index_to_string[index], end=" ") - -# %% -# In the above code,: -# -# * At line 2, we take a source sentence from list that we created from `data_pipe` at line 1 -# * At line 5, we get a transform based on a source vocabulary and apply it to a tokenized -# sentence. Note that transforms take list of words and not a sentence. -# * At line 8, we get the mapping of index to string and then use it get the transformed -# sentence -# -# Now we will use DataPipe functions to apply transform to all our sentences. -# Let us define some more functions for this. - -def applyTransform(sequence_pair): - """ - Apply transforms to sequence of tokens in a sequence pair - """ - - return ( - getTransform(source_vocab)(engTokenize(sequence_pair[0])), - getTransform(target_vocab)(deTokenize(sequence_pair[1])) - ) -data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator -temp_list = list(data_pipe) -print(temp_list[0]) - -# %% -# Make batches (with bucket batch) -# -------------------------------- -# Generally, we train models in batches. 
While working for sequence to sequence models, it is -# recommended to keep the length of sequences in a batch similar. For that we will use -# `bucketbatch` function of `data_pipe`. -# -# Let us define some functions that will be used by the `bucketbatch` function. - -def sortBucket(bucket): - """ - Function to sort a given bucket. Here, we want to sort based on the length of - source and target sequence. - """ - return sorted(bucket, key=lambda x: (len(x[0]), len(x[1]))) - -# %% -# Now, we will apply the `bucketbatch` function: - -data_pipe = data_pipe.bucketbatch( - batch_size = 4, batch_num=5, bucket_num=1, - use_in_batch_shuffle=False, sort_key=sortBucket -) - -# %% -# In the above code block: -# -# * We keep batch size = 4. -# * `batch_num` is the number of batches to keep in a bucket -# * `bucket_num` is the number of buckets to keep in a pool for shuffling -# * `sort_key` specifies the function that takes a bucket and sorts it -# -# Now, let us consider a batch of source sentences as `X` and a batch of target sentences as `y`. -# Generally, while training a model, we predict on a batch of `X` and compare the result with `y`. -# But, a batch in our `data_pipe` is of the form `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`: - -print(list(data_pipe)[0]) -# %% -# So, we will now convert them into the form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`. -# For this we will write a small function: - -def separateSourceTarget(sequence_pairs): - """ - input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]` - output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))` - """ - sources,targets = zip(*sequence_pairs) - return sources,targets - -## Apply the function to each element in the iterator -data_pipe = data_pipe.map(separateSourceTarget) -print(list(data_pipe)[0]) - -# %% -# Now, we have the data as desired. 
-# -# Padding -# ------- -# As discussed earlier while building vocabulary, we need to pad shorter sentences in a batch to -# make all the sequences in a batch of equal length. We can perform padding as follows: - -def applyPadding(pair_of_sequences): - """ - Convert sequences to tensors and apply padding - """ - return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1]))) -## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies -# padding. Here, `0` is passed to the constructor to specify the index of the `` token in the -# vocabulary. -data_pipe = data_pipe.map(applyPadding) - -# %% -# Now, we can use the index to string mapping to see how the sequence would look with tokens -# instead of indices: - -source_index_to_string = source_vocab.get_itos() -target_index_to_string = target_vocab.get_itos() - -def showSomeTransformedSentences(data_pipe): - """ - Function to show how the sentences look like after applying all transforms. - Here we try to print actual words instead of corresponding index - """ - for sources,targets in data_pipe: - if sources[0][-1] != 0: - continue # Just to visualize padding of shorter sentences - for i in range(4): - source = "" - for token in sources[i]: - source += " " + source_index_to_string[token] - target = "" - for token in targets[i]: - target += " " + target_index_to_string[token] - print(f"Source: {source}") - print(f"Traget: {target}") - break - -showSomeTransformedSentences(data_pipe) -# %% -# In the above output we can observe that the shorter sentences are padded with ``. Now, we -# can use `data_pipe` while writing our training function. -# -# Some parts of this tutorial was inspired from `this article -# `__. 
diff --git a/beginner_source/torchtext_custom_dataset_tutorial.rst b/beginner_source/torchtext_custom_dataset_tutorial.rst new file mode 100644 index 00000000000..9f014f3ff9a --- /dev/null +++ b/beginner_source/torchtext_custom_dataset_tutorial.rst @@ -0,0 +1,12 @@ +:orphan: + +Preprocess custom text dataset using torchtext +============================================== + +This tutorial has been deprecated. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py index 7a2b053763a..de7a178bd7d 100644 --- a/beginner_source/transfer_learning_tutorial.py +++ b/beginner_source/transfer_learning_tutorial.py @@ -209,7 +209,7 @@ def train_model(model, criterion, optimizer, scheduler, num_epochs=25): print(f'Best val Acc: {best_acc:4f}') # load best model weights - model.load_state_dict(torch.load(best_model_params_path)) + model.load_state_dict(torch.load(best_model_params_path, weights_only=True)) return model diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py deleted file mode 100644 index d7ebee959e5..00000000000 --- a/beginner_source/transformer_tutorial.py +++ /dev/null @@ -1,377 +0,0 @@ -""" -Language Modeling with ``nn.Transformer`` and torchtext -=============================================================== - -This is a tutorial on training a model to predict the next word in a sequence using the -`nn.Transformer `__ module. - -The PyTorch 1.2 release includes a standard transformer module based on the -paper `Attention is All You Need `__. -Compared to Recurrent Neural Networks (RNNs), the transformer model has proven -to be superior in quality for many sequence-to-sequence tasks while being more -parallelizable. The ``nn.Transformer`` module relies entirely on an attention -mechanism (implemented as -`nn.MultiheadAttention `__) -to draw global dependencies between input and output. 
The ``nn.Transformer`` -module is highly modularized such that a single component (e.g., -`nn.TransformerEncoder `__) -can be easily adapted/composed. - -.. image:: ../_static/img/transformer_architecture.jpg - -""" - -###################################################################### -# Define the model -# ---------------- -# - - -###################################################################### -# In this tutorial, we train a ``nn.TransformerEncoder`` model on a -# language modeling task. Please note that this tutorial does not cover -# the training of `nn.TransformerDecoder `__, as depicted in -# the right half of the diagram above. The language modeling task is to assign a -# probability for the likelihood of a given word (or a sequence of words) -# to follow a sequence of words. A sequence of tokens are passed to the embedding -# layer first, followed by a positional encoding layer to account for the order -# of the word (see the next paragraph for more details). The -# ``nn.TransformerEncoder`` consists of multiple layers of -# `nn.TransformerEncoderLayer `__. -# Along with the input sequence, a square attention mask is required because the -# self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend -# the earlier positions in the sequence. For the language modeling task, any -# tokens on the future positions should be masked. To produce a probability -# distribution over output words, the output of the ``nn.TransformerEncoder`` -# model is passed through a linear layer to output unnormalized logits. -# The log-softmax function isn't applied here due to the later use of -# `CrossEntropyLoss `__, -# which requires the inputs to be unnormalized logits. 
-# - -import math -import os -from tempfile import TemporaryDirectory -from typing import Tuple - -import torch -from torch import nn, Tensor -from torch.nn import TransformerEncoder, TransformerEncoderLayer -from torch.utils.data import dataset - -class TransformerModel(nn.Module): - - def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int, - nlayers: int, dropout: float = 0.5): - super().__init__() - self.model_type = 'Transformer' - self.pos_encoder = PositionalEncoding(d_model, dropout) - encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout) - self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.embedding = nn.Embedding(ntoken, d_model) - self.d_model = d_model - self.linear = nn.Linear(d_model, ntoken) - - self.init_weights() - - def init_weights(self) -> None: - initrange = 0.1 - self.embedding.weight.data.uniform_(-initrange, initrange) - self.linear.bias.data.zero_() - self.linear.weight.data.uniform_(-initrange, initrange) - - def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor: - """ - Arguments: - src: Tensor, shape ``[seq_len, batch_size]`` - src_mask: Tensor, shape ``[seq_len, seq_len]`` - - Returns: - output Tensor of shape ``[seq_len, batch_size, ntoken]`` - """ - src = self.embedding(src) * math.sqrt(self.d_model) - src = self.pos_encoder(src) - output = self.transformer_encoder(src, src_mask) - output = self.linear(output) - return output - - -###################################################################### -# ``PositionalEncoding`` module injects some information about the -# relative or absolute position of the tokens in the sequence. The -# positional encodings have the same dimension as the embeddings so that -# the two can be summed. Here, we use ``sine`` and ``cosine`` functions of -# different frequencies. 
-# - -class PositionalEncoding(nn.Module): - - def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): - super().__init__() - self.dropout = nn.Dropout(p=dropout) - - position = torch.arange(max_len).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)) - pe = torch.zeros(max_len, 1, d_model) - pe[:, 0, 0::2] = torch.sin(position * div_term) - pe[:, 0, 1::2] = torch.cos(position * div_term) - self.register_buffer('pe', pe) - - def forward(self, x: Tensor) -> Tensor: - """ - Arguments: - x: Tensor, shape ``[seq_len, batch_size, embedding_dim]`` - """ - x = x + self.pe[:x.size(0)] - return self.dropout(x) - - -###################################################################### -# Load and batch data -# ------------------- -# - - -###################################################################### -# This tutorial uses ``torchtext`` to generate Wikitext-2 dataset. -# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. -# %% -# .. code-block:: bash -# -# %%bash -# pip install portalocker -# pip install torchdata -# -# The vocab object is built based on the train dataset and is used to numericalize -# tokens into tensors. Wikitext-2 represents rare tokens as ``. -# -# Given a 1-D vector of sequential data, ``batchify()`` arranges the data -# into ``batch_size`` columns. If the data does not divide evenly into -# ``batch_size`` columns, then the data is trimmed to fit. For instance, with -# the alphabet as the data (total length of 26) and ``batch_size=4``, we would -# divide the alphabet into sequences of length 6, resulting in 4 of such sequences. -# -# .. 
math:: -# \begin{bmatrix} -# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} -# \end{bmatrix} -# \Rightarrow -# \begin{bmatrix} -# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & -# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & -# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & -# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} -# \end{bmatrix} -# -# Batching enables more parallelizable processing. However, batching means that -# the model treats each column independently; for example, the dependence of -# ``G`` and ``F`` can not be learned in the example above. -# - -from torchtext.datasets import WikiText2 -from torchtext.data.utils import get_tokenizer -from torchtext.vocab import build_vocab_from_iterator - -train_iter = WikiText2(split='train') -tokenizer = get_tokenizer('basic_english') -vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['']) -vocab.set_default_index(vocab['']) - -def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor: - """Converts raw text into a flat Tensor.""" - data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] - return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) - -# ``train_iter`` was "consumed" by the process of building the vocab, -# so we have to create it again -train_iter, val_iter, test_iter = WikiText2() -train_data = data_process(train_iter) -val_data = data_process(val_iter) -test_data = data_process(test_iter) - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -def batchify(data: Tensor, bsz: int) -> Tensor: - """Divides the data into ``bsz`` separate sequences, removing extra elements - that wouldn't cleanly fit. 
- - Arguments: - data: Tensor, shape ``[N]`` - bsz: int, batch size - - Returns: - Tensor of shape ``[N // bsz, bsz]`` - """ - seq_len = data.size(0) // bsz - data = data[:seq_len * bsz] - data = data.view(bsz, seq_len).t().contiguous() - return data.to(device) - -batch_size = 20 -eval_batch_size = 10 -train_data = batchify(train_data, batch_size) # shape ``[seq_len, batch_size]`` -val_data = batchify(val_data, eval_batch_size) -test_data = batchify(test_data, eval_batch_size) - - -###################################################################### -# Functions to generate input and target sequence -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# ``get_batch()`` generates a pair of input-target sequences for -# the transformer model. It subdivides the source data into chunks of -# length ``bptt``. For the language modeling task, the model needs the -# following words as ``Target``. For example, with a ``bptt`` value of 2, -# we’d get the following two Variables for ``i`` = 0: -# -# .. image:: ../_static/img/transformer_input_target.png -# -# It should be noted that the chunks are along dimension 0, consistent -# with the ``S`` dimension in the Transformer model. The batch dimension -# ``N`` is along dimension 1. 
-# - -bptt = 35 -def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]: - """ - Args: - source: Tensor, shape ``[full_seq_len, batch_size]`` - i: int - - Returns: - tuple (data, target), where data has shape ``[seq_len, batch_size]`` and - target has shape ``[seq_len * batch_size]`` - """ - seq_len = min(bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].reshape(-1) - return data, target - - -###################################################################### -# Initiate an instance -# -------------------- -# - - -###################################################################### -# The model hyperparameters are defined below. The ``vocab`` size is -# equal to the length of the vocab object. -# - -ntokens = len(vocab) # size of vocabulary -emsize = 200 # embedding dimension -d_hid = 200 # dimension of the feedforward network model in ``nn.TransformerEncoder`` -nlayers = 2 # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder`` -nhead = 2 # number of heads in ``nn.MultiheadAttention`` -dropout = 0.2 # dropout probability -model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device) - - -###################################################################### -# Run the model -# ------------- -# - - -###################################################################### -# We use `CrossEntropyLoss `__ -# with the `SGD `__ -# (stochastic gradient descent) optimizer. The learning rate is initially set to -# 5.0 and follows a `StepLR `__ -# schedule. During training, we use `nn.utils.clip_grad_norm\_ `__ -# to prevent gradients from exploding. -# - -import time - -criterion = nn.CrossEntropyLoss() -lr = 5.0 # learning rate -optimizer = torch.optim.SGD(model.parameters(), lr=lr) -scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) - -def train(model: nn.Module) -> None: - model.train() # turn on train mode - total_loss = 0. 
- log_interval = 200 - start_time = time.time() - - num_batches = len(train_data) // bptt - for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)): - data, targets = get_batch(train_data, i) - output = model(data) - output_flat = output.view(-1, ntokens) - loss = criterion(output_flat, targets) - - optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) - optimizer.step() - - total_loss += loss.item() - if batch % log_interval == 0 and batch > 0: - lr = scheduler.get_last_lr()[0] - ms_per_batch = (time.time() - start_time) * 1000 / log_interval - cur_loss = total_loss / log_interval - ppl = math.exp(cur_loss) - print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | ' - f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | ' - f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}') - total_loss = 0 - start_time = time.time() - -def evaluate(model: nn.Module, eval_data: Tensor) -> float: - model.eval() # turn on evaluation mode - total_loss = 0. - with torch.no_grad(): - for i in range(0, eval_data.size(0) - 1, bptt): - data, targets = get_batch(eval_data, i) - seq_len = data.size(0) - output = model(data) - output_flat = output.view(-1, ntokens) - total_loss += seq_len * criterion(output_flat, targets).item() - return total_loss / (len(eval_data) - 1) - -###################################################################### -# Loop over epochs. Save the model if the validation loss is the best -# we've seen so far. Adjust the learning rate after each epoch. 
- -best_val_loss = float('inf') -epochs = 3 - -with TemporaryDirectory() as tempdir: - best_model_params_path = os.path.join(tempdir, "best_model_params.pt") - - for epoch in range(1, epochs + 1): - epoch_start_time = time.time() - train(model) - val_loss = evaluate(model, val_data) - val_ppl = math.exp(val_loss) - elapsed = time.time() - epoch_start_time - print('-' * 89) - print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | ' - f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}') - print('-' * 89) - - if val_loss < best_val_loss: - best_val_loss = val_loss - torch.save(model.state_dict(), best_model_params_path) - - scheduler.step() - model.load_state_dict(torch.load(best_model_params_path)) # load best model states - - -###################################################################### -# Evaluate the best model on the test dataset -# ------------------------------------------- -# - -test_loss = evaluate(model, test_data) -test_ppl = math.exp(test_loss) -print('=' * 89) -print(f'| End of training | test loss {test_loss:5.2f} | ' - f'test ppl {test_ppl:8.2f}') -print('=' * 89) diff --git a/beginner_source/transformer_tutorial.rst b/beginner_source/transformer_tutorial.rst new file mode 100644 index 00000000000..0bb2ffc784f --- /dev/null +++ b/beginner_source/transformer_tutorial.rst @@ -0,0 +1,8 @@ +Language Modeling with ``nn.Transformer`` and torchtext +======================================================= + +The content is deprecated. + +.. raw:: html + + diff --git a/beginner_source/translation_transformer.py b/beginner_source/translation_transformer.py deleted file mode 100644 index c5553246e38..00000000000 --- a/beginner_source/translation_transformer.py +++ /dev/null @@ -1,404 +0,0 @@ -""" -Language Translation with ``nn.Transformer`` and torchtext -========================================================== - -This tutorial shows: - - How to train a translation model from scratch using Transformer. 
- - Use torchtext library to access `Multi30k `__ dataset to train a German to English translation model. -""" - - -###################################################################### -# Data Sourcing and Processing -# ---------------------------- -# -# `torchtext library `__ has utilities for creating datasets that can be easily -# iterated through for the purposes of creating a language translation -# model. In this example, we show how to use torchtext's inbuilt datasets, -# tokenize a raw text sentence, build vocabulary, and numericalize tokens into tensor. We will use -# `Multi30k dataset from torchtext library `__ -# that yields a pair of source-target raw sentences. -# -# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. -# - -from torchtext.data.utils import get_tokenizer -from torchtext.vocab import build_vocab_from_iterator -from torchtext.datasets import multi30k, Multi30k -from typing import Iterable, List - - -# We need to modify the URLs for the dataset since the links to the original dataset are broken -# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info -multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz" -multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz" - -SRC_LANGUAGE = 'de' -TGT_LANGUAGE = 'en' - -# Place-holders -token_transform = {} -vocab_transform = {} - -################################################################################### -# Create source and target language tokenizer. Make sure to install the dependencies. -# -# .. 
code-block:: python -# -# pip install -U torchdata -# pip install -U spacy -# python -m spacy download en_core_web_sm -# python -m spacy download de_core_news_sm - -token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm') -token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm') - - -# helper function to yield list of tokens -def yield_tokens(data_iter: Iterable, language: str) -> List[str]: - language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1} - - for data_sample in data_iter: - yield token_transform[language](data_sample[language_index[language]]) - -# Define special symbols and indices -UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3 -# Make sure the tokens are in order of their indices to properly insert them in vocab -special_symbols = ['', '', '', ''] - -for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: - # Training data Iterator - train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)) - # Create torchtext's Vocab object - vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln), - min_freq=1, - specials=special_symbols, - special_first=True) - -# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found. -# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary. -for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: - vocab_transform[ln].set_default_index(UNK_IDX) - -###################################################################### -# Seq2Seq Network using Transformer -# --------------------------------- -# -# Transformer is a Seq2Seq model introduced in `“Attention is all you -# need” `__ -# paper for solving machine translation tasks. -# Below, we will create a Seq2Seq network that uses Transformer. The network -# consists of three parts. First part is the embedding layer. This layer converts tensor of input indices -# into corresponding tensor of input embeddings. 
These embedding are further augmented with positional -# encodings to provide position information of input tokens to the model. The second part is the -# actual `Transformer `__ model. -# Finally, the output of the Transformer model is passed through linear layer -# that gives unnormalized probabilities for each token in the target language. -# - - -from torch import Tensor -import torch -import torch.nn as nn -from torch.nn import Transformer -import math -DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# helper Module that adds positional encoding to the token embedding to introduce a notion of word order. -class PositionalEncoding(nn.Module): - def __init__(self, - emb_size: int, - dropout: float, - maxlen: int = 5000): - super(PositionalEncoding, self).__init__() - den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size) - pos = torch.arange(0, maxlen).reshape(maxlen, 1) - pos_embedding = torch.zeros((maxlen, emb_size)) - pos_embedding[:, 0::2] = torch.sin(pos * den) - pos_embedding[:, 1::2] = torch.cos(pos * den) - pos_embedding = pos_embedding.unsqueeze(-2) - - self.dropout = nn.Dropout(dropout) - self.register_buffer('pos_embedding', pos_embedding) - - def forward(self, token_embedding: Tensor): - return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :]) - -# helper Module to convert tensor of input indices into corresponding tensor of token embeddings -class TokenEmbedding(nn.Module): - def __init__(self, vocab_size: int, emb_size): - super(TokenEmbedding, self).__init__() - self.embedding = nn.Embedding(vocab_size, emb_size) - self.emb_size = emb_size - - def forward(self, tokens: Tensor): - return self.embedding(tokens.long()) * math.sqrt(self.emb_size) - -# Seq2Seq Network -class Seq2SeqTransformer(nn.Module): - def __init__(self, - num_encoder_layers: int, - num_decoder_layers: int, - emb_size: int, - nhead: int, - src_vocab_size: int, - tgt_vocab_size: int, - dim_feedforward: int 
= 512, - dropout: float = 0.1): - super(Seq2SeqTransformer, self).__init__() - self.transformer = Transformer(d_model=emb_size, - nhead=nhead, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - dim_feedforward=dim_feedforward, - dropout=dropout) - self.generator = nn.Linear(emb_size, tgt_vocab_size) - self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size) - self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size) - self.positional_encoding = PositionalEncoding( - emb_size, dropout=dropout) - - def forward(self, - src: Tensor, - trg: Tensor, - src_mask: Tensor, - tgt_mask: Tensor, - src_padding_mask: Tensor, - tgt_padding_mask: Tensor, - memory_key_padding_mask: Tensor): - src_emb = self.positional_encoding(self.src_tok_emb(src)) - tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg)) - outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, - src_padding_mask, tgt_padding_mask, memory_key_padding_mask) - return self.generator(outs) - - def encode(self, src: Tensor, src_mask: Tensor): - return self.transformer.encoder(self.positional_encoding( - self.src_tok_emb(src)), src_mask) - - def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor): - return self.transformer.decoder(self.positional_encoding( - self.tgt_tok_emb(tgt)), memory, - tgt_mask) - - -###################################################################### -# During training, we need a subsequent word mask that will prevent the model from looking into -# the future words when making predictions. We will also need masks to hide -# source and target padding tokens. Below, let's define a function that will take care of both. 
-# - - -def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) - return mask - - -def create_mask(src, tgt): - src_seq_len = src.shape[0] - tgt_seq_len = tgt.shape[0] - - tgt_mask = generate_square_subsequent_mask(tgt_seq_len) - src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool) - - src_padding_mask = (src == PAD_IDX).transpose(0, 1) - tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1) - return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask - - -###################################################################### -# Let's now define the parameters of our model and instantiate the same. Below, we also -# define our loss function which is the cross-entropy loss and the optimizer used for training. -# -torch.manual_seed(0) - -SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE]) -TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE]) -EMB_SIZE = 512 -NHEAD = 8 -FFN_HID_DIM = 512 -BATCH_SIZE = 128 -NUM_ENCODER_LAYERS = 3 -NUM_DECODER_LAYERS = 3 - -transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, - NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM) - -for p in transformer.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - -transformer = transformer.to(DEVICE) - -loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX) - -optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9) - -###################################################################### -# Collation -# --------- -# -# As seen in the ``Data Sourcing and Processing`` section, our data iterator yields a pair of raw strings. -# We need to convert these string pairs into the batched tensors that can be processed by our ``Seq2Seq`` network -# defined previously. 
Below we define our collate function that converts a batch of raw strings into batch tensors that -# can be fed directly into our model. -# - - -from torch.nn.utils.rnn import pad_sequence - -# helper function to club together sequential operations -def sequential_transforms(*transforms): - def func(txt_input): - for transform in transforms: - txt_input = transform(txt_input) - return txt_input - return func - -# function to add BOS/EOS and create tensor for input sequence indices -def tensor_transform(token_ids: List[int]): - return torch.cat((torch.tensor([BOS_IDX]), - torch.tensor(token_ids), - torch.tensor([EOS_IDX]))) - -# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices -text_transform = {} -for ln in [SRC_LANGUAGE, TGT_LANGUAGE]: - text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization - vocab_transform[ln], #Numericalization - tensor_transform) # Add BOS/EOS and create tensor - - -# function to collate data samples into batch tensors -def collate_fn(batch): - src_batch, tgt_batch = [], [] - for src_sample, tgt_sample in batch: - src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n"))) - tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n"))) - - src_batch = pad_sequence(src_batch, padding_value=PAD_IDX) - tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX) - return src_batch, tgt_batch - -###################################################################### -# Let's define training and evaluation loop that will be called for each -# epoch. 
-# - -from torch.utils.data import DataLoader - -def train_epoch(model, optimizer): - model.train() - losses = 0 - train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)) - train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn) - - for src, tgt in train_dataloader: - src = src.to(DEVICE) - tgt = tgt.to(DEVICE) - - tgt_input = tgt[:-1, :] - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input) - - logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - tgt_out = tgt[1:, :] - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - losses += loss.item() - - return losses / len(list(train_dataloader)) - - -def evaluate(model): - model.eval() - losses = 0 - - val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)) - val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn) - - for src, tgt in val_dataloader: - src = src.to(DEVICE) - tgt = tgt.to(DEVICE) - - tgt_input = tgt[:-1, :] - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input) - - logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask) - - tgt_out = tgt[1:, :] - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - losses += loss.item() - - return losses / len(list(val_dataloader)) - -###################################################################### -# Now we have all the ingredients to train our model. Let's do it! 
-# - -from timeit import default_timer as timer -NUM_EPOCHS = 18 - -for epoch in range(1, NUM_EPOCHS+1): - start_time = timer() - train_loss = train_epoch(transformer, optimizer) - end_time = timer() - val_loss = evaluate(transformer) - print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s")) - - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol): - src = src.to(DEVICE) - src_mask = src_mask.to(DEVICE) - - memory = model.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE) - for i in range(max_len-1): - memory = memory.to(DEVICE) - tgt_mask = (generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).to(DEVICE) - out = model.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == EOS_IDX: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, src_sentence: str): - model.eval() - src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1) - num_tokens = src.shape[0] - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool) - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten() - return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("", "").replace("", "") - - -###################################################################### -# - -print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .")) - - -###################################################################### -# References -# ---------- -# -# 1. Attention is all you need paper. 
-# https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf -# 2. The annotated transformer. https://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding diff --git a/beginner_source/translation_transformer.rst b/beginner_source/translation_transformer.rst new file mode 100644 index 00000000000..892c1b73ca5 --- /dev/null +++ b/beginner_source/translation_transformer.rst @@ -0,0 +1,10 @@ +Language Translation with ``nn.Transformer`` and torchtext +========================================================== + +This tutorial has been deprecated. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/beginner_source/vt_tutorial.py b/beginner_source/vt_tutorial.py index 1b0a93b8b4b..777098be946 100644 --- a/beginner_source/vt_tutorial.py +++ b/beginner_source/vt_tutorial.py @@ -1,6 +1,6 @@ """ Optimizing Vision Transformer Model for Deployment -=========================== +================================================== `Jeff Tang `_, `Geeta Chauhan `_ @@ -241,7 +241,7 @@ ###################################################################### # The results running on a Google Colab are: # -# :: +# .. code-block:: sh # # original model: 1236.69ms # scripted model: 1226.72ms diff --git a/conf.py b/conf.py index a4d8ca1af99..a12a05d21c2 100644 --- a/conf.py +++ b/conf.py @@ -42,8 +42,10 @@ import distutils.file_util import re from get_sphinx_filenames import SPHINX_SHOULD_RUN - +import pandocfilters +import pypandoc import plotly.io as pio +from pathlib import Path pio.renderers.default = 'sphinx_gallery' @@ -66,6 +68,12 @@ # # needs_sphinx = '1.0' +html_meta = { + 'description': 'Master PyTorch with our step-by-step tutorials for all skill levels. Start your journey to becoming a PyTorch expert today!', + 'keywords': 'PyTorch, tutorials, Getting Started, deep learning, AI', + 'author': 'PyTorch Contributors' +} + # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. @@ -74,12 +82,13 @@ 'sphinx.ext.intersphinx', 'sphinx_copybutton', 'sphinx_gallery.gen_gallery', - 'sphinx_design' + 'sphinx_design', + 'sphinx_sitemap' ] intersphinx_mapping = { "torch": ("https://pytorch.org/docs/stable/", None), - "tensordict": ("https://pytorch-labs.github.io/tensordict/", None), + "tensordict": ("https://pytorch.github.io/tensordict/", None), "torchrl": ("https://pytorch.org/rl/", None), "torchaudio": ("https://pytorch.org/audio/stable/", None), "torchtext": ("https://pytorch.org/text/stable/", None), @@ -106,9 +115,21 @@ def reset_seeds(gallery_conf, fname): 'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n" "# https://pytorch.org/tutorials/beginner/colab\n" "%matplotlib inline"), - 'reset_modules': (reset_seeds) + 'reset_modules': (reset_seeds), + 'ignore_pattern': r'_torch_export_nightly_tutorial.py', + 'pypandoc': {'extra_args': ['--mathjax', '--toc'], + 'filters': ['.jenkins/custom_pandoc_filter.py'], + }, } +html_baseurl = 'https://pytorch.org/tutorials/' # needed for sphinx-sitemap +sitemap_locales = [None] +sitemap_excludes = [ + "search.html", + "genindex.html", +] +sitemap_url_scheme = "{link}" + if os.getenv('GALLERY_PATTERN'): # GALLERY_PATTERN is to be used when you want to work on a single # tutorial. Previously this was fed into filename_pattern, but @@ -120,22 +141,17 @@ def reset_seeds(gallery_conf, fname): sphinx_gallery_conf['ignore_pattern'] = r'/(?!' 
+ re.escape(os.getenv('GALLERY_PATTERN')) + r')[^/]+$' for i in range(len(sphinx_gallery_conf['examples_dirs'])): - gallery_dir = sphinx_gallery_conf['gallery_dirs'][i] - source_dir = sphinx_gallery_conf['examples_dirs'][i] - # Create gallery dirs if it doesn't exist - try: - os.mkdir(gallery_dir) - except OSError: - pass + gallery_dir = Path(sphinx_gallery_conf["gallery_dirs"][i]) + source_dir = Path(sphinx_gallery_conf["examples_dirs"][i]) # Copy rst files from source dir to gallery dir - for f in glob.glob(os.path.join(source_dir, '*.rst')): - distutils.file_util.copy_file(f, gallery_dir, update=True) - + for f in source_dir.rglob("*.rst"): + f_dir = Path(f).parent + gallery_subdir_path = gallery_dir / f_dir.relative_to(source_dir) + gallery_subdir_path.mkdir(parents=True, exist_ok=True) + distutils.file_util.copy_file(f, gallery_subdir_path, update=True) # Add any paths that contain templates here, relative to this directory. - - templates_path = ['_templates'] # The suffix(es) of source filenames. @@ -149,7 +165,7 @@ def reset_seeds(gallery_conf, fname): # General information about the project. project = 'PyTorch Tutorials' -copyright = '2023, PyTorch' +copyright = '2024, PyTorch' author = 'PyTorch contributors' # The version info for the project you're documenting, acts as replacement for @@ -171,7 +187,7 @@ def reset_seeds(gallery_conf, fname): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'src/pytorch-sphinx-theme/docs*'] exclude_patterns += sphinx_gallery_conf['examples_dirs'] exclude_patterns += ['*/index.rst'] @@ -284,9 +300,14 @@ def reset_seeds(gallery_conf, fname): html_css_files = [ 'https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css', - 'css/custom.css' + 'css/custom.css', + 'css/custom2.css' ] +html_js_files = [ + "js/custom.js", +] + def setup(app): # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value # and can be moved outside of this function (and the setup(app) function diff --git a/distributed/home.rst b/distributed/home.rst index aac2a1df494..c2c53657067 100644 --- a/distributed/home.rst +++ b/distributed/home.rst @@ -13,6 +13,8 @@ PyTorch with each method having their advantages in certain use cases: * `DistributedDataParallel (DDP) <#learn-ddp>`__ * `Fully Sharded Data Parallel (FSDP) <#learn-fsdp>`__ +* `Tensor Parallel (TP) <#learn-tp>`__ +* `Device Mesh <#device-mesh>`__ * `Remote Procedure Call (RPC) distributed training <#learn-rpc>`__ * `Custom Extensions <#custom-extensions>`__ @@ -51,7 +53,7 @@ Learn DDP :link: https://pytorch.org/tutorials/advanced/generic_join.html?utm_source=distr_landing&utm_medium=generic_join :link-type: url - This tutorial describes the Join context manager and + This tutorial describes the Join context manager and demonstrates it's use with DistributedData Parallel. +++ :octicon:`code;1em` Code @@ -75,7 +77,7 @@ Learn FSDP .. 
grid-item-card:: :octicon:`file-code;1em` FSDP Advanced - :link: https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html?utm_source=distr_landing&utm_medium=FSDP_advanced + :link: https://pytorch.org/tutorials/intermediate/FSDP_advanced_tutorial.html?utm_source=distr_landing&utm_medium=FSDP_advanced :link-type: url In this tutorial, you will learn how to fine-tune a HuggingFace (HF) T5 @@ -83,6 +85,41 @@ Learn FSDP +++ :octicon:`code;1em` Code + +.. _learn-tp: + +Learn Tensor Parallel (TP) +-------------------------- + +.. grid:: 3 + + .. grid-item-card:: :octicon:`file-code;1em` + Large Scale Transformer model training with Tensor Parallel (TP) + :link: https://pytorch.org/tutorials/intermediate/TP_tutorial.html + :link-type: url + + This tutorial demonstrates how to train a large Transformer-like model across hundreds to thousands of GPUs using Tensor Parallel and Fully Sharded Data Parallel. + +++ + :octicon:`code;1em` Code + + +.. _device-mesh: + +Learn DeviceMesh +---------------- + +.. grid:: 3 + + .. grid-item-card:: :octicon:`file-code;1em` + Getting Started with DeviceMesh + :link: https://pytorch.org/tutorials/recipes/distributed_device_mesh.html?highlight=devicemesh + :link-type: url + + In this tutorial you will learn about `DeviceMesh` + and how it can help with distributed training. + +++ + :octicon:`code;1em` Code + .. _learn-rpc: Learn RPC diff --git a/docathon-leaderboard.md b/docathon-leaderboard.md index d95f6851d1b..49912c2abfb 100644 --- a/docathon-leaderboard.md +++ b/docathon-leaderboard.md @@ -1,3 +1,82 @@ +# 🎉 Docathon H1 2024 Leaderboard 🎉 + +This is the list of the docathon contributors that have participated and contributed to the PyTorch H1 2024 docathon. +A big shout out to everyone who has participated! We have awarded points for each merged PR. +For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. +For the **advanced** label, we have awarded 10 points. 
In some cases, we have awarded credit for the PRs that +were not merged or issues that have been closed without a merged PR. + +| Author | Points | PR | +|--- | --- | ---| +| ahoblitz | 34 | https://github.com/pytorch/pytorch/pull/128566, https://github.com/pytorch/pytorch/pull/128408, https://github.com/pytorch/pytorch/pull/128171, https://github.com/pytorch/pytorch/pull/128083, https://github.com/pytorch/pytorch/pull/128082, https://github.com/pytorch/pytorch/pull/127983, https://github.com/pytorch/xla/pull/7214 | +| afrittoli | 25 | https://github.com/pytorch/pytorch/pull/128139, https://github.com/pytorch/pytorch/pull/128133, https://github.com/pytorch/pytorch/pull/128132, https://github.com/pytorch/pytorch/pull/128129, https://github.com/pytorch/pytorch/pull/128127 | +| kiszk | 20 | https://github.com/pytorch/pytorch/pull/128337, https://github.com/pytorch/pytorch/pull/128123, https://github.com/pytorch/pytorch/pull/128022, https://github.com/pytorch/pytorch/pull/128312 | +| loganthomas | 19 | https://github.com/pytorch/pytorch/pull/128676, https://github.com/pytorch/pytorch/pull/128192, https://github.com/pytorch/pytorch/pull/128189, https://github.com/pytorch/tutorials/pull/2922, https://github.com/pytorch/tutorials/pull/2910, https://github.com/pytorch/xla/pull/7195 | +| ignaciobartol | 17 | https://github.com/pytorch/pytorch/pull/128741, https://github.com/pytorch/pytorch/pull/128135, https://github.com/pytorch/pytorch/pull/127938, https://github.com/pytorch/tutorials/pull/2936 | +| arunppsg | 17 | https://github.com/pytorch/pytorch/pull/128391, https://github.com/pytorch/pytorch/pull/128021, https://github.com/pytorch/pytorch/pull/128018, https://github.com/pytorch-labs/torchfix/pull/59 | +| alperenunlu | 17 | https://github.com/pytorch/tutorials/pull/2934, https://github.com/pytorch/tutorials/pull/2909, https://github.com/pytorch/pytorch/pull/104043 | +| anandptl84 | 10 | https://github.com/pytorch/pytorch/pull/128196, 
https://github.com/pytorch/pytorch/pull/128098 | +| GdoongMathew | 10 | https://github.com/pytorch/pytorch/pull/128136, https://github.com/pytorch/pytorch/pull/128051 | +| ZhaoqiongZ | 10 | https://github.com/pytorch/pytorch/pull/127872 | +| ZailiWang | 10 | https://github.com/pytorch/tutorials/pull/2931 | +| jingxu10 | 8 | https://github.com/pytorch/pytorch/pull/127280, https://github.com/pytorch/pytorch/pull/127279, https://github.com/pytorch/pytorch/pull/127278, https://github.com/pytorch/tutorials/pull/2919 | +| sitamgithub-MSIT | 7 | https://github.com/pytorch/tutorials/pull/2900, https://github.com/pytorch/xla/pull/7208 | +| spzala | 5 | https://github.com/pytorch/pytorch/pull/128679, https://github.com/pytorch/pytorch/pull/128657 | +| TharinduRusira | 5 | https://github.com/pytorch/pytorch/pull/128197 | +| zabboud | 5 | https://github.com/pytorch/pytorch/pull/128055 | +| orion160 | 5 | https://github.com/pytorch/tutorials/pull/2912 | +| Ricktho1 | 5 | https://github.com/pytorch/xla/pull/7273 | +| IvanLauLinTiong | 4 | https://github.com/pytorch/pytorch/pull/128526, https://github.com/pytorch/tutorials/pull/2849 | +| sshkhr | 2 | https://github.com/pytorch/pytorch/pull/128155 | +| rk7697 | 2 | https://github.com/pytorch/pytorch/pull/127993 | +| hippocookie | 2 | https://github.com/pytorch/tutorials/pull/2937 | +| diningeachox | 2 | https://github.com/pytorch/tutorials/pull/2935 | +| akhil-maker | 2 | https://github.com/pytorch/tutorials/pull/2899 | +| saurabhkthakur | 2 | https://github.com/pytorch/tutorials/pull/2896 | + +# 🎉 Docathon H2 2023 Leaderboard 🎉 + +This is the list of the docathon contributors that have participated and contributed to the H2 2023 PyTorch docathon. +A big shout out to everyone who has participated! We have awarded points for each merged PR. +For the **easy** label, we have awarded 2 points. For the **medium** label, we have awarded 5 points. +For the **advanced** label, we have awarded 10 points. 
In some cases, we have awarded half credit for the PRs that +were not merged or issues that have been closed without a merged PR. Thank you all for your awesome contributions! 🎉 + +| Author | Points | PR | +|--- | --- | ---| +| ahoblitz | 25 | https://github.com/pytorch/pytorch/pull/112992, https://github.com/pytorch/tutorials/pull/2662, https://github.com/pytorch/tutorials/pull/2647, https://github.com/pytorch/tutorials/pull/2642, https://github.com/pytorch/tutorials/pull/2640, https://github.com/pytorch/pytorch/pull/113092, https://github.com/pytorch/pytorch/pull/113348 | +| ChanBong | 22 | https://github.com/pytorch/pytorch/pull/113337, https://github.com/pytorch/pytorch/pull/113336, https://github.com/pytorch/pytorch/pull/113335, https://github.com/pytorch/tutorials/pull/2644, https://github.com/pytorch/tutorials/pull/2639 | +| alperenunlu | 22 | https://github.com/pytorch/pytorch/pull/113260, https://github.com/pytorch/tutorials/pull/2673, https://github.com/pytorch/tutorials/pull/2660, https://github.com/pytorch/tutorials/pull/2656, https://github.com/pytorch/tutorials/pull/2649, https://github.com/pytorch/pytorch/pull/113505, https://github.com/pytorch/pytorch/pull/113218, https://github.com/pytorch/pytorch/pull/113505 | +| spzala | 22 | https://github.com/pytorch/pytorch/pull/113200, https://github.com/pytorch/pytorch/pull/112693, https://github.com/pytorch/tutorials/pull/2667, https://github.com/pytorch/tutorials/pull/2635 | +| bjhargrave | 21 | https://github.com/pytorch/pytorch/pull/113358, https://github.com/pytorch/pytorch/pull/113206, https://github.com/pytorch/pytorch/pull/112786, https://github.com/pytorch/tutorials/pull/2661, https://github.com/pytorch/tutorials/pull/1272 | +| zabboud | 21 | https://github.com/pytorch/pytorch/pull/113233, https://github.com/pytorch/pytorch/pull/113227, https://github.com/pytorch/pytorch/pull/113177, https://github.com/pytorch/pytorch/pull/113219, https://github.com/pytorch/pytorch/pull/113311 | +| nvs-abhilash | 20 
| https://github.com/pytorch/pytorch/pull/113241, https://github.com/pytorch/pytorch/pull/112765, https://github.com/pytorch/pytorch/pull/112695, https://github.com/pytorch/pytorch/pull/112657 | +| guptaaryan16 | 19 | https://github.com/pytorch/pytorch/pull/112817, https://github.com/pytorch/pytorch/pull/112735, https://github.com/pytorch/tutorials/pull/2674, https://github.com/pytorch/pytorch/pull/113196, https://github.com/pytorch/pytorch/pull/113532 | +| min-jean-cho | 17 | https://github.com/pytorch/pytorch/pull/113195, https://github.com/pytorch/pytorch/pull/113183, https://github.com/pytorch/pytorch/pull/113178, https://github.com/pytorch/pytorch/pull/113109, https://github.com/pytorch/pytorch/pull/112892 | +| markstur | 14 | https://github.com/pytorch/pytorch/pull/113250, https://github.com/pytorch/tutorials/pull/2643, https://github.com/pytorch/tutorials/pull/2638, https://github.com/pytorch/tutorials/pull/2636 | +| RustyGrackle | 13 | https://github.com/pytorch/pytorch/pull/113371, https://github.com/pytorch/pytorch/pull/113266, https://github.com/pytorch/pytorch/pull/113435 | +| Viditagarwal7479 | 12 | https://github.com/pytorch/pytorch/pull/112860, https://github.com/pytorch/tutorials/pull/2659, https://github.com/pytorch/tutorials/pull/2671 | +| kiszk | 10 | https://github.com/pytorch/pytorch/pull/113523, https://github.com/pytorch/pytorch/pull/112751 | +| awaelchli | 10 | https://github.com/pytorch/pytorch/pull/113216, https://github.com/pytorch/pytorch/pull/112674 | +| pilot-j | 10 | https://github.com/pytorch/pytorch/pull/112964, https://github.com/pytorch/pytorch/pull/112856 | +| krishnakalyan3 | 7 | https://github.com/pytorch/tutorials/pull/2653, https://github.com/pytorch/tutorials/pull/1235, https://github.com/pytorch/tutorials/pull/1705 | +| ash-01xor | 5 | https://github.com/pytorch/pytorch/pull/113511 | +| IvanLauLinTiong | 5 | https://github.com/pytorch/pytorch/pull/113052 | +| Senthi1Kumar | 5 | https://github.com/pytorch/pytorch/pull/113021 
| +| ooooo-create | 5 | https://github.com/pytorch/pytorch/pull/112953 | +| stanleyedward | 5 | https://github.com/pytorch/pytorch/pull/112864, https://github.com/pytorch/pytorch/pull/112617 | +| leslie-fang-intel | 5 | https://github.com/pytorch/tutorials/pull/2668 | +| measty | 5 | https://github.com/pytorch/tutorials/pull/2675 | +| Hhhhhhao | 5 | https://github.com/pytorch/tutorials/pull/2676 | +| andrewashere | 3 | https://github.com/pytorch/pytorch/pull/112721 | +| aalhendi | 3 | https://github.com/pytorch/pytorch/pull/112947 | +| sitamgithub-MSIT | 3 | https://github.com/pytorch/pytorch/pull/113264 | +| Jarlaze | 3 | https://github.com/pytorch/pytorch/pull/113531 | +| jingxu10 | 2 | https://github.com/pytorch/tutorials/pull/2657 | +| cirquit | 2 | https://github.com/pytorch/tutorials/pull/2529 | +| prithviraj-maurya | 1 | https://github.com/pytorch/tutorials/pull/2652 | +| MirMustafaAli | 1 | https://github.com/pytorch/tutorials/pull/2645 | + # 🎉 Docathon H1 2023 Leaderboard 🎉 This is the list of the docathon contributors that have participated and contributed to the PyTorch docathon. A big shout out to everyone who have participated! We have awarded points for each merged PR. 
diff --git a/en-wordlist.txt b/en-wordlist.txt index 4a6d3a06226..b56df45df0c 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -1,48 +1,25 @@ -NDK -Backpropagating -multinode -TCP -Frontend -frontend -desynchronization -VLDB -RRef -OOM -subfolder -Dialogs -performant -multithreading -linearities -DyNet -Keras -Norvig -Theano -torchrun -benchmarked -MHA -XLM -BT -Fastpath -fastpath -YouTube -accelerometer -breakpoint -MobileNet -DeepLabV -Resampling +ACL +ADI +ALiBi +AOT +AOTInductor APIs ATen AVX -AOT Args Autograd BCE BFGS +BLAS +BLEU BN BOS +BT +Backpropagating Bahdanau BatchNorm Bethge +Bfloat CHW CIFAR CLS @@ -56,37 +33,61 @@ Captum Captum's CartPole Cayley +CharRNN Chatbots +Chen Colab +Colorectal +Composibility +Conda Conv ConvNet ConvNets +Cuda +CudaLaunchKernel +CudaMemcpyAsync +CudaMemsetAsync DCGAN DCGANs DDP DDPG DDQN DLRM +DMA DNN DQN +DataLoaders DataPipe DataPipes -DataLoaders +Decompositions +DeepLabV DeepMind DeiT DenseNet +Dialogs +DyNet EOS EPS Ecker +ExecuTorch ExportDB FC FGSM +tensordict +DataLoader's FLAVA +FSDP FX FX's +FairSeq +Fastpath +FakeTensor +FakeTensors +FFN FloydHub FloydHub's Frobenius +Frontend GAE GAN GANs @@ -102,100 +103,154 @@ Gatys GeForce Goodfellow Goodfellow’s +Graviton GreedySearchDecoder +HTA HVP +Hao +HistoEnc +HistoEncoder Hugging Face IMDB IOT +IR +IRs ISA ITERS ImageNet +Inductor Initializations Interpretability Iteratively -Inductor -IR -IRs JSON JVP Jacobian +Joona +Kather +Keras +Kihyuk Kiuk Kubernetes Kuei +KV +LRSchedulers LSTM LSTMs +LYM LeCun LeNet LeakyReLU LeakyReLUs Lipschitz LoRa -LRSchedulers Lua Luong -macos +MHA +MKLDNN MLP MLPs MNIST +MPS +MUC MacBook +MacOS +MaskRCNN Minifier +MobileNet +ModelABC +MPS Mypy NAS NCCL NCHW +NDK NES NLP NTK NUMA NaN +NaNs NanoGPT Netron NeurIPS +Norvig NumPy Numericalization Numpy's ONNX -ONNX's ONNX Runtime +ONNX Script +ONNX's +OOM +OU OpenAI +OpenBLAS +OpenJpeg OpenMP +OpenSlide +Opset Ornstein -OU PIL +PIL's PPO +PatchPredictor +PennFudan +Perfetto +Pixman Plotly 
+Pohjonen Prec Profiler PyTorch's +QuickStart +RCNN RGB +RGBA RL RNN RNNs +ROCm RPC +RPN +RRef RTX Radford ReLU ReLUs +ResNeXt ResNet +Resampling Runtime's SDPA SGD +SIMD SPD +SQuAD SSD +SSL SST2 STN +STR +SVE SciPy Sequentials +Sharding Sigmoid SoTA +Sohn Spacy +SwiGLU +TCP +THP +TIAToolbox +TODO TPU TensorBoard TensorBoards TensorDict TensorFloat TextVQA -TODO +Theano Tokenization TorchDynamo TorchInductor @@ -203,18 +258,35 @@ TorchMultimodal TorchRL TorchRL's TorchScript +TorchVision +TorchVision TorchX Tunable -Uhlenbeck UI +UMAP +Uhlenbeck Unescape VGG +VLDB VQA VS Code -Woohoo +ViT +Volterra +WMT +WSI +WSIs +WSI’s +Wang Wikitext -Xeon +Woohoo +XLM +Xception Xcode +Xeon +Yidong +YouTube +Zipf +accelerometer accuracies activations adversarially @@ -227,6 +299,7 @@ approximators autodiff autoencoder autograd +autotune autotuner backend backends @@ -238,11 +311,17 @@ backpropagation backtrace batchnorm batchnorm's +bbAP +benchmarked benchmarking bitwise +bool boolean +breakpoint broadcasted +bw bytecode +callable's cancelation cardinality chatbot @@ -254,13 +333,18 @@ colorbar compilable composable concat +conda config +configs contrastive conv convolutional cpu csv cuDNN +cuda +customizable +customizations datafile dataflow dataframe @@ -271,10 +355,13 @@ dataset datasets dataset’s deallocation +decompositions decorrelated deserialize deserialized +desynchronization deterministically +devicemesh dimensionality dir discontiguous @@ -282,42 +369,59 @@ distractor downsample downsamples dropdown +dtensor +dtype +dtypes duration elementwise embeddings encodings +enqueuing ensembling enum eq equalities et +eval evaluateInput extensibility fastai +fastmath +fastpath fbgemm feedforward finetune finetuning +FlexAttention fp +frontend functionalized +functionalizes +functionalization functorch fuser geomean +globals grayscale +handoff hardcode helpdesk helpdesks hessian hessians +histoencoder +histologically +homonymous hotspot hvp hyperparameter hyperparameters imagenet 
+img +inductor inferencing initializations inlined -inductor interpretable invariance io @@ -329,16 +433,27 @@ jacobians jit jitter jpg +json judgements +jupyter +kernels +keypoint kwargs labelled +latencies learnable learnings +linearities loadFilename logits +mAP +macos manualSeed matmul +matmuls matplotlib +memcpy +memset minibatch minibatches minified @@ -348,25 +463,38 @@ misclassification misclassified modularity modularized +mpp +mucosa multihead +MultiheadAttention multimodal multimodality +multinode multiobjective multiprocessed multithreaded +multithreading namespace natively ndarrays +nheads +nightlies +NJT +NJTs +NJT's num numericalize numpy nvFuser nvFuser's oneDNN +opset optimizable optimizer's optimizers +otsu overfitting +pageable parallelizable parallelization parametrization @@ -374,10 +502,13 @@ parametrizations parametrized parametrizing perceptibility +performant pickleable pipelining pointwise postprocessing +pre +pre preallocate preallocates preallocation @@ -391,6 +522,7 @@ prespecified pretrained prewritten primals +processgroup profiler profilers protobuf @@ -400,6 +532,7 @@ quantized quantizing queryable randint +randn readably recomputation regressor @@ -410,9 +543,12 @@ reinitializes relu reproducibility rescale +rescaling +reshard resnet restride rewinded +rgb rollout rollouts romanized @@ -421,20 +557,31 @@ runtime runtime runtimes scalable +SDPA +sharded softmax +sparsified +sparsifier +sparsifiers sparsify +sparsifying specificities src stacktrace stateful storages strided +stroma subclasses subclassing +subcomponent +subcomponents subdirectories +subfolder submodule submodules subnetworks +subproblems subprocess subprocesses subreddit @@ -443,11 +590,13 @@ subregion's summarization swappable tanh -th tensor's +th thresholding +tiatoolbox timestep timesteps +tmp tokenization tokenize tokenized @@ -457,6 +606,7 @@ tooltip topologies torchaudio torchdata +torchrun torchscriptable torchtext torchtext's @@ -465,11 +615,18 @@ torchviz 
traceback tradeoff tradeoffs +triton +uint +UX +umap +unbacked uncomment uncommented underflowing unfused +unicode unimodal +unigram unnormalized unoptimized unparametrized @@ -490,3 +647,42 @@ warmstart warmstarted warmstarting warmup +webp +wikitext +wsi +wsis +Meta's +RecSys +TorchRec +sharding +TBE +EBC +sharder +hyperoptimized +DMP +unsharded +lookups +KJTs +amongst +async +everytime +prototyped +GBs +HBM +gloo +nccl +Localhost +gpu +torchmetrics +url +colab +sharders +Criteo +torchrec +_batch_norm_impl_index +convolution_overrideable +aten +XPU +XPUs +impl +overrideable diff --git a/index.rst b/index.rst index 355e1646b4d..91194d6f480 100644 --- a/index.rst +++ b/index.rst @@ -1,18 +1,15 @@ Welcome to PyTorch Tutorials ============================ -What's new in PyTorch tutorials? - -* `Getting Started with Distributed Checkpoint (DCP) `__ -* `torch.export Tutorial `__ -* `Facilitating New Backend Integration by PrivateUse1 `__ -* `(prototype) Accelerating BERT with semi-structured (2:4) sparsity `__ -* `(prototype) PyTorch 2 Export Quantization-Aware Training (QAT) `__ -* `(prototype) PyTorch 2 Export Post Training Quantization with X86 Backend through Inductor `__ -* `(prototype) Inductor C++ Wrapper Tutorial `__ -* `How to save memory by fusing the optimizer step into the backward pass `__ -* `Tips for Loading an nn.Module from a Checkpoint `__ +**What's new in PyTorch tutorials?** +* `Compiled Autograd: Capturing a larger backward graph for torch.compile `__ +* `Reducing torch.compile cold start compilation time with regional compilation `__ +* `Introduction to TorchRec `__ +* `(prototype) Flight Recorder for Debugging Stuck Jobs `__ +* `(prototype) How to use TorchInductor on Windows CPU `__ +* `(prototype) Using Max-Autotune Compilation on CPU for Better Performance `__ +* `(prototype) Autoloading Out-of-Tree Extension `__ .. raw:: html @@ -74,7 +71,7 @@ What's new in PyTorch tutorials? 
:header: Introduction to PyTorch on YouTube :card_description: An introduction to building a complete ML workflow with PyTorch. Follows the PyTorch Beginner Series on YouTube. :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png - :link: beginner/introyt.html + :link: beginner/introyt/introyt_index.html :tags: Getting-Started .. customcarditem:: @@ -98,6 +95,13 @@ What's new in PyTorch tutorials? :link: intermediate/tensorboard_tutorial.html :tags: Interpretability,Getting-Started,TensorBoard +.. customcarditem:: + :header: Good usage of `non_blocking` and `pin_memory()` in PyTorch + :card_description: A guide on best practices to copy data from CPU to GPU. + :image: _static/img/pinmem.png + :link: intermediate/pinmem_nonblock.html + :tags: Getting-Started + .. Image/Video .. customcarditem:: @@ -142,6 +146,20 @@ What's new in PyTorch tutorials? :link: intermediate/spatial_transformer_tutorial.html :tags: Image/Video +.. customcarditem:: + :header: Inference on Whole Slide Images with TIAToolbox + :card_description: Learn how to use the TIAToolbox to perform inference on whole slide images. + :image: _static/img/thumbnails/cropped/TIAToolbox-Tutorial.png + :link: intermediate/tiatoolbox_tutorial.html + :tags: Image/Video + +.. customcarditem:: + :header: Semi-Supervised Learning Tutorial Based on USB + :card_description: Learn how to train semi-supervised learning algorithms (on custom data) using USB and PyTorch. + :image: _static/img/usb_semisup_learn/code.png + :link: advanced/usb_semisup_learn.html + :tags: Image/Video + .. Audio .. customcarditem:: @@ -214,64 +232,28 @@ What's new in PyTorch tutorials? :link: intermediate/forced_alignment_with_torchaudio_tutorial.html :tags: Audio -.. Text - -.. 
customcarditem:: - :header: Fast Transformer Inference with Better Transformer - :card_description: Deploy a PyTorch Transformer model using Better Transformer with high performance for inference - :image: _static/img/thumbnails/cropped/pytorch-logo.png - :link: beginner/bettertransformer_tutorial.html - :tags: Production,Text - -.. customcarditem:: - :header: Sequence-to-Sequence Modeling with nn.Transformer and torchtext - :card_description: Learn how to train a sequence-to-sequence model that uses the nn.Transformer module. - :image: _static/img/thumbnails/cropped/Sequence-to-Sequence-Modeling-with-nnTransformer-andTorchText.png - :link: beginner/transformer_tutorial.html - :tags: Text +.. NLP .. customcarditem:: :header: NLP from Scratch: Classifying Names with a Character-level RNN :card_description: Build and train a basic character-level RNN to classify word from scratch without the use of torchtext. First in a series of three tutorials. :image: _static/img/thumbnails/cropped/NLP-From-Scratch-Classifying-Names-with-a-Character-Level-RNN.png :link: intermediate/char_rnn_classification_tutorial - :tags: Text + :tags: NLP .. customcarditem:: :header: NLP from Scratch: Generating Names with a Character-level RNN :card_description: After using character-level RNN to classify names, learn how to generate names from languages. Second in a series of three tutorials. :image: _static/img/thumbnails/cropped/NLP-From-Scratch-Generating-Names-with-a-Character-Level-RNN.png :link: intermediate/char_rnn_generation_tutorial.html - :tags: Text + :tags: NLP .. customcarditem:: :header: NLP from Scratch: Translation with a Sequence-to-sequence Network and Attention :card_description: This is the third and final tutorial on doing “NLP From Scratch”, where we write our own classes and functions to preprocess the data to do our NLP modeling tasks. 
:image: _static/img/thumbnails/cropped/NLP-From-Scratch-Translation-with-a-Sequence-to-Sequence-Network-and-Attention.png :link: intermediate/seq2seq_translation_tutorial.html - :tags: Text - -.. customcarditem:: - :header: Text Classification with Torchtext - :card_description: Learn how to build the dataset and classify text using torchtext library. - :image: _static/img/thumbnails/cropped/Text-Classification-with-TorchText.png - :link: beginner/text_sentiment_ngrams_tutorial.html - :tags: Text - -.. customcarditem:: - :header: Language Translation with Transformer - :card_description: Train a language translation model from scratch using Transformer. - :image: _static/img/thumbnails/cropped/Language-Translation-with-TorchText.png - :link: beginner/translation_transformer.html - :tags: Text - -.. customcarditem:: - :header: Pre-process custom text dataset using Torchtext - :card_description: Learn how to use torchtext to prepare a custom dataset - :image: _static/img/thumbnails/cropped/torch_text_logo.png - :link: beginner/torchtext_custom_dataset_tutorial.html - :tags: Text - + :tags: NLP .. ONNX @@ -282,6 +264,13 @@ What's new in PyTorch tutorials? :link: beginner/onnx/export_simple_model_to_onnx_tutorial.html :tags: Production,ONNX,Backends +.. customcarditem:: + :header: Introduction to ONNX Registry + :card_description: Demonstrate end-to-end how to address unsupported operators by using ONNX Registry. + :image: _static/img/thumbnails/cropped/Exporting-PyTorch-Models-to-ONNX-Graphs.png + :link: advanced/onnx_registry_tutorial.html + :tags: Production,ONNX,Backends + .. Reinforcement Learning .. customcarditem:: @@ -305,6 +294,13 @@ What's new in PyTorch tutorials? :link: intermediate/mario_rl_tutorial.html :tags: Reinforcement-Learning +.. 
customcarditem:: + :header: Recurrent DQN + :card_description: Use TorchRL to train recurrent policies + :image: _static/img/rollout_recurrent.png + :link: intermediate/dqn_with_rnn_tutorial.html + :tags: Reinforcement-Learning + .. customcarditem:: :header: Code a DDPG Loss :card_description: Use TorchRL to code a DDPG Loss @@ -312,7 +308,12 @@ What's new in PyTorch tutorials? :link: advanced/coding_ddpg.html :tags: Reinforcement-Learning - +.. customcarditem:: + :header: Writing your environment and transforms + :card_description: Use TorchRL to code a Pendulum + :image: _static/img/pendulum.gif + :link: advanced/pendulum.html + :tags: Reinforcement-Learning .. Deploying PyTorch Models in Production @@ -345,6 +346,23 @@ What's new in PyTorch tutorials? :link: advanced/super_resolution_with_onnxruntime.html :tags: Production,ONNX +.. customcarditem:: + :header: Profiling PyTorch + :card_description: Learn how to profile a PyTorch application + :link: beginner/profiler.html + :tags: Profiling + +.. customcarditem:: + :header: Profiling PyTorch + :card_description: Introduction to Holistic Trace Analysis + :link: beginner/hta_intro_tutorial.html + :tags: Profiling + +.. customcarditem:: + :header: Profiling PyTorch + :card_description: Trace Diff using Holistic Trace Analysis + :link: beginner/hta_trace_diff_tutorial.html + :tags: Profiling .. Code Transformations with FX @@ -378,16 +396,44 @@ What's new in PyTorch tutorials? :link: advanced/cpp_frontend.html :tags: Frontend-APIs,C++ +.. customcarditem:: + :header: PyTorch Custom Operators Landing Page + :card_description: This is the landing page for all things related to custom operators in PyTorch. + :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png + :link: advanced/custom_ops_landing_page.html + :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA + +.. customcarditem:: + :header: Custom Python Operators + :card_description: Create Custom Operators in Python. 
Useful for black-boxing a Python function for use with torch.compile. + :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png + :link: advanced/python_custom_ops.html + :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA + +.. customcarditem:: + :header: Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` + :card_description: Learn how to use compiled autograd to capture a larger backward graph. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/compiled_autograd_tutorial.html + :tags: Model-Optimization,CUDA + +.. customcarditem:: + :header: Custom C++ and CUDA Operators + :card_description: How to extend PyTorch with custom C++ and CUDA operators. + :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png + :link: advanced/cpp_custom_ops.html + :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA + .. customcarditem:: :header: Custom C++ and CUDA Extensions - :card_description: Create a neural network layer with no parameters using numpy. Then use scipy to create a neural network layer that has learnable weights. + :card_description: Create a neural network layer with no parameters using numpy. Then use scipy to create a neural network layer that has learnable weights. :image: _static/img/thumbnails/cropped/Custom-Cpp-and-CUDA-Extensions.png :link: advanced/cpp_extension.html :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA .. customcarditem:: :header: Extending TorchScript with Custom C++ Operators - :card_description: Implement a custom TorchScript operator in C++, how to build it into a shared library, how to use it in Python to define TorchScript models and lastly how to load it into a C++ application for inference workloads. + :card_description: Implement a custom TorchScript operator in C++, how to build it into a shared library, how to use it in Python to define TorchScript models and lastly how to load it into a C++ application for inference workloads.
:image: _static/img/thumbnails/cropped/Extending-TorchScript-with-Custom-Cpp-Operators.png :link: advanced/torch_script_custom_ops.html :tags: Extending-PyTorch,Frontend-APIs,TorchScript,C++ @@ -513,13 +559,6 @@ What's new in PyTorch tutorials? :link: beginner/hyperparameter_tuning_tutorial.html :tags: Model-Optimization,Best-Practice -.. customcarditem:: - :header: Optimizing Vision Transformer Model - :card_description: Learn how to use Facebook Data-efficient Image Transformers DeiT and script and optimize it for mobile. - :image: _static/img/thumbnails/cropped/mobile.png - :link: beginner/vt_tutorial.html - :tags: Model-Optimization,Best-Practice,Mobile - .. customcarditem:: :header: Parametrizations Tutorial :card_description: Learn how to use torch.nn.utils.parametrize to put constraints on your parameters (e.g. make them orthogonal, symmetric positive definite, low-rank...) @@ -541,6 +580,13 @@ What's new in PyTorch tutorials? :link: intermediate/optimizer_step_in_backward_tutorial.html :tags: Model-Optimization,Best-Practice,CUDA,Frontend-APIs +.. customcarditem:: + :header: (beta) Accelerating BERT with semi-structured sparsity + :card_description: Train BERT, prune it to be 2:4 sparse, and then accelerate it to achieve 2x inference speedups with semi-structured sparsity and torch.compile. + :image: _static/img/thumbnails/cropped/Pruning-Tutorial.png + :link: advanced/semi_structured_sparse.html + :tags: Text,Model-Optimization + .. customcarditem:: :header: (beta) Dynamic Quantization on an LSTM Word Language Model :card_description: Apply dynamic quantization, the easiest form of quantization, to a LSTM-based next word prediction model. @@ -618,6 +664,14 @@ What's new in PyTorch tutorials? :link: beginner/knowledge_distillation_tutorial.html :tags: Model-Optimization,Image/Video + +.. 
customcarditem:: + :header: Accelerating PyTorch Transformers by replacing nn.Transformer with Nested Tensors and torch.compile() + :card_description: This tutorial goes over recommended best practices for implementing Transformers with native PyTorch. + :image: _static/img/thumbnails/cropped/pytorch-logo.png + :link: intermediate/transformer_building_blocks.html + :tags: Transformer + .. Parallel-and-Distributed-Training @@ -657,6 +711,13 @@ What's new in PyTorch tutorials? :link: intermediate/dist_tuto.html :tags: Parallel-and-Distributed-Training +.. customcarditem:: + :header: Large Scale Transformer model training with Tensor Parallel + :card_description: Learn how to train large models with Tensor Parallel package. + :image: _static/img/thumbnails/cropped/Large-Scale-Transformer-model-training-with-Tensor-Parallel.png + :link: intermediate/TP_tutorial.html + :tags: Parallel-and-Distributed-Training + .. customcarditem:: :header: Customize Process Group Backends Using Cpp Extensions :card_description: Extend ProcessGroup with custom collective communication implementations. @@ -679,10 +740,10 @@ What's new in PyTorch tutorials? :tags: Parallel-and-Distributed-Training .. customcarditem:: - :header: Distributed Pipeline Parallelism Using RPC - :card_description: Demonstrate how to implement distributed pipeline parallelism using RPC - :image: _static/img/thumbnails/cropped/Distributed-Pipeline-Parallelism-Using-RPC.png - :link: intermediate/dist_pipeline_parallel_tutorial.html + :header: Introduction to Distributed Pipeline Parallelism + :card_description: Demonstrate how to implement pipeline parallelism using torch.distributed.pipelining + :image: _static/img/thumbnails/cropped/Introduction-to-Distributed-Pipeline-Parallelism.png + :link: intermediate/pipelining_tutorial.html :tags: Parallel-and-Distributed-Training .. customcarditem:: @@ -699,20 +760,6 @@ What's new in PyTorch tutorials? 
:link: advanced/rpc_ddp_tutorial.html :tags: Parallel-and-Distributed-Training -.. customcarditem:: - :header: Training Transformer models using Pipeline Parallelism - :card_description: Walk through a through a simple example of how to train a transformer model using pipeline parallelism. - :image: _static/img/thumbnails/cropped/Training-Transformer-models-using-Pipeline-Parallelism.png - :link: intermediate/pipeline_tutorial.html - :tags: Parallel-and-Distributed-Training - -.. customcarditem:: - :header: Training Transformer models using Distributed Data Parallel and Pipeline Parallelism - :card_description: Walk through a through a simple example of how to train a transformer model using Distributed Data Parallel and Pipeline Parallelism - :image: _static/img/thumbnails/cropped/Training-Transformer-Models-using-Distributed-Data-Parallel-and-Pipeline-Parallelism.png - :link: advanced/ddp_pipeline.html - :tags: Parallel-and-Distributed-Training - .. customcarditem:: :header: Getting Started with Fully Sharded Data Parallel(FSDP) :card_description: Learn how to train models with Fully Sharded Data Parallel package. @@ -724,24 +771,60 @@ What's new in PyTorch tutorials? :header: Advanced Model Training with Fully Sharded Data Parallel (FSDP) :card_description: Explore advanced model training with Fully Sharded Data Parallel package. :image: _static/img/thumbnails/cropped/Getting-Started-with-FSDP.png - :link: intermediate/FSDP_adavnced_tutorial.html + :link: intermediate/FSDP_advanced_tutorial.html :tags: Parallel-and-Distributed-Training -.. Mobile +.. customcarditem:: + :header: Introduction to Libuv TCPStore Backend + :card_description: TCPStore now uses a new server backend for faster connection and better scalability. + :image: _static/img/thumbnails/cropped/Introduction-to-Libuv-Backend-TCPStore.png + :link: intermediate/TCPStore_libuv_backend.html + :tags: Parallel-and-Distributed-Training + +.. Edge + +.. 
customcarditem:: + :header: Exporting to ExecuTorch Tutorial + :card_description: Learn about how to use ExecuTorch, a unified ML stack for lowering PyTorch models to edge devices. + :image: _static/img/ExecuTorch-Logo-cropped.svg + :link: https://pytorch.org/executorch/stable/tutorials/export-to-executorch-tutorial.html + :tags: Edge + +.. customcarditem:: + :header: Running an ExecuTorch Model in C++ Tutorial + :card_description: Learn how to load and execute an ExecuTorch model in C++ + :image: _static/img/ExecuTorch-Logo-cropped.svg + :link: https://pytorch.org/executorch/stable/running-a-model-cpp-tutorial.html + :tags: Edge + +.. customcarditem:: + :header: Using the ExecuTorch SDK to Profile a Model + :card_description: Explore how to use the ExecuTorch SDK to profile, debug, and visualize ExecuTorch models + :image: _static/img/ExecuTorch-Logo-cropped.svg + :link: https://pytorch.org/executorch/stable/tutorials/sdk-integration-tutorial.html + :tags: Edge + +.. customcarditem:: + :header: Building an ExecuTorch iOS Demo App + :card_description: Explore how to set up the ExecuTorch iOS Demo App, which uses the MobileNet v3 model to process live camera images leveraging three different backends: XNNPACK, Core ML, and Metal Performance Shaders (MPS). + :image: _static/img/ExecuTorch-Logo-cropped.svg + :link: https://pytorch.org/executorch/stable/demo-apps-ios.html + :tags: Edge .. customcarditem:: - :header: Image Segmentation DeepLabV3 on iOS - :card_description: A comprehensive step-by-step tutorial on how to prepare and run the PyTorch DeepLabV3 image segmentation model on iOS. - :image: _static/img/thumbnails/cropped/ios.png - :link: beginner/deeplabv3_on_ios.html - :tags: Mobile + :header: Building an ExecuTorch Android Demo App + :card_description: Learn how to set up the ExecuTorch Android Demo App for image segmentation tasks using the DeepLab v3 model and XNNPACK FP32 backend. 
+ :image: _static/img/ExecuTorch-Logo-cropped.svg + :link: https://pytorch.org/executorch/stable/demo-apps-android.html + :tags: Edge .. customcarditem:: - :header: Image Segmentation DeepLabV3 on Android - :card_description: A comprehensive step-by-step tutorial on how to prepare and run the PyTorch DeepLabV3 image segmentation model on Android. - :image: _static/img/thumbnails/cropped/android.png - :link: beginner/deeplabv3_on_android.html - :tags: Mobile + :header: Lowering a Model as a Delegate + :card_description: Learn to accelerate your program using ExecuTorch by applying delegates through three methods: lowering the whole module, composing it with another module, and partitioning parts of a module. + :image: _static/img/ExecuTorch-Logo-cropped.svg + :link: https://pytorch.org/executorch/stable/examples-end-to-end-to-lower-model-to-delegate.html + :tags: Edge + .. Recommendation Systems @@ -749,7 +832,7 @@ What's new in PyTorch tutorials? :header: Introduction to TorchRec :card_description: TorchRec is a PyTorch domain library built to provide common sparsity & parallelism primitives needed for large-scale recommender systems. :image: _static/img/thumbnails/torchrec.png - :link: intermediate/torchrec_tutorial.html + :link: intermediate/torchrec_intro_tutorial.html :tags: TorchRec,Recommender .. customcarditem:: @@ -831,7 +914,7 @@ Additional Resources .. Page TOC .. ----------------------------------------- .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :hidden: :includehidden: :caption: PyTorch Recipes @@ -840,38 +923,15 @@ Additional Resources See All Prototype Recipes .. 
toctree:: - :maxdepth: 2 :hidden: :includehidden: :caption: Introduction to PyTorch beginner/basics/intro - beginner/basics/quickstart_tutorial - beginner/basics/tensorqs_tutorial - beginner/basics/data_tutorial - beginner/basics/transforms_tutorial - beginner/basics/buildmodel_tutorial - beginner/basics/autogradqs_tutorial - beginner/basics/optimization_tutorial - beginner/basics/saveloadrun_tutorial - -.. toctree:: - :maxdepth: 2 - :hidden: - :includehidden: - :caption: Introduction to PyTorch on YouTube - - beginner/introyt - beginner/introyt/introyt1_tutorial - beginner/introyt/tensors_deeper_tutorial - beginner/introyt/autogradyt_tutorial - beginner/introyt/modelsyt_tutorial - beginner/introyt/tensorboardyt_tutorial - beginner/introyt/trainingyt - beginner/introyt/captumyt + beginner/introyt/introyt_index .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :hidden: :includehidden: :caption: Learning PyTorch @@ -879,10 +939,12 @@ Additional Resources beginner/deep_learning_60min_blitz beginner/pytorch_with_examples beginner/nn_tutorial + intermediate/nlp_from_scratch_index intermediate/tensorboard_tutorial + intermediate/pinmem_nonblock .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :includehidden: :hidden: :caption: Image and Video @@ -893,6 +955,7 @@ Additional Resources beginner/dcgan_faces_tutorial intermediate/spatial_transformer_tutorial beginner/vt_tutorial + intermediate/tiatoolbox_tutorial .. toctree:: :maxdepth: 2 @@ -912,23 +975,7 @@ Additional Resources intermediate/forced_alignment_with_torchaudio_tutorial .. toctree:: - :maxdepth: 2 - :includehidden: - :hidden: - :caption: Text - - beginner/transformer_tutorial - beginner/bettertransformer_tutorial - intermediate/char_rnn_classification_tutorial - intermediate/char_rnn_generation_tutorial - intermediate/seq2seq_translation_tutorial - beginner/text_sentiment_ngrams_tutorial - beginner/translation_transformer - beginner/torchtext_custom_dataset_tutorial - - -.. 
toctree:: - :maxdepth: 2 + :maxdepth: 1 :includehidden: :hidden: :caption: Backends @@ -944,9 +991,10 @@ Additional Resources intermediate/reinforcement_q_learning intermediate/reinforcement_ppo intermediate/mario_rl_tutorial + advanced/pendulum .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :includehidden: :hidden: :caption: Deploying PyTorch Models in Production @@ -958,6 +1006,16 @@ Additional Resources advanced/super_resolution_with_onnxruntime intermediate/realtime_rpi +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Profiling PyTorch + + beginner/profiler + beginner/hta_intro_tutorial + beginner/hta_trace_diff_tutorial + .. toctree:: :maxdepth: 2 :includehidden: @@ -989,6 +1047,9 @@ Additional Resources :hidden: :caption: Extending PyTorch + advanced/custom_ops_landing_page + advanced/python_custom_ops + advanced/cpp_custom_ops intermediate/custom_function_double_backward_tutorial intermediate/custom_function_conv_bn_tutorial advanced/cpp_extension @@ -1019,10 +1080,12 @@ Additional Resources intermediate/nvfuser_intro_tutorial intermediate/ax_multiobjective_nas_tutorial intermediate/torch_compile_tutorial + intermediate/compiled_autograd_tutorial intermediate/inductor_debug_cpu intermediate/scaled_dot_product_attention_tutorial beginner/knowledge_distillation_tutorial + .. toctree:: :maxdepth: 2 :includehidden: @@ -1036,25 +1099,29 @@ Additional Resources intermediate/ddp_tutorial intermediate/dist_tuto intermediate/FSDP_tutorial - intermediate/FSDP_adavnced_tutorial + intermediate/FSDP_advanced_tutorial + intermediate/TCPStore_libuv_backend + intermediate/TP_tutorial + intermediate/pipelining_tutorial intermediate/process_group_cpp_extension_tutorial intermediate/rpc_tutorial intermediate/rpc_param_server_tutorial - intermediate/dist_pipeline_parallel_tutorial intermediate/rpc_async_execution advanced/rpc_ddp_tutorial - intermediate/pipeline_tutorial - advanced/ddp_pipeline advanced/generic_join .. 
toctree:: :maxdepth: 2 :includehidden: :hidden: - :caption: Mobile + :caption: Edge with ExecuTorch - beginner/deeplabv3_on_ios - beginner/deeplabv3_on_android + Exporting to ExecuTorch Tutorial + Running an ExecuTorch Model in C++ Tutorial < https://pytorch.org/executorch/stable/running-a-model-cpp-tutorial.html> + Using the ExecuTorch SDK to Profile a Model + Building an ExecuTorch iOS Demo App + Building an ExecuTorch Android Demo App + Lowering a Model as a Delegate .. toctree:: :maxdepth: 2 @@ -1062,7 +1129,7 @@ Additional Resources :hidden: :caption: Recommendation Systems - intermediate/torchrec_tutorial + intermediate/torchrec_intro_tutorial advanced/sharding .. toctree:: diff --git a/intermediate_source/FSDP_adavnced_tutorial.rst b/intermediate_source/FSDP_advanced_tutorial.rst similarity index 92% rename from intermediate_source/FSDP_adavnced_tutorial.rst rename to intermediate_source/FSDP_advanced_tutorial.rst index 748c8593306..bf22e6efb50 100644 --- a/intermediate_source/FSDP_adavnced_tutorial.rst +++ b/intermediate_source/FSDP_advanced_tutorial.rst @@ -6,6 +6,23 @@ Wright `__, `Rohan Varma `__, `Yanli Zhao `__ +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * PyTorch's Fully Sharded Data Parallel Module: A wrapper for sharding module parameters across + data parallel workers. + + + + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 1.12 or later + * Read about the `FSDP API `__. + This tutorial introduces more advanced features of Fully Sharded Data Parallel (FSDP) as part of the PyTorch 1.12 release. To get familiar with FSDP, please @@ -13,18 +30,20 @@ refer to the `FSDP getting started tutorial `__. In this tutorial, we fine-tune a HuggingFace (HF) T5 model with FSDP for text -summarization as a working example. +summarization as a working example. 
The example uses Wikihow and for simplicity, we will showcase the training on a -single node, P4dn instance with 8 A100 GPUs. We will soon have a blog post on -large scale FSDP training on a multi-node cluster, please stay tuned for that on -the PyTorch medium channel. +single node, P4dn instance with 8 A100 GPUs. We now have several blog posts ( +`(link1), `__ +`(link2) `__) +and a `paper `__ on +large scale FSDP training on a multi-node cluster. FSDP is a production ready package with focus on ease of use, performance, and long-term support. One of the main benefits of FSDP is reducing the memory footprint on each GPU. This enables training of larger models with lower total memory vs DDP, and leverages the overlap of computation and communication to -train models efficiently. +train models efficiently. This reduced memory pressure can be leveraged to either train larger models or increase batch size, potentially helping overall training throughput. You can read more about PyTorch FSDP `here @@ -47,21 +66,21 @@ Recap on How FSDP Works At a high level FSDP works as follows: -*In constructor* +*In the constructor* * Shard model parameters and each rank only keeps its own shard -*In forward pass* +*In the forward pass* * Run `all_gather` to collect all shards from all ranks to recover the full - parameter for this FSDP unit Run forward computation -* Discard non-owned parameter shards it has just collected to free memory + parameter for this FSDP unit and run the forward computation +* Discard the non-owned parameter shards it has just collected to free memory -*In backward pass* +*In the backward pass* * Run `all_gather` to collect all shards from all ranks to recover the full - parameter in this FSDP unit Run backward computation -* Discard non-owned parameters to free memory. + parameter in this FSDP unit and run backward computation +* Discard non-owned parameters to free memory.
* Run reduce_scatter to sync gradients @@ -74,21 +93,17 @@ summarization using WikiHow dataset. The main focus of this tutorial is to highlight different available features in FSDP that are helpful for training large-scale models with more than 3B parameters. Also, we cover specific features for Transformer based models. The code for this tutorial is available in `PyTorch -Examples -`__. +examples +`__. *Setup* -1.1 Install PyTorch Nightlies - -We will install PyTorch nightlies, as some of the features such as activation -checkpointing is available in nightlies and will be added in next PyTorch -release after 1.12. +1.1 Install the latest PyTorch -.. code-block:: bash +.. code-block:: bash - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html + pip3 install torch torchvision torchaudio 1.2 Dataset Setup @@ -97,13 +112,13 @@ Please create a `data` folder, download the WikiHow dataset from `wikihowAll.csv `wikihowSep.csv `__, and place them in the `data` folder. We will use the wikihow dataset from `summarization_dataset -`__. +`__. Next, we add the following code snippets to a Python script “T5_training.py”. .. note:: The full source code for this tutorial is available in `PyTorch examples - `__. + `__. 1.3 Import necessary packages: @@ -154,7 +169,7 @@ Next, we add the following code snippets to a Python script “T5_training.py” import tqdm from datetime import datetime -1.4 Distributed training setup. +1.4 Distributed training setup. Here we use two helper functions to initialize the processes for distributed training, and then to clean up after training completion. In this tutorial, we are going to use torch elastic, using `torchrun @@ -191,13 +206,13 @@ metrics.
date_of_run = datetime.now().strftime("%Y-%m-%d-%I:%M:%S_%p") print(f"--> current date and time of run = {date_of_run}") return date_of_run - + def format_metrics_to_gb(item): """quick function to format numbers to gigabyte and round to 4 digit precision""" metric_num = item / g_gigabyte metric_num = round(metric_num, ndigits=4) return metric_num - + 2.2 Define a train function: @@ -275,7 +290,7 @@ metrics. .. code-block:: python - + def fsdp_main(args): model, tokenizer = setup_model("t5-base") @@ -292,7 +307,7 @@ metrics. #wikihow(tokenizer, type_path, num_samples, input_length, output_length, print_text=False) - train_dataset = wikihow(tokenizer, 'train', 1500, 512, 150, False) + train_dataset = wikihow(tokenizer, 'train', 1500, 512, 150, False) val_dataset = wikihow(tokenizer, 'validation', 300, 512, 150, False) sampler1 = DistributedSampler(train_dataset, rank=rank, num_replicas=world_size, shuffle=True) @@ -430,7 +445,7 @@ metrics. .. code-block:: python - + if __name__ == '__main__': # Training settings parser = argparse.ArgumentParser(description='PyTorch T5 FSDP Example') @@ -463,7 +478,7 @@ metrics. To run the training using torchrun: -.. code-block:: bash +.. code-block:: bash torchrun --nnodes 1 --nproc_per_node 4 T5_training.py @@ -487,7 +502,7 @@ communication efficient. In PyTorch 1.12, FSDP added this support and now we have a wrapping policy for transformers. It can be created as follows, where the T5Block represents the T5 transformer -layer class (holding MHSA and FFN). +layer class (holding MHSA and FFN). .. code-block:: python @@ -499,10 +514,10 @@ layer class (holding MHSA and FFN). }, ) torch.cuda.set_device(local_rank) - + model = FSDP(model, - fsdp_auto_wrap_policy=t5_auto_wrap_policy) + auto_wrap_policy=t5_auto_wrap_policy) To see the wrapped model, you can easily print the model and visually inspect the sharding and FSDP units as well.
@@ -513,22 +528,22 @@ Mixed Precision FSDP supports flexible mixed precision training allowing for arbitrary reduced precision types (such as fp16 or bfloat16). Currently BFloat16 is only available on Ampere GPUs, so you need to confirm native support before you use it. On -V100s for example, BFloat16 can still be run but due to it running non-natively, +V100s for example, BFloat16 can still be run but because it runs non-natively, it can result in significant slowdowns. To check if BFloat16 is natively supported, you can use the following : .. code-block:: python - + bf16_ready = ( torch.version.cuda - and torch.cuda.is_bf16_supported() + and torch.cuda.is_bf16_supported() and LooseVersion(torch.version.cuda) >= "11.0" and dist.is_nccl_available() and nccl.version() >= (2, 10) ) -One of the advantages of mixed percision in FSDP is providing granular control +One of the advantages of mixed precision in FSDP is providing granular control over different precision levels for parameters, gradients, and buffers as follows: @@ -571,7 +586,7 @@ with the following policy: .. code-block:: bash grad_bf16 = MixedPrecision(reduce_dtype=torch.bfloat16) - + In 2.4 we just add the relevant mixed precision policy to the FSDP wrapper: @@ -604,9 +619,9 @@ CPU-based initialization: auto_wrap_policy=t5_auto_wrap_policy, mixed_precision=bfSixteen, device_id=torch.cuda.current_device()) - - + + Sharding Strategy ----------------- FSDP sharding strategy by default is set to fully shard the model parameters, @@ -627,7 +642,7 @@ instead of "ShardingStrategy.FULL_SHARD" to the FSDP initialization as follows: sharding_strategy=ShardingStrategy.SHARD_GRAD_OP # ZERO2) This will reduce the communication overhead in FSDP, in this case, it holds full -parameters after forward and through the backwards pass. +parameters after forward and through the backwards pass. This saves an all_gather during backwards so there is less communication at the cost of a higher memory footprint. 
Note that full model params are freed at the @@ -652,12 +667,12 @@ wrapper in 2.4 as follows: mixed_precision=bfSixteen, device_id=torch.cuda.current_device(), backward_prefetch = BackwardPrefetch.BACKWARD_PRE) - + `backward_prefetch` has two modes, `BACKWARD_PRE` and `BACKWARD_POST`. `BACKWARD_POST` means that the next FSDP unit's params will not be requested until the current FSDP unit processing is complete, thus minimizing memory overhead. In some cases, using `BACKWARD_PRE` can increase model training speed -up to 2-10%, with even higher speed improvements noted for larger models. +up to 2-10%, with even higher speed improvements noted for larger models. Model Checkpoint Saving, by streaming to the Rank0 CPU ------------------------------------------------------ @@ -696,7 +711,7 @@ Pytorch 1.12 and used HF T5 as the running example. Using the proper wrapping policy especially for transformer models, along with mixed precision and backward prefetch should speed up your training runs. Also, features such as initializing the model on device, and checkpoint saving via streaming to CPU -should help to avoid OOM error in dealing with large models. +should help to avoid OOM error in dealing with large models. We are actively working to add new features to FSDP for the next release. 
If you have feedback, feature requests, questions or are encountering issues diff --git a/intermediate_source/FSDP_tutorial.rst b/intermediate_source/FSDP_tutorial.rst index d69a03b68be..8e5217c64a8 100644 --- a/intermediate_source/FSDP_tutorial.rst +++ b/intermediate_source/FSDP_tutorial.rst @@ -1,5 +1,5 @@ Getting Started with Fully Sharded Data Parallel(FSDP) -===================================================== +====================================================== **Author**: `Hamid Shojanazeri `__, `Yanli Zhao `__, `Shen Li `__ @@ -8,17 +8,17 @@ Getting Started with Fully Sharded Data Parallel(FSDP) Training AI models at a large scale is a challenging task that requires a lot of compute power and resources. It also comes with considerable engineering complexity to handle the training of these very large models. -`Pytorch FSDP `__, released in PyTorch 1.11 makes this easier. +`PyTorch FSDP `__, released in PyTorch 1.11 makes this easier. -In this tutorial, we show how to use `FSDP APIs `__, for simple MNIST models that can be extended to other larger models such as `HuggingFace BERT models `__, -`GPT 3 models up to 1T parameters `__ . The sample DDP MNIST code has been borrowed from `here `__. +In this tutorial, we show how to use `FSDP APIs `__, for simple MNIST models that can be extended to other larger models such as `HuggingFace BERT models `__, +`GPT 3 models up to 1T parameters `__ . The sample DDP MNIST code courtesy of `Patrick Hu `_. How FSDP works -------------- In `DistributedDataParallel `__, (DDP) training, each process/ worker owns a replica of the model and processes a batch of data, finally it uses all-reduce to sum up gradients over different workers. In DDP the model weights and optimizer states are replicated across all workers. FSDP is a type of data parallelism that shards model parameters, optimizer states and gradients across DDP ranks. -FSDP GPU memory footprint would be smaller than DDP across all workers. 
This makes the training of some very large models feasible and helps to fit larger models or batch sizes for our training job. This would come with the cost of increased communication volume. The communication overhead is reduced by internal optimizations like communication and computation overlapping. +When training with FSDP, the GPU memory footprint is smaller than when training with DDP across all workers. This makes the training of some very large models feasible by allowing larger models or batch sizes to fit on device. This comes with the cost of increased communication volume. The communication overhead is reduced by internal optimizations like overlapping communication and computation. .. figure:: /_static/img/distributed/fsdp_workflow.png :width: 100% @@ -27,7 +27,7 @@ FSDP GPU memory footprint would be smaller than DDP across all workers. This mak FSDP Workflow -At high level FSDP works as follow: +At a high level FSDP works as follow: *In constructor* @@ -46,24 +46,31 @@ At high level FSDP works as follow: * Run reduce_scatter to sync gradients * Discard parameters. +One way to view FSDP's sharding is to decompose the DDP gradient all-reduce into reduce-scatter and all-gather. Specifically, during the backward pass, FSDP reduces and scatters gradients, ensuring that each rank possesses a shard of the gradients. Then it updates the corresponding shard of the parameters in the optimizer step. Finally, in the subsequent forward pass, it performs an all-gather operation to collect and combine the updated parameter shards. + +.. figure:: /_static/img/distributed/fsdp_sharding.png + :width: 100% + :align: center + :alt: FSDP allreduce + + FSDP Allreduce + How to use FSDP --------------- -Here we use a toy model to run training on MNIST dataset for demonstration purposes. Similarly the APIs and logic can be applied to larger models for training. +--------------- +Here we use a toy model to run training on the MNIST dataset for demonstration purposes. 
The APIs and logic can be applied to training larger models as well. *Setup* -1.1 Install Pytorch along with Torchvision - -.. code-block:: bash +1.1 Install PyTorch along with Torchvision - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html +See the `Get Started guide `__ for information on installation. We add the following code snippets to a python script “FSDP_mnist.py”. 1.2 Import necessary packages .. note:: - This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy`. + This tutorial is intended for PyTorch versions 1.12 and later. If you are using an earlier version, replace all instances of `size_based_auto_wrap_policy` with `default_auto_wrap_policy` and `fsdp_auto_wrap_policy` with `auto_wrap_policy`. .. code-block:: python @@ -139,7 +146,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”. output = F.log_softmax(x, dim=1) return output -2.2 define a train function +2.2 Define a train function .. code-block:: python @@ -189,7 +196,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”. 2.4 Define a distributed train function that wraps the model in FSDP -**Note: to save the FSDP model, we need to call the state_dict on each rank then on Rank 0 save the overall states. This is only available in Pytorch nightlies, current Pytorch release is 1.11 at the moment.** +**Note: to save the FSDP model, we need to call the state_dict on each rank then on Rank 0 save the overall states.** .. code-block:: python @@ -244,13 +251,13 @@ We add the following code snippets to a python script “FSDP_mnist.py”. 
init_end_event.record() if rank == 0: + init_end_event.synchronize() print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") print(f"{model}") if args.save_model: # use a barrier to make sure training is done on all ranks dist.barrier() - # state_dict for FSDP model is only available on Nightlies for now states = model.state_dict() if rank == 0: torch.save(states, "mnist_cnn.pt") @@ -259,7 +266,7 @@ We add the following code snippets to a python script “FSDP_mnist.py”. -2.5 Finally parsing the arguments and setting the main function +2.5 Finally, parse the arguments and set the main function .. code-block:: python @@ -302,7 +309,7 @@ We have recorded cuda events to measure the time of FSDP model specifics. The CU CUDA event elapsed time on training loop 40.67462890625sec Wrapping the model with FSDP, the model will look as follows, we can see the model has been wrapped in one FSDP unit. -Alternatively, we will look at adding the fsdp_auto_wrap_policy next and will discuss the differences. +Alternatively, we will look at adding the auto_wrap_policy next and will discuss the differences. .. code-block:: bash @@ -319,7 +326,7 @@ Alternatively, we will look at adding the fsdp_auto_wrap_policy next and will di ) ) -Following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 gpus captured from Pytorch Profiler. +The following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. .. figure:: /_static/img/distributed/FSDP_memory.gif @@ -329,12 +336,12 @@ Following is the peak memory usage from FSDP MNIST training on g4dn.12.xlarge AW FSDP Peak Memory Usage -*Applying fsdp_auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency. 
+Applying *auto_wrap_policy* in FSDP otherwise, FSDP will put the entire model in one FSDP unit, which will reduce computation efficiency and memory efficiency. The way it works is that, suppose your model contains 100 Linear layers. If you do FSDP(model), there will only be one FSDP unit which wraps the entire model. In that case, the allgather would collect the full parameters for all 100 linear layers, and hence won't save CUDA memory for parameter sharding. Also, there is only one blocking allgather call for the all 100 linear layers, there will not be communication and computation overlapping between layers. -To avoid that, you can pass in an fsdp_auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit). +To avoid that, you can pass in an auto_wrap_policy, which will seal the current FSDP unit and start a new one automatically when the specified condition is met (e.g., size limit). In that way you will have multiple FSDP units, and only one FSDP unit needs to collect full parameters at a time. E.g., suppose you have 5 FSDP units, and each wraps 20 linear layers. Then, in the forward, the 1st FSDP unit will allgather parameters for the first 20 linear layers, do computation, discard the parameters and then move on to the next 20 linear layers. So, at any point in time, each rank only materializes parameters/grads for 20 linear layers instead of 100. @@ -352,9 +359,9 @@ Finding an optimal auto wrap policy is challenging, PyTorch will add auto tuning model = Net().to(rank) model = FSDP(model, - fsdp_auto_wrap_policy=my_auto_wrap_policy) + auto_wrap_policy=my_auto_wrap_policy) -Applying the FSDP_auto_wrap_policy, the model would be as follows: +Applying the auto_wrap_policy, the model would be as follows: .. 
code-block:: bash @@ -381,7 +388,7 @@ Applying the FSDP_auto_wrap_policy, the model would be as follows: CUDA event elapsed time on training loop 41.89130859375sec -Following is the peak memory usage from FSDP with auto_wrap policy of MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 gpus captured from Pytorch Profiler. +The following is the peak memory usage from FSDP with auto_wrap policy of MNIST training on a g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch Profiler. It can be observed that the peak memory usage on each device is smaller compared to FSDP without auto wrap policy applied, from ~75 MB to 66 MB. .. figure:: /_static/img/distributed/FSDP_autowrap.gif @@ -391,11 +398,11 @@ It can be observed that the peak memory usage on each device is smaller compared FSDP Peak Memory Usage using Auto_wrap policy -*CPU Off-loading*: In case the model is very large that even with FSDP wouldn't fit into gpus, then CPU offload can be helpful here. +*CPU Off-loading*: In case the model is very large that even with FSDP wouldn't fit into GPUs, then CPU offload can be helpful here. Currently, only parameter and gradient CPU offload is supported. It can be enabled via passing in cpu_offload=CPUOffload(offload_params=True). -Note that this currently implicitly enables gradient offloading to CPU in order for params and grads to be on the same device to work with the optimizer. This API is subject to change. Default is None in which case there will be no offloading. +Note that this currently implicitly enables gradient offloading to CPU in order for params and grads to be on the same device to work with the optimizer. This API is subject to change. The default is None in which case there will be no offloading. Using this feature may slow down the training considerably, due to frequent copying of tensors from host to device, but it could help improve memory efficiency and train larger scale models. 
@@ -405,11 +412,11 @@ In 2.4 we just add it to the FSDP wrapper .. code-block:: python model = FSDP(model, - fsdp_auto_wrap_policy=my_auto_wrap_policy, + auto_wrap_policy=my_auto_wrap_policy, cpu_offload=CPUOffload(offload_params=True)) -Compare it with DDP, if in 2.4 we just normally wrap the model in ddp, saving the changes in “DDP_mnist.py”. +Compare it with DDP, if in 2.4 we just normally wrap the model in DDP, saving the changes in “DDP_mnist.py”. .. code-block:: python @@ -423,7 +430,7 @@ Compare it with DDP, if in 2.4 we just normally wrap the model in ddp, saving th CUDA event elapsed time on training loop 39.77766015625sec -Following is the peak memory usage from DDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 gpus captured from Pytorch profiler. +The following is the peak memory usage from DDP MNIST training on g4dn.12.xlarge AWS EC2 instance with 4 GPUs captured from PyTorch profiler. .. figure:: /_static/img/distributed/DDP_memory.gif :width: 100% @@ -434,8 +441,8 @@ Following is the peak memory usage from DDP MNIST training on g4dn.12.xlarge AWS Considering the toy example and tiny MNIST model we defined here, we can observe the difference between peak memory usage of DDP and FSDP. -In DDP each process holds a replica of the model, so the memory footprint is higher compared to FSDP that shards the model parameter, optimizer states and gradients over DDP ranks. +In DDP each process holds a replica of the model, so the memory footprint is higher compared to FSDP which shards the model parameters, optimizer states and gradients over DDP ranks. The peak memory usage using FSDP with auto_wrap policy is the lowest followed by FSDP and DDP. -Also, looking at timings, considering the small model and running the training on a single machine, FSDP with/out auto_wrap policy performed almost as fast as DDP.
+Also, looking at timings, considering the small model and running the training on a single machine, FSDP with and without auto_wrap policy performed almost as fast as DDP. This example does not represent most of the real applications, for detailed analysis and comparison between DDP and FSDP please refer to this `blog post `__ . diff --git a/intermediate_source/TCPStore_libuv_backend.rst b/intermediate_source/TCPStore_libuv_backend.rst new file mode 100644 index 00000000000..1e285eba7c4 --- /dev/null +++ b/intermediate_source/TCPStore_libuv_backend.rst @@ -0,0 +1,286 @@ +Introduction to Libuv TCPStore Backend +====================================== +**Authors**: `Xilun Wu `_ + +.. note:: + |edit| View and edit this tutorial in `github `__. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * What is the new TCPStore backend + * Compare the new libuv backend against the legacy backend + * How to enable to use the legacy backend + + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 or later + * Read about the `TCPStore API `__. + + +Introduction +------------ + +Recently, we have rolled out a new TCPStore server backend using `libuv `__, a third-party library for asynchronous I/O. This new server backend aims to +address scalability and robustness challenges in large-scale distributed training jobs, such as those with more than 1024 ranks. We ran a series of +benchmarks to compare the libuv backend against the old one, and the experiment results demonstrated significant improvements in store initialization +time and maintained a comparable performance in store I/O operations. + +As a result of these findings, the libuv backend has been set as the default TCPStore server backend in PyTorch 2.4. This change is expected to enhance +the performance and scalability of distributed training jobs. 
+ +This change introduces a slight incompatibility to store initialization. For users who wish to continue using the legacy backend, the tutorial will +provide guidance on how to specify to use the previous TCPStore server backend. + + +Performance Benchmark +--------------------- + +To better demonstrate the benefit of our new libuv TCPStore backend, we set up a benchmark over a wide range of job size, from 1024 (1K) to 98304 (96K) ranks. +We first measured the TCPStore initialization time using the code snippet below: + +.. code:: python + + import logging + import os + + from time import perf_counter + + import torch + import torch.distributed as dist + + logger: logging.Logger = logging.getLogger(__name__) + + # Env var are preset when launching the benchmark + env_rank = os.environ.get("RANK", 0) + env_world_size = os.environ.get("WORLD_SIZE", 1) + env_master_addr = os.environ.get("MASTER_ADDR", "localhost") + env_master_port = os.environ.get("MASTER_PORT", "23456") + + start = perf_counter() + tcp_store = dist.TCPStore( + env_master_addr, + int(env_master_port), + world_size=int(env_world_size), + is_master=(int(env_rank) == 0), + ) + end = perf_counter() + time_elapsed = end - start + logger.info( + f"Complete TCPStore init with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." + ) + +Since the execution of the TCPStore server thread will be blocked until all clients are successfully connected, we take the time measured on rank 0 as the total +TCPStore initialization runtime. The experiment numbers are reported in the figure below: + +.. figure:: /_static/img/distributed/tcpstore_init_time.png + :width: 100% + :align: center + :alt: TCPStore Initialization Runtime Benchmark Result + +Figure 1. 
shows some significant evidence that the libuv backend is superior to the legacy backend: + +- TCPStore with libuv backend always has a faster initialization than the legacy backend, especially at super-large scale +- The legacy backend would timeout at server-client connecting at 96K scale (for example, over 30 minutes) while the libuv backend completed the initialization in 100 seconds. + +The second benchmark we did is to measure the runtime of TCPStore ``store_based_barrier`` operation: + +.. code:: python + + import logging + import os + import time + + from datetime import timedelta + from time import perf_counter + + import torch + import torch.distributed as dist + + DistStoreError = torch._C._DistStoreError + logger: logging.Logger = logging.getLogger(__name__) + + # since dist._store_based_barrier is a private function and cannot be directly called, we need to write a function which does the same + def store_based_barrier( + rank, + store, + group_name, + rendezvous_count, + timeout=dist.constants.default_pg_timeout, + logging_interval=timedelta(seconds=10), + ): + store_key = f"store_based_barrier_key:{group_name}" + store.add(store_key, 1) + + world_size = rendezvous_count + worker_count = store.add(store_key, 0) + + last_worker_key = f"{store_key}:last_worker" + if worker_count == world_size: + store.set(last_worker_key, "1") + + start = time.time() + while True: + try: + # This will throw an exception after the logging_interval in which we print out + # the status of the group or time out officially, throwing runtime error + store.wait([last_worker_key], logging_interval) + break + except RuntimeError as e: + worker_count = store.add(store_key, 0) + # Print status periodically to keep track. 
+ logger.info( + "Waiting in store based barrier to initialize process group for " + "rank: %s, key: %s (world_size=%s, num_workers_joined=%s, timeout=%s)" + "error: %s", + rank, + store_key, + world_size, + worker_count, + timeout, + e, + ) + + if timedelta(seconds=(time.time() - start)) > timeout: + raise DistStoreError( + "Timed out initializing process group in store based barrier on " + "rank {}, for key: {} (world_size={}, num_workers_joined={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout + ) + ) + + logger.info( + "Rank %s: Completed store-based barrier for key:%s with %s nodes.", + rank, + store_key, + world_size, + ) + + # Env var are preset when launching the benchmark + env_rank = os.environ.get("RANK", 0) + env_world_size = os.environ.get("WORLD_SIZE", 1) + env_master_addr = os.environ.get("MASTER_ADDR", "localhost") + env_master_port = os.environ.get("MASTER_PORT", "23456") + + tcp_store = dist.TCPStore( + env_master_addr, + int(env_master_port), + world_size=int(env_world_size), + is_master=(int(env_rank) == 0), + ) + + # sync workers + store_based_barrier(int(env_rank), tcp_store, "tcpstore_test", int(env_world_size)) + + number_runs = 10 + start = perf_counter() + for _ in range(number_runs): + store_based_barrier( + int(env_rank), tcp_store, "tcpstore_test", int(env_world_size) + ) + end = perf_counter() + time_elapsed = end - start + logger.info( + f"Complete {number_runs} TCPStore barrier runs with rank={env_rank}, world_size={env_world_size} in {time_elapsed} seconds." + ) + +We compute the average by dividing the runtime measured on rank 0 by ``number_runs`` and report it in the figure below: + +.. figure:: /_static/img/distributed/tcpstore_barrier_time.png + :width: 100% + :align: center + :alt: TCPStore Barrier Runtime Benchmark Result + +Figure 2. 
shows that the I/O performance of libuv backend is comparable to the legacy backend: + +- The libuv backend has a comparable performance over the whole spectrum in terms of the number of ranks +- The libuv backend runtime is more stable than the legacy backend as the number of ranks grows + + +Impact +------ + +One incompatibility that users may need to pay attention to is that TCPStore currently does not support initialization with a ``listen_fd`` when using libuv backend. +If the user wants to keep using this initialization method, the user can simply pass ``use_libuv=False`` to stay with the old TCPStore backend. + +.. code:: python + + import socket + + import torch + import torch.distributed as dist + + listen_sock: socket.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + listen_sock.bind(("localhost", 0)) + addr, port, *_ = listen_sock.getsockname() + listen_fd = listen_sock.detach() + + tcpstore = dist.TCPStore(addr, port, 1, True, master_listen_fd=listen_fd) # expect NotImplementedError + tcpstore = dist.TCPStore(addr, port, 1, True, master_listen_fd=listen_fd, use_libuv=False) # OK. Use legacy backend + + +Exit Route 1: Pass ``use_libuv=False`` to TCPStore Initialization +----------------------------------------------------------------- + +As the above code snippet shows, if user calls TCPStore init method to create a store, simply passing ``use_libuv=False`` allows user to remain using the old +TCPStore backend. This override has the highest priority over other approaches determining which backend the TCPStore server should choose. + + +Exit Route 2: Add ``use_libuv=0`` to ``init_method`` at ProcessGroup Initialization +----------------------------------------------------------------------------------- + +``ProcessGroup`` creates a TCPStore if user does not explicitly pass one to its initialization. User can add the query option ``use_libuv=0`` to ``init_method`` when +initializing the ``ProcessGroup``.
This approach has lower priority than Exit Route 1. + +.. code:: python + + import torch + import torch.distributed as dist + + addr = "localhost" + port = 23456 + dist.init_process_group( + backend="cpu:gloo,cuda:nccl", + rank=0, + world_size=1, + init_method=f"tcp://{addr}:{port}?use_libuv=0", + ) + dist.destroy_process_group() + + +Exit Route 3: Set Environment Variable ``USE_LIBUV`` to ``0`` +------------------------------------------------------------- + +When ProcessGroup creates a TCPStore, it also checks the environment variable ``USE_LIBUV`` to determine which TCPStore backend to use. User can set the environment +variable ``"USE_LIBUV"`` to ``"0"`` to specify the use of old TCPStore backend. This approach has lower priority than Exit Route 2. For example, if the user sets environment +variable ``USE_LIBUV`` to ``1`` and also passes ``use_libuv=0`` in ``init_method``, then the old store backend will be chosen. + +.. code:: python + + import os + + import torch + import torch.distributed as dist + + addr = "localhost" + port = 23456 + os.environ["USE_LIBUV"] = "0" + dist.init_process_group( + backend="cpu:gloo,cuda:nccl", + rank=0, + world_size=1, + init_method=f"tcp://{addr}:{port}", + ) + dist.destroy_process_group() + + +Conclusion +---------- +In PyTorch 2.4, we made the new libuv TCPStore backend the default. Although the new backend has incompatibility with initialization from a ``listen_fd``, it +shows significant performance improvement on store initialization at large-scale and comparable performance on store I/O at small/medium/large scales, which +brings a major benefit to Distributed Training's control plane. This tutorial explains our motivation, goes through the performance benchmark, notifies users +of the potential impact, and introduces three exit routes to remain using the legacy backend. In the long term, we aim to eventually deprecate the legacy backend.
diff --git a/intermediate_source/TP_tutorial.rst b/intermediate_source/TP_tutorial.rst new file mode 100644 index 00000000000..91e64a87488 --- /dev/null +++ b/intermediate_source/TP_tutorial.rst @@ -0,0 +1,361 @@ +Large Scale Transformer model training with Tensor Parallel (TP) +================================================================ + +**Author**: `Wanchao Liang `__, `Tianyu Liu `__ + +.. note:: + |edit| View and edit this tutorial in `github `__. + +This tutorial demonstrates how to train a large Transformer-like model across hundreds to thousands of GPUs using Tensor Parallel and Fully Sharded Data Parallel. + +Prerequisites: + +- PyTorch 2.3.0 or later installed with CUDA/Linux +- `Tensor Parallel APIs `__ +- `Getting Started with DeviceMesh `__ +- `Getting Started with Fully Sharded Data Parallel `__ + + +How Tensor Parallel works? +-------------------------- +Tensor Parallel (TP) was originally proposed in the `Megatron-LM `__ paper, +and it is an efficient model parallelism technique to train large scale Transformer models. +`Sequence Parallel `__ (SP) we mention in this tutorial is a variant of Tensor +Parallel that shards on the sequence dimension for ``nn.LayerNorm`` or ``RMSNorm`` to further save activation memory +during training. As the model becomes larger, the activation memory becomes the bottleneck, so in Tensor +Parallel training it usually applies Sequence Parallel to ``LayerNorm`` or ``RMSNorm`` layers. + +.. figure:: /_static/img/distributed/megatron_lm.png + :width: 100% + :align: center + :alt: Megatron-LM TP + + Figure 1. represents the sharding in Tensor Parallel style on a Transformer model’s MLP and Self-Attention layer, where the matrix multiplications in both attention/MLP happen through sharded computations (`image source `__) + + +At a high level, PyTorch Tensor Parallel works as follows: + +**Sharding initialization** + +* Determine which ``ParallelStyle`` to apply to each layer and shard the initialized module by calling ``parallelize_module``.
+* The parallelized modules would have their model parameters be swapped to DTensors, and DTensor would be responsible to run the parallelized module using sharded computation. + +**Runtime forward/backward** + +* Depending on the input/outputs DTensor layouts user specified for each ``ParallelStyle``, it would run proper communication operation to transform the DTensor layouts for inputs/outputs (such as ``allreduce``, ``allgather`` and ``reduce_scatter``). +* Run sharded computation for the parallelized layers to save compute/memory (for example, ``nn.Linear``, ``nn.Embedding``). + + +When and Why you should apply Tensor Parallel +--------------------------------------------- +The PyTorch Fully Sharded Data Parallel (FSDP) already has the capability to scale model training to a specific +number of GPUs. However, when it comes to further scaling the model training in terms of model size and GPU quantity, +many additional challenges arise that may require combining Tensor Parallel with FSDP: + +1. As the world size (number of GPUs) is becoming excessively large (exceeding 128/256 GPUs), the FSDP collectives (such as ``allgather``) are being dominated by ring latency. + By implementing TP/SP on top of FSDP, the FSDP world size could be reduced by 8 by applying FSDP to be inter-host only, consequently decreasing the latency costs by the same amount. +2. Hit data parallelism limit where you can not raise the global batch size to be above the number of GPUs due to both convergence and GPU memory limitations, Tensor/Sequence Parallel + is the only known way to “ballpark” the global batch size and continue scaling with more GPUs. This means both model size and number of GPUs could continue to scale. +3. For certain types of models, when local batch size becomes smaller, TP/SP can yield matrix multiplication shapes that are more optimized for floating point operations (FLOPS). + +So, when pre-training, how easy is it to hit those limits?
As of now, pre-training a Large Language Model (LLM) with billions or trillions of tokens could take months, even when using thousands of GPUs. + +* It will always hit limitation 1 when training LLM on a large scale. For example, Llama 2 70B trained with 2k GPUs for 35 days, multi-dimensional parallelisms are needed at 2k scale. +* When the Transformer model becomes larger (such as Llama2 70B), it will also quickly hit the limitation 2. One could not use FSDP alone with even local ``batch_size=1`` due to memory + and convergence constraints. For example, Llama 2 global batch size is 1K, so data parallelism alone can not be used at 2K GPUs. + + +How to apply Tensor Parallel +---------------------------- + +PyTorch Tensor Parallel APIs offers a set of module level primitives (``ParallelStyle``) to configure the sharding for each individual layers of the model, including: + +* ``ColwiseParallel`` and ``RowwiseParallel``: Shard the ``nn.Linear`` and ``nn.Embedding`` in the column or row fashion. +* ``SequenceParallel``: Perform sharded computations on ``nn.LayerNorm``, ``nn.Dropout``, ``RMSNormPython``, etc. +* ``PrepareModuleInput`` and ``PrepareModuleOutput``: Configure the module inputs/outputs sharding layouts with proper communication operations. + +To demonstrate how to use the PyTorch native Tensor Parallel APIs, let us look at a common Transformer model. In this tutorial, we use the most recent `Llama2 model `__ as a reference Transformer model implementation, as it is also widely used in the community. + +Since Tensor Parallel shard individual tensors over a set of devices, we would need to set up the distributed environment (such as NCCL communicators) first. +Tensor Parallelism is a Single-Program Multiple-Data (SPMD) sharding algorithm similar to PyTorch DDP/FSDP, and it under the hood leverages the PyTorch DTensor +to perform sharding. 
It also utilizes the DeviceMesh abstraction (which under the hood manages ProcessGroups) for device management and sharding. +To see how to utilize DeviceMesh to set up multi-dimensional parallelisms, please refer to `this tutorial `__. Tensor Parallel usually works within each host, so let us first initialize a DeviceMesh that connects 8 GPUs within a host. + +.. code-block:: python + + from torch.distributed.device_mesh import init_device_mesh + + tp_mesh = init_device_mesh("cuda", (8,)) + + +Now that we have initialized DeviceMesh, let us take a detailed look at the Llama 2 model architecture and see how we should perform the Tensor Parallel sharding. +Here we focus on the core ``TransformerBlock``, where the Transformer model stacks the identical ``TransformerBlock`` s to scale up the model. + +The core ``TransformerBlock`` consists of an ``Attention`` layer and a ``FeedForward`` layer. Let us first look at the simpler ``FeedForward`` layer. +For the ``FeedForward`` Layer it consists of three Linear layers, where it performs a SwiGLU style MLP, looking at its forward function: + +.. code-block:: python + + # forward in the FeedForward layer + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +It performs ``w1`` and ``w3`` matmuls concurrently and followed by a ``w2`` matmul with the result of the combined w1/w3 linear projection results. This means we could +use the idea from the Tensor Parallelism paper to shard the w1/w3 Linear layers in the colwise fashion and shard the ``w2`` Linear layer in the rowwise fashion, so that +there is only one ``allreduce`` communication happening at the end of all the three layers. With the PyTorch native Tensor Parallel, we can simply create a ``parallelize_plan`` for the ``FeedForward`` layer like below: + +.. 
code-block:: python + + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module + + layer_tp_plan = { + # by default ColwiseParallel input layouts is replicated + # and RowwiseParallel output layouts is replicated + "feed_forward.w1": ColwiseParallel(), + "feed_forward.w2": RowwiseParallel(), + "feed_forward.w3": ColwiseParallel(), + } + + +That's simply how we configure the shardings for the ``FeedForward`` layer using the PyTorch Tensor Parallel APIs. Note that users would only need to specify how to shard the individual layers and the communications (for example, ``allreduce``) will happen under the hood. + +Moving on to the ``Attention`` Layer. It consists of ``wq``, ``wk``, ``wv`` Linear layers to project input to ``q``/ ``k`` / ``v``, and then it performs attention and output projection with the ``wo`` Linear layer. Tensor Parallelism here intends to perform column-wise sharding for the +q/k/v projection and row-wise sharding for the ``wo`` linear projection. So we can add the Attention plan to the ``layer_tp_plan`` that we just drafted up: + +.. code-block:: python + + layer_tp_plan = { + # by default ColwiseParallel input layouts is replicated + # and RowwiseParallel output layouts is replicated + "attention.wq": ColwiseParallel(), + "attention.wk": ColwiseParallel(), + "attention.wv": ColwiseParallel(), + "attention.wo": RowwiseParallel(), + "feed_forward.w1": ColwiseParallel(), + "feed_forward.w2": RowwiseParallel(), + "feed_forward.w3": ColwiseParallel(), + } + + +This is almost the ``layer_tp_plan`` we need to apply Tensor Parallelism to the ``TransformerBlock``. However, one thing we should be aware of is that when sharding the linear layer column-wise, the output of the linear layers would become sharded on the last tensor dimension, and the row-wise sharding linear layer directly accepts an input that is sharded on the last dimension. 
+If there are any more tensor operations (such as view operations) between the column-wise linear and the row-wise linear, we would need to adjust the relevant shape related ops to sharded shape. + +For the Llama model, in the attention layer there are couple of view operations that are shape related. In particular, column-wise parallel for ``wq``/ ``wk``/ ``wv`` linear layers, the activation tensor is sharded on the ``num_heads`` dimension, so we would need to adjust the ``num_heads`` to local ``num_heads``. + +Finally, we need to call ``parallelize_module`` API to make the plan for each ``TransformerBlock`` effective. Under the hood, it distributes the model parameters inside ``Attention`` and ``FeedForward`` layers to DTensors, and registers communication hooks for model inputs and outputs (before and after each module respectively), if necessary: + +.. code-block:: python + + for layer_id, transformer_block in enumerate(model.layers): + layer_tp_plan = {...} # i.e. the plan we just generated + + # Adjust attention module to use the local number of heads + attn_layer = transformer_block.attention + attn_layer.n_heads = attn_layer.n_heads // tp_mesh.size() + attn_layer.n_kv_heads = attn_layer.n_kv_heads // tp_mesh.size() + + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=layer_tp_plan, + ) + +Now that we have elaborated the sharding plan for each ``TransformerBlock``, there is usually a ``nn.Embedding`` in the first layer and a final ``nn.Linear`` projection layer, where user could choose row-wise or column-wise sharding to the first ``nn.Embedding`` and column-wise sharding to the last ``nn.Linear`` projection layer with proper input and output layouts specified. +Here is an example: + +.. code-block:: python + + model = parallelize_module( + model, + tp_mesh, + { + "tok_embeddings": RowwiseParallel( + input_layouts=Replicate(), + ), + "output": ColwiseParallel( + output_layouts=Replicate(), + ), + } + ) + +.. 
note:: + If the model to be partitioned is too large to fit into CPU memory, one could either use ``meta`` device initialization (for example, initialize the model on meta device first, shard the layers, and then materialize the model), or parallelize the ``TransformerBlock`` layer by layer during the Transformer model initialization. + +Apply Sequence Parallel to ``LayerNorm/RMSNorm`` layers +------------------------------------------------------- + +Sequence Parallel works on top of the Tensor Parallel illustrated above. Compared with basic Tensor Parallel, which only shards tensors within the ``Attention`` modules and ``FeedForward`` modules and keeps their module inputs and outputs (namely activations in the forward pass and gradients in the backward pass) replicated, Sequence Parallel keeps them sharded on the sequence dimension. + +In a typical ``TransformerBlock``, the forward function combines norm layers (``LayerNorm`` or ``RMSNorm``), an attention layer, a feed forward layer, and residual connections. For example: + +.. code-block:: python + + # forward in a TransformerBlock + def forward(self, x): + h = x + self.attention(self.attention_norm(x)) + out = h + self.feed_forward(self.ffn_norm(h)) + return out + +In most use cases, the activations (and gradients) are of the shape ``[batch size, sequence length, hidden dimension]`` outside the ``Attention`` and ``FeedForward`` modules. In the DTensor’s language, Sequence Parallel performs activation computation using the ``Shard(1)`` layout for both forward/backward of the module. +Following the code example earlier, the code below demonstrates how we apply Sequence Parallel to the norm layers within a ``TransformerBlock``: + +First let's import the required dependencies for Sequence Parallel: + +.. 
code-block:: python + + from torch.distributed.tensor.parallel import ( + PrepareModuleInput, + SequenceParallel, + ) + + +Next let's adjust the ``layer_tp_plan`` to enable sequence parallel on the ``RMSNorm`` layers: + +.. code-block:: python + + layer_tp_plan = { + # Now the input and output of SequenceParallel has Shard(1) layouts, + # to represent the input/output tensors sharded on the sequence dimension + "attention_norm": SequenceParallel(), + "attention": PrepareModuleInput( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + "attention.wq": ColwiseParallel(), + "attention.wk": ColwiseParallel(), + "attention.wv": ColwiseParallel(), + "attention.wo": RowwiseParallel(output_layouts=Shard(1)), + "ffn_norm": SequenceParallel(), + "feed_forward": PrepareModuleInput( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + "feed_forward.w1": ColwiseParallel(), + "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)), + "feed_forward.w3": ColwiseParallel(), + } + + +One can see we now use ``PrepareModuleInput`` to modify the module input layouts to the Attention and FeedForward layers from ``Shard(1)`` to ``Replicate()``, and mark their output layouts as ``Shard(1)``. +Just like what happens to Tensor Parallelism, one only needs to specify the tensor sharding layouts of the inputs and outputs, and the communication between layers will happen automatically. + +Note that with Sequence Parallel, we assume the inputs and outputs of a ``TransformerBlock`` are always sharded on the sequence dimension, so that multiple ``TransformerBlocks`` can be concatenated seamlessly. +This can be facilitated by explicitly specifying the output of the beginning ``nn.Embedding`` layer and the input of the final ``nn.Linear`` projection layer to be ``Shard(1)``: + +.. 
code-block:: python + + model = parallelize_module( + model, + tp_mesh, + { + "tok_embeddings": RowwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(1), + ), + "norm": SequenceParallel(), + "output": ColwiseParallel( + input_layouts=Shard(1), + output_layouts=Replicate() + ), + } + ) + + +Apply Loss Parallel +------------------- + +Loss Parallel is a related technique to save memory and communication when the loss function is computed, as model outputs are usually very large. In Loss Parallel, when the model outputs are sharded on the (often huge) vocabulary dimension, the cross-entropy loss can be computed efficiently, without gathering all the model outputs to every single GPU. This not only significantly reduces the memory consumption, but also improves training speed by reducing communication overhead and doing sharded computation in parallel. The picture below briefly illustrates how Loss Parallel avoids gathering all model outputs to every GPU by doing sharded computation. + +.. figure:: /_static/img/distributed/loss_parallel.png + :width: 100% + :align: center + :alt: loss parallel + + Figure 2. Cross-entropy loss forward computation with loss parallel on one GPU. Blue represents sharded tensors; green represents replicated tensors; yellow represents tensors with partial values (to be all-reduced). Black arrows are local computations; red arrows are functional collectives among GPUs. + +In the PyTorch Tensor Parallel API, Loss Parallel can be enabled via a context manager ``loss_parallel``, with which one can directly use ``torch.nn.functional.cross_entropy`` or ``torch.nn.CrossEntropyLoss`` without modifying other parts of their code. + +To apply Loss Parallel, the model predictions, usually of the shape ``[batch size, sequence length, vocabulary size]``, should be sharded on the vocabulary dimension. This can be easily done via marking the output layouts of the last linear projection layer output: + +.. 
code-block:: python + + model = parallelize_module( + model, + tp_mesh, + { + "tok_embeddings": RowwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(1), + ), + "norm": SequenceParallel(), + "output": ColwiseParallel( + input_layouts=Shard(1), + # use DTensor as the output + use_local_output=False, + ), + }, + ) + +In the code above, we also apply Sequence Parallel to the norm layer before output. We apply ``use_local_output=False`` to let the output stay as a DTensor, to work with the ``loss_parallel`` context manager. After that, one can simply call the cross_entropy loss function as is shown below. Note that the backward computation also needs to happen within the context. + +.. code-block:: python + + import torch.nn.functional as F + from torch.distributed.tensor.parallel import loss_parallel + + pred = model(input_ids) + with loss_parallel(): + # assuming pred and labels are of the shape [batch, seq, vocab] + loss = F.cross_entropy(pred.flatten(0, 1), labels.flatten(0, 1)) + loss.backward() + + +Combine Tensor Parallel with Fully Sharded Data Parallel together +----------------------------------------------------------------- + + +Now that we have shown how to apply Tensor/Sequence Parallel to the model, let us also take a look at how Tensor Parallel and Fully Sharded Data Parallel could work together. +Since Tensor Parallelism incurs communications that block the computation, we want to make sure it runs within a fast communication channel, such as NVLink. +In practice, we usually apply Tensor Parallel within each host, and apply Fully Sharded Data Parallel across the hosts. + +.. figure:: /_static/img/distributed/fsdp_tp.png + :width: 100% + :align: center + :alt: fsdp + tp + + Figure 3. FSDP and TP work on separate device dimensions, FSDP communication happens inter-host and TP communication happens intra-host. 
+ + +This 2-D parallelism pattern can be easily expressed via a 2-D DeviceMesh, and we just need to pass each “sub” DeviceMesh to each individual parallelism API: + +.. code-block:: python + + from torch.distributed.device_mesh import init_device_mesh + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + # i.e. 2-D mesh is [dp, tp], training on 64 GPUs that performs 8 way DP and 8 way TP + # the mesh dimensions must be named so that the submeshes can be looked up by name below + mesh_2d = init_device_mesh("cuda", (8, 8), mesh_dim_names=("dp", "tp")) + tp_mesh = mesh_2d["tp"] # a submesh that connects intra-host devices + dp_mesh = mesh_2d["dp"] # a submesh that connects inter-host devices + + model = Model(...) + + tp_plan = {...} + + # apply Tensor Parallel intra-host on tp_mesh + model_tp = parallelize_module(model, tp_mesh, tp_plan) + # apply FSDP inter-host on dp_mesh + model_2d = FSDP(model_tp, device_mesh=dp_mesh, use_orig_params=True, ...) + + +This would allow us to easily apply Tensor Parallel within each host (intra-host) and apply FSDP across hosts (inter-hosts), with **0-code changes** to the Llama model. +The Tensor(Model) Parallel and Data Parallel techniques combined together provide the ability to continue increasing model size and training efficiently using a large number of GPUs. + +Conclusion +---------- +This tutorial demonstrates how to train a large Transformer-like model across hundreds to thousands of GPUs using Tensor Parallel in combination with Fully Sharded Data Parallel. +It explains how to apply Tensor Parallel to different parts of the model, with **no code changes** to the model itself. Tensor Parallel is an efficient model parallelism technique for large scale training. + +To see the complete end-to-end code example explained in this tutorial, please refer to the `Tensor Parallel examples `__ in the pytorch/examples repository. 
diff --git a/intermediate_source/_torch_export_nightly_tutorial.py b/intermediate_source/_torch_export_nightly_tutorial.py new file mode 100644 index 00000000000..fdbe18392e5 --- /dev/null +++ b/intermediate_source/_torch_export_nightly_tutorial.py @@ -0,0 +1,635 @@ +# -*- coding: utf-8 -*- + +""" +torch.export Nightly Tutorial +============================= +**Author:** William Wen, Zhengxu Chen, Angela Yi +""" + +###################################################################### +# +# .. warning:: +# +# ``torch.export`` and its related features are in prototype status and are subject to backwards compatibility +# breaking changes. This tutorial provides a snapshot of ``torch.export`` usage as of PyTorch 2.1. +# +# :func:`torch.export` is the PyTorch 2.X way to export PyTorch models into +# standardized model representations, intended +# to be run on different (i.e. Python-less) environments. +# +# In this tutorial, you will learn how to use :func:`torch.export` to extract +# ``ExportedProgram``'s (i.e. single-graph representations) from PyTorch programs. +# We also detail some considerations/modifications that you may need +# to make in order to make your model compatible with ``torch.export``. +# +# **Contents** +# +# .. contents:: +# :local: + +###################################################################### +# Basic Usage +# ----------- +# +# ``torch.export`` extracts single-graph representations from PyTorch programs +# by tracing the target function, given example inputs. +# ``torch.export.export()`` is the main entry point for ``torch.export``. +# +# In this tutorial, ``torch.export`` and ``torch.export.export()`` are practically synonymous, +# though ``torch.export`` generally refers to the PyTorch 2.X export process, and ``torch.export.export()`` +# generally refers to the actual function call. +# +# The signature of ``torch.export.export()`` is: +# +# .. 
code:: python +# +# export( +# f: Callable, +# args: Tuple[Any, ...], +# kwargs: Optional[Dict[str, Any]] = None, +# *, +# dynamic_shapes: Optional[Dict[str, Dict[int, Dim]]] = None +# ) -> ExportedProgram +# +# ``torch.export.export()`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` +# and wraps it in an ``ExportedProgram``, which can be serialized or executed later with +# different inputs. Note that while the output ``ExportedProgram`` is callable and can be +# called in the same way as the original input callable, it is not a ``torch.nn.Module``. +# We will detail the ``dynamic_shapes`` argument later in the tutorial. + +import torch +from torch.export import export + +class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.lin = torch.nn.Linear(100, 10) + + def forward(self, x, y): + # ``inplace=True`` makes this a mutating ReLU call. + return torch.nn.functional.relu(self.lin(x + y), inplace=True) + +mod = MyModule() +exported_mod = export(mod, (torch.randn(8, 100), torch.randn(8, 100))) +print(type(exported_mod)) +print(exported_mod(torch.randn(8, 100), torch.randn(8, 100))) + +###################################################################### +# Let's review some attributes of ``ExportedProgram`` that are of interest. +# +# The ``graph`` attribute is an `FX graph `__ +# traced from the function we exported, that is, the computation graph of all PyTorch operations. +# The FX graph has some important properties: +# +# - The operations are "ATen-level" operations. +# - The graph is "functionalized", meaning that no operations are mutations. +# +# The ``graph_module`` attribute is the ``GraphModule`` that wraps the ``graph`` attribute +# so that it can be run as a ``torch.nn.Module``. + +print(exported_mod) +print(exported_mod.graph_module) + +###################################################################### +# The printed code shows that the FX graph only contains ATen-level ops (such as ``torch.ops.aten``) +# and that mutations were removed. 
For example, the mutating op ``torch.nn.functional.relu(..., inplace=True)`` +# is represented in the printed code by ``torch.ops.aten.relu.default``, which does not mutate. +# Future uses of input to the original mutating ``relu`` op are replaced by the additional new output +# of the replacement non-mutating ``relu`` op. +# +# Other attributes of interest in ``ExportedProgram`` include: +# +# - ``graph_signature`` -- the inputs, outputs, parameters, buffers, etc. of the exported graph. +# - ``range_constraints`` and ``equality_constraints`` -- constraints, covered later + +print(exported_mod.graph_signature) + +###################################################################### +# See the ``torch.export`` `documentation `__ +# for more details. + +###################################################################### +# Graph Breaks +# ------------ +# +# Although ``torch.export`` shares components with ``torch.compile``, +# the key limitation of ``torch.export``, especially when compared to ``torch.compile``, is that it does not +# support graph breaks. This is because handling graph breaks involves interpreting +# the unsupported operation with default Python evaluation, which is incompatible +# with the export use case. Therefore, in order to make your model code compatible +# with ``torch.export``, you will need to modify your code to remove graph breaks. 
+# +# A graph break is necessary in cases such as: +# +# - data-dependent control flow + +def bad1(x): + # Which branch runs depends on the values stored in ``x``. + if x.sum() > 0: + return torch.sin(x) + return torch.cos(x) + +import traceback as tb +# Each ``export`` call in this section is wrapped so that a failure prints its traceback and the script continues. +try: + export(bad1, (torch.randn(3, 3),)) +except Exception: + tb.print_exc() + +###################################################################### +# - accessing tensor data with ``.data`` + +def bad2(x): + # Writes into the tensor through ``.data``. + x.data[0, 0] = 3 + return x + +try: + export(bad2, (torch.randn(3, 3),)) +except Exception: + tb.print_exc() + +###################################################################### +# - calling unsupported functions (such as many built-in functions) + +def bad3(x): + x = x + 1 + # ``id`` is a Python built-in function. + return x + id(x) + +try: + export(bad3, (torch.randn(3, 3),)) +except Exception: + tb.print_exc() + +###################################################################### +# - unsupported Python language features (e.g. throwing exceptions, match statements) + +def bad4(x): + # Raises and then catches an exception inside the function being exported. + try: + x = x + 1 + raise RuntimeError("bad") + except: + x = x + 2 + return x + +try: + export(bad4, (torch.randn(3, 3),)) +except Exception: + tb.print_exc() + +###################################################################### +# The sections below demonstrate some ways you can modify your code +# in order to remove graph breaks. + +###################################################################### +# Control Flow Ops +# ---------------- +# +# ``torch.export`` actually does support data-dependent control flow. +# But these need to be expressed using control flow ops. 
For example, +# we can fix the control flow example above using the ``cond`` op, like so: + +from functorch.experimental.control_flow import cond + +def bad1_fixed(x): + def true_fn(x): + return torch.sin(x) + def false_fn(x): + return torch.cos(x) + # Arguments: predicate, true branch, false branch, list of operands passed to the branch. + return cond(x.sum() > 0, true_fn, false_fn, [x]) + +exported_bad1_fixed = export(bad1_fixed, (torch.randn(3, 3),)) +# All-ones input has a positive sum, so the predicate selects ``true_fn`` (``sin``). +print(exported_bad1_fixed(torch.ones(3, 3))) +# All-minus-ones input has a negative sum, so the predicate selects ``false_fn`` (``cos``). +print(exported_bad1_fixed(-torch.ones(3, 3))) + +###################################################################### +# There are limitations to ``cond`` that one should be aware of: +# +# - The predicate (i.e. ``x.sum() > 0``) must result in a boolean or a single-element tensor. +# - The operands (i.e. ``[x]``) must be tensors. +# - The branch function (i.e. ``true_fn`` and ``false_fn``) signature must match with the +# operands and they must both return a single tensor with the same metadata (for example, ``dtype``, ``shape``, etc.). +# - Branch functions cannot mutate input or global variables. +# - Branch functions cannot access closure variables, except for ``self`` if the function is +# defined in the scope of a method. +# +# For more details about ``cond``, check out the `documentation `__. + +###################################################################### +# .. +# [NOTE] map is not documented at the moment +# We can also use ``map``, which applies a function across the first dimension +# of the first tensor argument. 
+# +# from functorch.experimental import control_flow +# +# def map_example(xs): +# def map_fn(x, const): +# def true_fn(x): +# return x + const +# def false_fn(x): +# return x - const +# return control_flow.cond(x.sum() > 0, true_fn, false_fn, [x]) +# return control_flow.map(map_fn, xs, torch.tensor([2.0])) +# +# exported_map_example= export(map_example, (torch.randn(4, 3),)) +# inp = torch.cat((torch.ones(2, 3), -torch.ones(2, 3))) +# print(exported_map_example(inp)) + +###################################################################### +# Constraints/Dynamic Shapes +# -------------------------- +# +# Ops can have different specializations/behaviors for different tensor shapes, so by default, +# ``torch.export`` requires inputs to ``ExportedProgram`` to have the same shape as the respective +# example inputs given to the initial ``torch.export.export()`` call. +# If we try to run the ``ExportedProgram`` in the example below with a tensor +# with a different shape, we get an error: + +class MyModule2(torch.nn.Module): + def __init__(self): + super().__init__() + self.lin = torch.nn.Linear(100, 10) + + def forward(self, x, y): + return torch.nn.functional.relu(self.lin(x + y), inplace=True) + +mod2 = MyModule2() +# Exported with example inputs whose first dimension is 8. +exported_mod2 = export(mod2, (torch.randn(8, 100), torch.randn(8, 100))) + +try: + # First dimension is 10 here, which differs from the example inputs. + exported_mod2(torch.randn(10, 100), torch.randn(10, 100)) +except Exception: + tb.print_exc() + +###################################################################### +# We can relax this constraint using the ``dynamic_shapes`` argument of +# ``torch.export.export()``, which allows us to specify, using ``torch.export.Dim`` +# (`documentation `__), +# which dimensions of the input tensors are dynamic. +# +# For each tensor argument of the input callable, we can specify a mapping from the dimension +# to a ``torch.export.Dim``. +# A ``torch.export.Dim`` is essentially a named symbolic integer with optional +# minimum and maximum bounds. 
+# +# Then, the format of ``torch.export.export()``'s ``dynamic_shapes`` argument is a mapping +# from the input callable's tensor argument names, to dimension --> dim mappings as described above. +# If there is no ``torch.export.Dim`` given to a tensor argument's dimension, then that dimension is +# assumed to be static. +# +# The first argument of ``torch.export.Dim`` is the name for the symbolic integer, used for debugging. +# Then we can specify an optional minimum and maximum bound (inclusive). Below, we show example usage. +# +# In the example below, our input +# ``inp1`` has an unconstrained first dimension, but the size of the second +# dimension must be in the interval [4, 18]. + +from torch.export import Dim + +inp1 = torch.randn(10, 10, 2) + +def dynamic_shapes_example1(x): + x = x[:, 2:] + return torch.relu(x) + +inp1_dim0 = Dim("inp1_dim0") +inp1_dim1 = Dim("inp1_dim1", min=4, max=18) +# Dimension 2 of ``x`` has no ``Dim`` entry, so it stays static. +dynamic_shapes1 = { + "x": {0: inp1_dim0, 1: inp1_dim1}, +} + +exported_dynamic_shapes_example1 = export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1) + +print(exported_dynamic_shapes_example1(torch.randn(5, 5, 2))) + +try: + # Dimension 1 has size 1, below the declared minimum of 4. + exported_dynamic_shapes_example1(torch.randn(8, 1, 2)) +except Exception: + tb.print_exc() + +try: + # Dimension 1 has size 20, above the declared maximum of 18. + exported_dynamic_shapes_example1(torch.randn(8, 20, 2)) +except Exception: + tb.print_exc() + +try: + # Dimension 2 has size 3, but the example input had size 2 there. + exported_dynamic_shapes_example1(torch.randn(8, 8, 3)) +except Exception: + tb.print_exc() + +###################################################################### +# Note that if our example inputs to ``torch.export`` do not satisfy the constraints +# given by ``dynamic_shapes``, then we get an error. 
+ +# The example input ``inp1`` has size 10 in dimension 1, which is below ``min=11``. +inp1_dim1_bad = Dim("inp1_dim1_bad", min=11, max=18) +dynamic_shapes1_bad = { + "x": {0: inp1_dim0, 1: inp1_dim1_bad}, +} + +try: + export(dynamic_shapes_example1, (inp1,), dynamic_shapes=dynamic_shapes1_bad) +except Exception: + tb.print_exc() + +###################################################################### +# We can enforce equalities between dimensions of different tensors +# by using the same ``torch.export.Dim`` object, for example, in matrix multiplication: + +inp2 = torch.randn(4, 8) +inp3 = torch.randn(8, 2) + +def dynamic_shapes_example2(x, y): + return x @ y + +inp2_dim0 = Dim("inp2_dim0") +inner_dim = Dim("inner_dim") +inp3_dim1 = Dim("inp3_dim1") + +# ``inner_dim`` is used for both x's dimension 1 and y's dimension 0. +dynamic_shapes2 = { + "x": {0: inp2_dim0, 1: inner_dim}, + "y": {0: inner_dim, 1: inp3_dim1}, +} + +exported_dynamic_shapes_example2 = export(dynamic_shapes_example2, (inp2, inp3), dynamic_shapes=dynamic_shapes2) + +print(exported_dynamic_shapes_example2(torch.randn(2, 16), torch.randn(16, 4))) + +try: + # x's dimension 1 is 8 while y's dimension 0 is 4, so the shared ``inner_dim`` does not match. + exported_dynamic_shapes_example2(torch.randn(4, 8), torch.randn(4, 2)) +except Exception: + tb.print_exc() + +###################################################################### +# We can actually use ``torch.export`` to guide us as to which ``dynamic_shapes`` constraints +# are necessary. We can do this by relaxing all constraints (recall that if we +# do not provide constraints for a dimension, the default behavior is to constrain +# to the exact shape value of the example input) and letting ``torch.export`` +# error out. 
+ +inp4 = torch.randn(8, 16) +inp5 = torch.randn(16, 32) + +def dynamic_shapes_example3(x, y): + # Branches on the size of x's first dimension. + if x.shape[0] <= 16: + return x @ y[:, :16] + return y + +# Give every dimension of both inputs its own, unbounded ``Dim``. +dynamic_shapes3 = { + "x": {i: Dim(f"inp4_dim{i}") for i in range(inp4.dim())}, + "y": {i: Dim(f"inp5_dim{i}") for i in range(inp5.dim())}, +} + +try: + export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3) +except Exception: + tb.print_exc() + +###################################################################### +# We can see that the error message gives us suggested fixes to our +# dynamic shape constraints. Let us follow those suggestions (exact +# suggestions may differ slightly): + +def suggested_fixes(): + # One ``Dim`` object reused below for x's dimension 1 and y's dimension 0. + inp4_dim1 = Dim('shared_dim') + # suggested fixes below + inp4_dim0 = Dim('inp4_dim0', max=16) + inp5_dim1 = Dim('inp5_dim1', min=17) + inp5_dim0 = inp4_dim1 + # end of suggested fixes + return { + "x": {0: inp4_dim0, 1: inp4_dim1}, + "y": {0: inp5_dim0, 1: inp5_dim1}, + } + +dynamic_shapes3_fixed = suggested_fixes() +exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed) +print(exported_dynamic_shapes_example3(torch.randn(4, 32), torch.randn(32, 64))) + +###################################################################### +# Note that in the example above, because we constrained the value of ``x.shape[0]`` in +# ``dynamic_shapes_example3``, the exported program is sound even though there is a +# raw ``if`` statement. +# +# If you want to see why ``torch.export`` generated these constraints, you can +# re-run the script with the environment variable ``TORCH_LOGS=dynamic,dynamo``, +# or use ``torch._logging.set_logs``. 
+
+import logging
+torch._logging.set_logs(dynamic=logging.INFO, dynamo=logging.INFO)
+exported_dynamic_shapes_example3 = export(dynamic_shapes_example3, (inp4, inp5), dynamic_shapes=dynamic_shapes3_fixed)
+
+# reset to previous values
+torch._logging.set_logs(dynamic=logging.WARNING, dynamo=logging.WARNING)
+
+######################################################################
+# We can view an ``ExportedProgram``'s constraints using the ``range_constraints`` and
+# ``equality_constraints`` attributes. The logging above reveals what the symbols ``s0, s1, ...``
+# represent.
+
+print(exported_dynamic_shapes_example3.range_constraints)
+print(exported_dynamic_shapes_example3.equality_constraints)
+
+######################################################################
+# Custom Ops
+# ----------
+#
+# ``torch.export`` can export PyTorch programs with custom operators.
+#
+#
+# Currently, the steps to register a custom op for use by ``torch.export`` are:
+#
+# - If you’re writing custom ops purely in Python, use ``torch.library.custom_op``.
+#   The operator schema is inferred from the function's type annotations, so
+#   every parameter and the return value must be annotated.
+
+import torch.library
+import numpy as np
+
+@torch.library.custom_op("mylib::sin", mutates_args=())
+def sin(x: torch.Tensor) -> torch.Tensor:
+    # NumPy does the actual work; ``Tensor.numpy()`` only works for CPU tensors.
+    x_np = x.numpy()
+    y_np = np.sin(x_np)
+    return torch.from_numpy(y_np)
+
+######################################################################
+# - You will need to provide an abstract ("fake") implementation so that PT2 can trace through it.
+
+@torch.library.register_fake("mylib::sin")
+def _(x):
+    # Only the metadata matters here: the output looks like the input.
+    return torch.empty_like(x)
+
+######################################################################
+# - Sometimes, the custom op you are exporting has data-dependent output, meaning
+#   we can't determine the shape of the output at compile time. In this case, you can do
+#   the following:
+
+@torch.library.custom_op("mylib::nonzero", mutates_args=())
+def nonzero(x: torch.Tensor) -> torch.Tensor:
+    # Move to the CPU for NumPy, then put the result back on the input's device.
+    x_np = x.cpu().numpy()
+    res = np.stack(np.nonzero(x_np), axis=1)
+    return torch.tensor(res, device=x.device)
+
+@torch.library.register_fake("mylib::nonzero")
+def _(x):
+    # The number of nonzero-elements is data-dependent.
+    # Since we cannot peek at the data in an abstract implementation,
+    # we use the `ctx` object to construct a new ``symint`` that
+    # represents the data-dependent size.
+    ctx = torch.library.get_ctx()
+    nnz = ctx.new_dynamic_size()
+    shape = [nnz, x.dim()]
+    result = x.new_empty(shape, dtype=torch.int64)
+    return result
+
+######################################################################
+# - Call the custom op from the code you want to export using ``torch.ops``
+
+def custom_op_example(x):
+    x = torch.sin(x)
+    x = torch.ops.mylib.sin(x)
+    x = torch.cos(x)
+    y = torch.ops.mylib.nonzero(x)
+    return x + y.sum()
+
+######################################################################
+# - Export the code as before
+
+exported_custom_op_example = export(custom_op_example, (torch.randn(3, 3),))
+exported_custom_op_example.graph_module.print_readable()
+print(exported_custom_op_example(torch.randn(3, 3)))
+
+######################################################################
+# Note in the above outputs that the custom ops are included in the exported graph.
+# When we call the exported graph as a function, the original custom op
+# implementations are the ones that run.
+#
+# If you have a custom operator implemented in C++, please refer to
+# `this document `__
+# to make it compatible with ``torch.export``.
+
+######################################################################
+# Decompositions
+# --------------
+#
+# By default, ``torch.export`` produces a graph containing
+# only functional ATen operators. This functional ATen operator set (or "opset") contains around 2000
+# operators, all of which are functional, that is, they do not
+# mutate or alias inputs.
You can find a list of all ATen operators +# `here `__ +# and you can inspect whether an operator is functional by checking +# ``op._schema.is_mutable``, for example: + +print(torch.ops.aten.add.Tensor._schema.is_mutable) +print(torch.ops.aten.add_.Tensor._schema.is_mutable) + +###################################################################### +# By default, the environment in which you want to run the exported graph +# should support all ~2000 of these operators. +# However, you can use the following API on the exported program +# if your specific environment is only able to support a subset of +# the ~2000 operators. +# +# .. code:: python +# +# def run_decompositions( +# self: ExportedProgram, +# decomposition_table: Optional[Dict[torch._ops.OperatorBase, Callable]] +# ) -> ExportedProgram +# +# ``run_decompositions`` takes in a decomposition table, which is a mapping of +# operators to a function specifying how to reduce, or decompose, that operator +# into an equivalent sequence of other ATen operators. +# +# The default decomposition table for ``run_decompositions`` is the +# `Core ATen decomposition table `__ +# which will decompose all ATen operators to the +# `Core ATen Operator Set `__ +# which consists of only ~180 operators. + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 4) + + def forward(self, x): + return self.linear(x) + +ep = export(M(), (torch.randn(2, 3),)) +print(ep.graph) + +core_ir_ep = ep.run_decompositions() +print(core_ir_ep.graph) + +###################################################################### +# Notice that after running ``run_decompositions`` the +# ``torch.ops.aten.t.default`` operator, which is not part of the Core ATen +# Opset, has been replaced with ``torch.ops.aten.permute.default`` which is part +# of the Core ATen Opset.
+ +###################################################################### +# Most ATen operators already have decompositions, which are located +# `here `__. +# If you would like to use some of these existing decomposition functions, +# you can pass in a list of operators you would like to decompose to the +# `get_decompositions `__ +# function, which will return a decomposition table using existing +# decomposition implementations. + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 4) + + def forward(self, x): + return self.linear(x) + +ep = export(M(), (torch.randn(2, 3),)) +print(ep.graph) + +from torch._decomp import get_decompositions +decomp_table = get_decompositions([torch.ops.aten.t.default, torch.ops.aten.transpose.int]) +core_ir_ep = ep.run_decompositions(decomp_table) +print(core_ir_ep.graph) + +###################################################################### +# If there is no existing decomposition function for an ATen operator that you would +# like to decompose, feel free to send a pull request to PyTorch +# implementing the decomposition! + +###################################################################### +# ExportDB +# -------- +# +# ``torch.export`` will only ever export a single computation graph from a PyTorch program. Because of this requirement, +# there will be Python or PyTorch features that are not compatible with ``torch.export``, which will require users to +# rewrite parts of their model code. We have seen examples of this earlier in the tutorial -- for example, rewriting +# if-statements using ``cond``. +# +# `ExportDB `__ is the standard reference that documents +# supported and unsupported Python/PyTorch features for ``torch.export``. It is essentially a list of program samples, each +# of which represents the usage of one particular Python/PyTorch feature and its interaction with ``torch.export``.
+# Examples are also tagged by category so that they can be more easily searched. +# +# For example, let's use ExportDB to get a better understanding of how the predicate works in the ``cond`` operator. +# We can look at the example called ``cond_predicate``, which has a ``torch.cond`` tag. The example code looks like: + +def cond_predicate(x): + """ + The conditional statement (aka predicate) passed to ``cond()`` must be one of the following: + - torch.Tensor with a single element + - boolean expression + NOTE: If the `pred` is tested on a dim with batch size < 2, it will be specialized. + """ + pred = x.dim() > 2 and x.shape[2] > 10 + return cond(pred, lambda x: x.cos(), lambda y: y.sin(), [x]) + +###################################################################### +# More generally, ExportDB can be used as a reference when one of the following occurs: +# +# 1. Before attempting ``torch.export``, you know ahead of time that your model uses some tricky Python/PyTorch features +# and you want to know if ``torch.export`` covers that feature. +# 2. When attempting ``torch.export``, there is a failure and it's unclear how to work around it. +# +# ExportDB is not exhaustive, but is intended to cover all use cases found in typical PyTorch code. Feel free to reach +# out if there is an important Python/PyTorch feature that should be added to ExportDB or supported by ``torch.export``. + +###################################################################### +# Conclusion +# ---------- +# +# We introduced ``torch.export``, the new PyTorch 2.X way to export single computation +# graphs from PyTorch programs. In particular, we demonstrate several code modifications +# and considerations (control flow ops, constraints, etc.) that need to be made in order to export a graph.
diff --git a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py index f16b170ee6a..ed581426c2e 100644 --- a/intermediate_source/autograd_saved_tensors_hooks_tutorial.py +++ b/intermediate_source/autograd_saved_tensors_hooks_tutorial.py @@ -397,7 +397,7 @@ def pack_hook(tensor): return name def unpack_hook(name): - return torch.load(name) + return torch.load(name, weights_only=True) ###################################################################### @@ -420,7 +420,7 @@ def pack_hook(tensor): return name def unpack_hook(name): - tensor = torch.load(name) + tensor = torch.load(name, weights_only=True) os.remove(name) return tensor @@ -462,7 +462,7 @@ def pack_hook(tensor): return temp_file def unpack_hook(temp_file): - return torch.load(temp_file.name) + return torch.load(temp_file.name, weights_only=True) ###################################################################### diff --git a/intermediate_source/ax_multiobjective_nas_tutorial.py b/intermediate_source/ax_multiobjective_nas_tutorial.py index 79b096b9e64..0f1ae21a556 100644 --- a/intermediate_source/ax_multiobjective_nas_tutorial.py +++ b/intermediate_source/ax_multiobjective_nas_tutorial.py @@ -232,21 +232,21 @@ def trainer( # we get the logic to read and parse the TensorBoard logs for free. # -from ax.metrics.tensorboard import TensorboardCurveMetric +from ax.metrics.tensorboard import TensorboardMetric +from tensorboard.backend.event_processing import plugin_event_multiplexer as event_multiplexer - -class MyTensorboardMetric(TensorboardCurveMetric): +class MyTensorboardMetric(TensorboardMetric): # NOTE: We need to tell the new TensorBoard metric how to get the id / # file handle for the TensorBoard logs from a trial. In this case # our convention is to just save a separate file per trial in # the prespecified log dir. 
- @classmethod - def get_ids_from_trials(cls, trials): - return { - trial.index: Path(log_dir).joinpath(str(trial.index)).as_posix() - for trial in trials - } + def _get_event_multiplexer_for_trial(self, trial): + mul = event_multiplexer.EventMultiplexer(max_reload_threads=20) + mul.AddRunsFromDirectory(Path(log_dir).joinpath(str(trial.index)).as_posix(), None) + mul.Reload() + + return mul # This indicates whether the metric is queryable while the trial is # still running. We don't use this in the current tutorial, but Ax @@ -266,12 +266,12 @@ def is_available_while_running(cls): val_acc = MyTensorboardMetric( name="val_acc", - curve_name="val_acc", + tag="val_acc", lower_is_better=False, ) model_num_params = MyTensorboardMetric( name="num_params", - curve_name="num_params", + tag="num_params", lower_is_better=True, ) diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index 0957b109b3a..67c3f04cbe3 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -4,13 +4,18 @@ ************************************************************** **Author**: `Sean Robertson `_ +This tutorial is part of a three-part series: + +* `NLP From Scratch: Classifying Names with a Character-Level RNN `__ +* `NLP From Scratch: Generating Names with a Character-Level RNN `__ +* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__ + We will be building and training a basic character-level Recurrent Neural Network (RNN) to classify words. This tutorial, along with two other Natural Language Processing (NLP) "from scratch" tutorials :doc:`/intermediate/char_rnn_generation_tutorial` and :doc:`/intermediate/seq2seq_translation_tutorial`, show how to -preprocess data to model NLP.
In particular these tutorials do not -use many of the convenience functions of `torchtext`, so you can see how +preprocess data to model NLP. In particular, these tutorials show how preprocessing to model NLP works at a low level. A character-level RNN reads words as a series of characters - @@ -20,20 +25,7 @@ Specifically, we'll train on a few thousand surnames from 18 languages of origin, and predict which language a name is from based on the -spelling: - -:: - - $ python predict.py Hinton - (-0.47) Scottish - (-1.52) English - (-3.57) Irish - - $ python predict.py Schmidhuber - (-0.19) German - (-2.48) Czech - (-2.68) Dutch - +spelling. Recommended Preparation ======================= @@ -56,79 +48,62 @@ Networks `__ is about LSTMs specifically but also informative about RNNs in general +""" +###################################################################### +# Preparing Torch +# ========================== +# +# Set up torch to default to the right device, using GPU acceleration when your hardware supports it (CPU or CUDA). +# -Preparing the Data -================== - -.. Note:: - Download the data from - `here `_ - and extract it to the current directory. - -Included in the ``data/names`` directory are 18 text files named as -``[Language].txt``. Each file contains a bunch of names, one name per -line, mostly romanized (but we still need to convert from Unicode to -ASCII). +import torch -We'll end up with a dictionary of lists of names per language, -``{language: [names ...]}``. The generic variables "category" and "line" -(for language and name in our case) are used for later extensibility.
-""" -from io import open -import glob -import os +# Check if CUDA is available +device = torch.device('cpu') +if torch.cuda.is_available(): + device = torch.device('cuda') -def findFiles(path): return glob.glob(path) +torch.set_default_device(device) +print(f"Using device = {torch.get_default_device()}") -print(findFiles('data/names/*.txt')) +###################################################################### +# Preparing the Data +# ================== +# +# Download the data from `here `__ +# and extract it to the current directory. +# +# Included in the ``data/names`` directory are 18 text files named as +# ``[Language].txt``. Each file contains a bunch of names, one name per +# line, mostly romanized (but we still need to convert from Unicode to +# ASCII). +# +# The first step is to define and clean our data. Initially, we need to convert Unicode to plain ASCII to +# limit the RNN input layers. This is accomplished by converting Unicode strings to ASCII and allowing only a small set of allowed characters. 
+import string import unicodedata -import string -all_letters = string.ascii_letters + " .,;'" -n_letters = len(all_letters) +allowed_characters = string.ascii_letters + " .,;'" +n_letters = len(allowed_characters) -# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 +# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 def unicodeToAscii(s): return ''.join( c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn' - and c in all_letters + and c in allowed_characters ) -print(unicodeToAscii('Ślusàrski')) - -# Build the category_lines dictionary, a list of names per language -category_lines = {} -all_categories = [] - -# Read a file and split into lines -def readLines(filename): - lines = open(filename, encoding='utf-8').read().strip().split('\n') - return [unicodeToAscii(line) for line in lines] - -for filename in findFiles('data/names/*.txt'): - category = os.path.splitext(os.path.basename(filename))[0] - all_categories.append(category) - lines = readLines(filename) - category_lines[category] = lines - -n_categories = len(all_categories) - - -###################################################################### -# Now we have ``category_lines``, a dictionary mapping each category -# (language) to a list of lines (names). We also kept track of -# ``all_categories`` (just a list of languages) and ``n_categories`` for -# later reference. +######################### +# Here's an example of converting a unicode alphabet name to plain ASCII. This simplifies the input layer # -print(category_lines['Italian'][:5]) - +print (f"converting 'Ślusàrski' to {unicodeToAscii('Ślusàrski')}") ###################################################################### # Turning Names into Tensors -# -------------------------- +# ========================== # # Now that we have all the names organized, we need to turn them into # Tensors to make any use of them. 
@@ -142,19 +117,10 @@ def readLines(filename): # # That extra 1 dimension is because PyTorch assumes everything is in # batches - we're just using a batch size of 1 here. -# - -import torch # Find letter index from all_letters, e.g. "a" = 0 def letterToIndex(letter): - return all_letters.find(letter) - -# Just for demonstration, turn a letter into a <1 x n_letters> Tensor -def letterToTensor(letter): - tensor = torch.zeros(1, n_letters) - tensor[0][letterToIndex(letter)] = 1 - return tensor + return allowed_characters.find(letter) # Turn a line into a , # or an array of one-hot letter vectors @@ -164,9 +130,87 @@ def lineToTensor(line): tensor[li][0][letterToIndex(letter)] = 1 return tensor -print(letterToTensor('J')) +######################### +# Here are some examples of how to use ``lineToTensor()`` for a single and multiple character string. -print(lineToTensor('Jones').size()) +print (f"The letter 'a' becomes {lineToTensor('a')}") #notice that the first position in the tensor = 1 +print (f"The name 'Ahn' becomes {lineToTensor('Ahn')}") #notice 'A' sets the 27th index to 1 + +######################### +# Congratulations, you have built the foundational tensor objects for this learning task! You can use a similar approach +# for other RNN tasks with text. +# +# Next, we need to combine all our examples into a dataset so we can train, test and validate our models. For this, +# we will use the `Dataset and DataLoader `__ classes +# to hold our dataset. Each Dataset needs to implement three functions: ``__init__``, ``__len__``, and ``__getitem__``. 
+from io import open
+import glob
+import os
+import time
+
+import torch
+from torch.utils.data import Dataset
+
+class NamesDataset(Dataset):
+    """Names labelled by the file they were read from.
+
+    Every ``*.txt`` file in ``data_dir`` is one class: the file's base name
+    (without extension) is the label and each line of the file is one example.
+    Names and labels are kept both as strings and as pre-computed tensors.
+    """
+
+    def __init__(self, data_dir):
+        self.data_dir = data_dir #for provenance of the dataset
+        self.load_time = time.localtime() #for provenance of the dataset
+        labels_set = set() #set of all classes
+
+        self.data = []
+        self.data_tensors = []
+        self.labels = []
+        self.labels_tensors = []
+
+        #read all the ``.txt`` files in the specified directory
+        #(sorted, so the example order does not depend on the file system)
+        text_files = sorted(glob.glob(os.path.join(data_dir, '*.txt')))
+        for filename in text_files:
+            label = os.path.splitext(os.path.basename(filename))[0]
+            labels_set.add(label)
+            #the context manager closes the file handle as soon as it is read
+            with open(filename, encoding='utf-8') as f:
+                lines = f.read().strip().split('\n')
+            for name in lines:
+                # NOTE(review): names are passed to ``lineToTensor`` as read from disk;
+                # confirm whether they should go through ``unicodeToAscii`` first.
+                self.data.append(name)
+                self.data_tensors.append(lineToTensor(name))
+                self.labels.append(label)
+
+        #Cache the tensor representation of the labels.
+        #Sorting gives every class the same index on every run (set order is not stable).
+        self.labels_uniq = sorted(labels_set)
+        label_to_index = {label: index for index, label in enumerate(self.labels_uniq)}
+        for label in self.labels:
+            temp_tensor = torch.tensor([label_to_index[label]], dtype=torch.long)
+            self.labels_tensors.append(temp_tensor)
+
+    def __len__(self):
+        """Number of (label, name) examples."""
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        """Return ``(label_tensor, data_tensor, label_string, name_string)`` for example ``idx``."""
+        data_item = self.data[idx]
+        data_label = self.labels[idx]
+        data_tensor = self.data_tensors[idx]
+        label_tensor = self.labels_tensors[idx]
+
+        return label_tensor, data_tensor, data_label, data_item
+
+
+#########################
+#Here we can load our example data into the ``NamesDataset``
+
+alldata = NamesDataset("data/names")
+print(f"loaded {len(alldata)} items of data")
+print(f"example = {alldata[0]}")
+
+#########################
+#Using the dataset object allows us to easily split the data into train and test sets. Here we create an 85/15
+# split, but ``torch.utils.data`` has more useful utilities. We pass an explicit generator because it must live on the
+#same device that we set as the PyTorch default above.
+ +train_set, test_set = torch.utils.data.random_split(alldata, [.85, .15], generator=torch.Generator(device=device).manual_seed(2024)) + +print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}") + +######################### +# Now we have a basic dataset containing **20074** examples where each example is a pairing of label and name. We have also +#split the dataset into training and testing so we can validate the model that we build. ###################################################################### @@ -178,115 +222,58 @@ def lineToTensor(line): # held hidden state and gradients which are now entirely handled by the # graph itself. This means you can implement a RNN in a very "pure" way, # as regular feed-forward layers. -# -# This RNN module (mostly copied from `the PyTorch for Torch users -# tutorial `__) -# is just 2 linear layers which operate on an input and hidden state, with -# a ``LogSoftmax`` layer after the output. +# +# This CharRNN class implements an RNN with three components. +# First, we use the `nn.RNN implementation `__. +# Next, we define a layer that maps the RNN hidden layers to our output. And finally, we apply a ``softmax`` function. Using ``nn.RNN`` +# leads to a significant improvement in performance, such as cuDNN-accelerated kernels, versus implementing +# each layer as a ``nn.Linear``. It also simplifies the implementation in ``forward()``. 
# import torch.nn as nn +import torch.nn.functional as F -class RNN(nn.Module): +class CharRNN(nn.Module): def __init__(self, input_size, hidden_size, output_size): - super(RNN, self).__init__() - - self.hidden_size = hidden_size + super(CharRNN, self).__init__() - self.i2h = nn.Linear(input_size + hidden_size, hidden_size) + self.rnn = nn.RNN(input_size, hidden_size) self.h2o = nn.Linear(hidden_size, output_size) self.softmax = nn.LogSoftmax(dim=1) - - def forward(self, input, hidden): - combined = torch.cat((input, hidden), 1) - hidden = self.i2h(combined) - output = self.h2o(hidden) + + def forward(self, line_tensor): + rnn_out, hidden = self.rnn(line_tensor) + output = self.h2o(hidden[0]) output = self.softmax(output) - return output, hidden - - def initHidden(self): - return torch.zeros(1, self.hidden_size) -n_hidden = 128 -rnn = RNN(n_letters, n_hidden, n_categories) + return output -###################################################################### -# To run a step of this network we need to pass an input (in our case, the -# Tensor for the current letter) and a previous hidden state (which we -# initialize as zeros at first). We'll get back the output (probability of -# each language) and a next hidden state (which we keep for the next -# step). -# - -input = letterToTensor('A') -hidden = torch.zeros(1, n_hidden) - -output, next_hidden = rnn(input, hidden) +########################### +# We can then create an RNN with 57 input nodes, 128 hidden nodes, and 18 outputs: +n_hidden = 128 +rnn = CharRNN(n_letters, n_hidden, len(alldata.labels_uniq)) +print(rnn) ###################################################################### -# For the sake of efficiency we don't want to be creating a new Tensor for -# every step, so we will use ``lineToTensor`` instead of -# ``letterToTensor`` and use slices. This could be further optimized by -# precomputing batches of Tensors. -# +# After that we can pass our Tensor to the RNN to obtain a predicted output. 
Subsequently, +# we use a helper function, ``label_from_output``, to derive a text label for the class. -input = lineToTensor('Albert') -hidden = torch.zeros(1, n_hidden) - -output, next_hidden = rnn(input[0], hidden) -print(output) - - -###################################################################### -# As you can see the output is a ``<1 x n_categories>`` Tensor, where -# every item is the likelihood of that category (higher is more likely). -# +def label_from_output(output, output_labels): + top_n, top_i = output.topk(1) + label_i = top_i[0].item() + return output_labels[label_i], label_i +input = lineToTensor('Albert') +output = rnn(input) #this is equivalent to ``output = rnn.forward(input)`` +print(output) +print(label_from_output(output, alldata.labels_uniq)) ###################################################################### # # Training # ======== -# Preparing for Training -# ---------------------- -# -# Before going into training we should make a few helper functions. The -# first is to interpret the output of the network, which we know to be a -# likelihood of each category. 
We can use ``Tensor.topk`` to get the index -# of the greatest value: -# - -def categoryFromOutput(output): - top_n, top_i = output.topk(1) - category_i = top_i[0].item() - return all_categories[category_i], category_i - -print(categoryFromOutput(output)) - - -###################################################################### -# We will also want a quick way to get a training example (a name and its -# language): -# - -import random - -def randomChoice(l): - return l[random.randint(0, len(l) - 1)] - -def randomTrainingExample(): - category = randomChoice(all_categories) - line = randomChoice(category_lines[category]) - category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long) - line_tensor = lineToTensor(line) - return category, line, category_tensor, line_tensor - -for i in range(10): - category, line, category_tensor, line_tensor = randomTrainingExample() - print('category =', category, '/ line =', line) ###################################################################### @@ -296,93 +283,67 @@ def randomTrainingExample(): # Now all it takes to train this network is show it a bunch of examples, # have it make guesses, and tell it if it's wrong. # -# For the loss function ``nn.NLLLoss`` is appropriate, since the last -# layer of the RNN is ``nn.LogSoftmax``. -# - -criterion = nn.NLLLoss() - - -###################################################################### -# Each loop of training will: -# -# - Create input and target tensors -# - Create a zeroed initial hidden state -# - Read each letter in and -# -# - Keep hidden state for next letter -# -# - Compare final output to target -# - Back-propagate -# - Return the output and loss -# - -learning_rate = 0.005 # If you set this too high, it might explode. 
If too low, it might not learn - -def train(category_tensor, line_tensor): - hidden = rnn.initHidden() - - rnn.zero_grad() - - for i in range(line_tensor.size()[0]): - output, hidden = rnn(line_tensor[i], hidden) - - loss = criterion(output, category_tensor) - loss.backward() - - # Add parameters' gradients to their values, multiplied by learning rate - for p in rnn.parameters(): - p.data.add_(p.grad.data, alpha=-learning_rate) - - return output, loss.item() - - -###################################################################### -# Now we just have to run that with a bunch of examples. Since the -# ``train`` function returns both the output and loss we can print its -# guesses and also keep track of loss for plotting. Since there are 1000s -# of examples we print only every ``print_every`` examples, and take an -# average of the loss. -# - -import time -import math - -n_iters = 100000 -print_every = 5000 -plot_every = 1000 - - - -# Keep track of losses for plotting -current_loss = 0 -all_losses = [] +# We do this by defining a ``train()`` function which trains the model on a given dataset using minibatches. +# RNNs are trained similarly to other networks; therefore, for completeness, we include a batched training method here. +# The loop (``for i in batch``) computes the losses for each of the items in the batch before adjusting the +# weights. This operation is repeated until the number of epochs is reached.
+
+import random
+import numpy as np
+
+def train(rnn, training_data, n_epoch = 10, n_batch_size = 64, report_every = 50, learning_rate = 0.2, criterion = nn.NLLLoss()):
+    """
+    Train ``rnn`` on ``training_data`` with minibatch SGD and return the per-epoch losses.
+
+    Args:
+        rnn: the module to train; it is called with one ``text_tensor`` at a time.
+        training_data: indexable collection whose items unpack as
+            ``(label_tensor, text_tensor, label, text)``.
+        n_epoch: number of passes over ``training_data``.
+        n_batch_size: target number of examples per minibatch.
+        report_every: print the epoch loss every ``report_every`` epochs.
+        learning_rate: learning rate given to ``torch.optim.SGD``.
+        criterion: loss applied to ``(output, label_tensor)``.
+
+    Returns:
+        A list with one entry per epoch: the mean, over that epoch's batches, of the
+        batch loss divided by the batch length.
+
+    Raises:
+        ValueError: if ``training_data`` is empty.
+    """
+    n_examples = len(training_data)
+    if n_examples == 0:
+        raise ValueError("training_data is empty")
+
+    # Keep track of losses for plotting
+    current_loss = 0
+    all_losses = []
+    rnn.train()
+    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
+
+    print(f"training on data set with n = {len(training_data)}")
+
+    # always at least one batch, so data sets smaller than ``n_batch_size`` still train
+    n_batches = max(1, n_examples // n_batch_size)
+
+    for epoch in range(1, n_epoch + 1):
+        rnn.zero_grad() # clear the gradients
+
+        # create some minibatches
+        # we cannot use dataloaders because each of our names is a different length
+        indices = list(range(n_examples))
+        random.shuffle(indices)
+        batches = np.array_split(indices, n_batches)
+
+        for batch in batches:
+            batch_loss = 0
+            for i in batch: #for each example in this batch
+                (label_tensor, text_tensor, label, text) = training_data[i]
+                output = rnn(text_tensor) # call the module rather than ``forward`` so hooks run
+                loss = criterion(output, label_tensor)
+                batch_loss += loss
+
+            # optimize parameters
+            batch_loss.backward()
+            nn.utils.clip_grad_norm_(rnn.parameters(), 3)
+            optimizer.step()
+            optimizer.zero_grad()
+
+            current_loss += batch_loss.item() / len(batch)
+
+        all_losses.append(current_loss / len(batches) )
+        if epoch % report_every == 0:
+            print(f"{epoch} ({epoch / n_epoch:.0%}): \t average batch loss = {all_losses[-1]}")
+        current_loss = 0
+
+    return all_losses
-def timeSince(since):
-    now = time.time()
-    s = now - since
-    m = math.floor(s / 60)
-    s -= m * 60
-    return '%dm %ds' % (m, s)
+##########################################################################
+# We can now train the model on a dataset with minibatches for a specified number of epochs. The number of epochs for this
+# example is reduced to speed up the build. You can get better results with different parameters.
start = time.time() - -for iter in range(1, n_iters + 1): - category, line, category_tensor, line_tensor = randomTrainingExample() - output, loss = train(category_tensor, line_tensor) - current_loss += loss - - # Print ``iter`` number, loss, name and guess - if iter % print_every == 0: - guess, guess_i = categoryFromOutput(output) - correct = '✓' if guess == category else '✗ (%s)' % category - print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct)) - - # Add current loss avg to list of losses - if iter % plot_every == 0: - all_losses.append(current_loss / plot_every) - current_loss = 0 - +all_losses = train(rnn, train_set, n_epoch=27, learning_rate=0.15, report_every=5) +end = time.time() +print(f"training took {end-start}s") ###################################################################### # Plotting the Results @@ -397,7 +358,7 @@ def timeSince(since): plt.figure() plt.plot(all_losses) - +plt.show() ###################################################################### # Evaluating the Results @@ -410,48 +371,45 @@ def timeSince(since): # ``evaluate()``, which is the same as ``train()`` minus the backprop. 
# -# Keep track of correct guesses in a confusion matrix -confusion = torch.zeros(n_categories, n_categories) -n_confusion = 10000 +def evaluate(rnn, testing_data, classes): + confusion = torch.zeros(len(classes), len(classes)) + + rnn.eval() #set to eval mode + with torch.no_grad(): # do not record the gradients during eval phase + for i in range(len(testing_data)): + (label_tensor, text_tensor, label, text) = testing_data[i] + output = rnn(text_tensor) + guess, guess_i = label_from_output(output, classes) + label_i = classes.index(label) + confusion[label_i][guess_i] += 1 -# Just return an output given a line -def evaluate(line_tensor): - hidden = rnn.initHidden() + # Normalize by dividing every row by its sum + for i in range(len(classes)): + denom = confusion[i].sum() + if denom > 0: + confusion[i] = confusion[i] / denom - for i in range(line_tensor.size()[0]): - output, hidden = rnn(line_tensor[i], hidden) + # Set up plot + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(confusion.cpu().numpy()) #numpy uses cpu here so we need to use a cpu version + fig.colorbar(cax) - return output + # Set up axes + ax.set_xticks(np.arange(len(classes)), labels=classes, rotation=90) + ax.set_yticks(np.arange(len(classes)), labels=classes) -# Go through a bunch of examples and record which are correctly guessed -for i in range(n_confusion): - category, line, category_tensor, line_tensor = randomTrainingExample() - output = evaluate(line_tensor) - guess, guess_i = categoryFromOutput(output) - category_i = all_categories.index(category) - confusion[category_i][guess_i] += 1 + # Force label at every tick + ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) + ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) -# Normalize by dividing every row by its sum -for i in range(n_categories): - confusion[i] = confusion[i] / confusion[i].sum() + # sphinx_gallery_thumbnail_number = 2 + plt.show() -# Set up plot -fig = plt.figure() -ax = fig.add_subplot(111) -cax = 
ax.matshow(confusion.numpy()) -fig.colorbar(cax) -# Set up axes -ax.set_xticklabels([''] + all_categories, rotation=90) -ax.set_yticklabels([''] + all_categories) - -# Force label at every tick -ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) -ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) - -# sphinx_gallery_thumbnail_number = 2 -plt.show() +evaluate(rnn, test_set, classes=alldata.labels_uniq) + ###################################################################### # You can pick out bright spots off the main axis that show which @@ -461,72 +419,20 @@ def evaluate(line_tensor): # -###################################################################### -# Running on User Input -# --------------------- -# - -def predict(input_line, n_predictions=3): - print('\n> %s' % input_line) - with torch.no_grad(): - output = evaluate(lineToTensor(input_line)) - - # Get top N categories - topv, topi = output.topk(n_predictions, 1, True) - predictions = [] - - for i in range(n_predictions): - value = topv[0][i].item() - category_index = topi[0][i].item() - print('(%.2f) %s' % (value, all_categories[category_index])) - predictions.append([value, all_categories[category_index]]) - -predict('Dovesky') -predict('Jackson') -predict('Satoshi') - - -###################################################################### -# The final versions of the scripts `in the Practical PyTorch -# repo `__ -# split the above code into a few files: -# -# - ``data.py`` (loads files) -# - ``model.py`` (defines the RNN) -# - ``train.py`` (runs training) -# - ``predict.py`` (runs ``predict()`` with command line arguments) -# - ``server.py`` (serve prediction as a JSON API with ``bottle.py``) -# -# Run ``train.py`` to train and save the network. 
-# -# Run ``predict.py`` with a name to view predictions: -# -# :: -# -# $ python predict.py Hazaki -# (-0.42) Japanese -# (-1.39) Polish -# (-3.51) Czech -# -# Run ``server.py`` and visit http://localhost:5533/Yourname to get JSON -# output of predictions. -# - - ###################################################################### # Exercises # ========= # -# - Try with a different dataset of line -> category, for example: -# -# - Any word -> language -# - First name -> gender -# - Character name -> writer -# - Page title -> blog or subreddit -# # - Get better results with a bigger and/or better shaped network # -# - Add more linear layers +# - Adjust the hyperparameters to enhance performance, such as changing the number of epochs, batch size, and learning rate # - Try the ``nn.LSTM`` and ``nn.GRU`` layers +# - Modify the size of the layers, such as increasing or decreasing the number of hidden nodes or adding additional linear layers # - Combine multiple of these RNNs as a higher level network +# +# - Try with a different dataset of line -> label, for example: # +# - Any word -> language +# - First name -> gender +# - Character name -> writer +# - Page title -> blog or subreddit \ No newline at end of file diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py index 114c3f3f572..50a6afa11b7 100644 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ b/intermediate_source/char_rnn_generation_tutorial.py @@ -4,12 +4,18 @@ ************************************************************* **Author**: `Sean Robertson `_ +This tutorial is part of a three-part series: + +* `NLP From Scratch: Classifying Names with a Character-Level RNN `__ +* `NLP From Scratch: Generating Names with a Character-Level RNN `__ +* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__ + This is our second of three tutorials on "NLP From Scratch". 
-In the `first tutorial `_ +In the `first tutorial `_ we used a RNN to classify names into their language of origin. This time we'll turn around and generate names from languages. -:: +.. code-block:: sh > python sample.py Russian RUS Rovakov @@ -64,7 +70,7 @@ Preparing the Data ================== -.. Note:: +.. note:: Download the data from `here `_ and extract it to the current directory. @@ -370,7 +376,7 @@ def timeSince(since): # # - Return the final name # -# .. Note:: +# .. note:: # Rather than having to give it a starting letter, another # strategy would have been to include a "start of string" token in # training and have the network choose its own starting letter. diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst new file mode 100644 index 00000000000..1091b19a49e --- /dev/null +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -0,0 +1,221 @@ +Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` +========================================================================== +**Author:** `Simon Fan `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How compiled autograd interacts with ``torch.compile`` + * How to use the compiled autograd API + * How to inspect logs using ``TORCH_LOGS`` + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 + * Complete the `Introduction to torch.compile `_ + * Read through the TorchDynamo and AOTAutograd sections of `Get Started with PyTorch 2.x `_ + +Overview +-------- +Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 +that allows the capture of a larger backward graph. + +While ``torch.compile`` does capture the backward graph, it does so **partially**. 
The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: + +* Graph breaks in the forward lead to graph breaks in the backward +* `Backward hooks `_ are not captured + +Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing +it to capture the full backward graph at runtime. Models with these two characteristics should try +Compiled Autograd, and potentially observe better performance. + +However, Compiled Autograd introduces its own limitations: + +* Added runtime overhead at the start of the backward for cache lookup +* More prone to recompiles and graph breaks in dynamo due to the larger capture + +.. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. + +Setup +----- +In this tutorial, we will base our examples on this simple neural network model. +It takes a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. + +.. code:: python + + import torch + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + +Basic usage +------------ +Before calling the ``torch.compile`` API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: + +.. code:: python + + model = Model() + x = torch.randn(10) + + torch._dynamo.config.compiled_autograd = True + @torch.compile + def train(model, x): + loss = model(x).sum() + loss.backward() + + train(model, x) + +In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using ``torch.randn(10)``. +We define the training loop function ``train`` and decorate it with ``@torch.compile`` to optimize its execution. 
+When ``train(model, x)`` is called: + +* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile``. +* Dynamo intercepts the Python bytecode, simulates their execution and records the operations into a graph. +* ``AOTDispatcher`` disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward. +* Dynamo sets the optimized function to be evaluated next by Python Interpreter. +* Python Interpreter executes the optimized function, which executes ``loss = model(x).sum()``. +* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we set ``torch._dynamo.config.compiled_autograd = True``. +* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this process, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully-traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode. +* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher will not need to partition the graph. 
+ +Inspecting the compiled autograd logs +------------------------------------- +Run the script with the ``TORCH_LOGS`` environment variable: + +* To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +* To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` + +Rerun the snippet above; the compiled autograd graph should now be logged to ``stderr``. Certain graph nodes will have names that are prefixed by ``aot0_``; +these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0. For example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. + +In the image below, the red box encapsulates the AOT backward graph that is captured by ``torch.compile`` without Compiled Autograd. + + +.. image:: ../_static/img/compiled_autograd/entire_verbose_log.png + +.. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. + +Compiling the forward and backward pass using different flags +------------------------------------------------------------- +You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. + +.. code:: python + + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True + torch.compile(lambda: loss.backward(), fullgraph=True)() + +Or you can use the context manager, which will apply to all autograd calls within its scope. + +.. 
code:: python + + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): + loss.backward() + + +Compiled Autograd addresses certain limitations of AOTAutograd +-------------------------------------------------------------- +1. Graph breaks in the forward pass no longer necessarily lead to graph breaks in the backward pass: + +.. code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + # 1st graph + temp = x + 10 + torch._dynamo.graph_break() + # 2nd graph + temp = temp + 10 + torch._dynamo.graph_break() + # 3rd graph + return temp.sum() + + x = torch.randn(10, 10, requires_grad=True) + torch._dynamo.utils.counters.clear() + loss = fn(x) + + # 1. base torch.compile + loss.backward(retain_graph=True) + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) + torch._dynamo.utils.counters.clear() + + # 2. torch.compile with compiled autograd + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + + # single graph for the backward + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) + + +In the first ``torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +In the second ``torch.compile`` with compiled autograd case, however, we see that a full backward graph was traced despite the graph breaks. + +.. note:: It is still possible for Dynamo to graph break when tracing backward hooks captured by Compiled Autograd. + + +2. Backward hooks can now be captured + +.. 
code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + return x.sum() + + x = torch.randn(10, 10, requires_grad=True) + x.register_hook(lambda grad: grad+10) + loss = fn(x) + + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +There should be a ``call_hook`` node in the graph, which Dynamo will later inline into the following: + +.. image:: ../_static/img/compiled_autograd/call_hook_node.png + +Common recompilation reasons for Compiled Autograd +-------------------------------------------------- +1. Due to changes in the autograd structure of the loss value: + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + x = torch.randn(10, requires_grad=True) + for op in [torch.add, torch.sub, torch.mul, torch.div]: + loss = op(x, x).sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +In the example above, we call a different operator on each iteration, leading to ``loss`` tracking a different autograd history each time. You should see some recompile messages: **Cache miss due to new autograd node**. + +.. image:: ../_static/img/compiled_autograd/recompile_due_to_node.png + +2. Due to tensors changing shapes: + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + for i in [10, 100, 10]: + x = torch.randn(i, i, requires_grad=True) + loss = x.sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +In the example above, ``x`` changes shapes, and compiled autograd will mark ``x`` as a dynamic shape tensor after the first change. You should see recompile messages: **Cache miss due to changed shapes**. + +.. image:: ../_static/img/compiled_autograd/recompile_due_to_dynamic.png + +Conclusion +---------- +In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. Stay tuned for deep dives on `dev-discuss `_. 
diff --git a/intermediate_source/ddp_series_minGPT.rst b/intermediate_source/ddp_series_minGPT.rst index 1d1f809e434..743568ae18b 100644 --- a/intermediate_source/ddp_series_minGPT.rst +++ b/intermediate_source/ddp_series_minGPT.rst @@ -6,11 +6,12 @@ training `__ \|\| **minGPT Training** Training “real-world” models with DDP ===================================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites - Best practices when writing a distributed training script - Increased flexibility with saving/loading artifacts in the cloud @@ -23,6 +24,7 @@ Authors: `Suraj Subramanian `__ :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub `__ .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites - Familiarity with `multi-GPU training <../beginner/ddp_series_multigpu.html>`__ and `torchrun <../beginner/ddp_series_fault_tolerance.html>`__ - [Optional] Familiarity with `multinode training `__ diff --git a/intermediate_source/ddp_series_multinode.rst b/intermediate_source/ddp_series_multinode.rst index 721c5580f6c..8746eb19bbd 100644 --- a/intermediate_source/ddp_series_multinode.rst +++ b/intermediate_source/ddp_series_multinode.rst @@ -6,11 +6,12 @@ training** \|\| `minGPT Training `__ Multinode Training ================== -Authors: `Suraj Subramanian `__ +Authors: `Suraj Subramanian `__ .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites - Launching multinode training jobs with ``torchrun`` - Code changes (and things to keep in mind) when moving from single-node to multinode training. @@ -22,6 +23,7 @@ Authors: `Suraj Subramanian `__ :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub `__ .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites - Familiarity with `multi-GPU training <../beginner/ddp_series_multigpu.html>`__ and `torchrun <../beginner/ddp_series_fault_tolerance.html>`__ - 2 or more TCP-reachable GPU machines (this tutorial uses AWS p3.2xlarge instances) diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst index 1553cf1ac29..c91c195d6f0 100644 --- a/intermediate_source/ddp_tutorial.rst +++ b/intermediate_source/ddp_tutorial.rst @@ -2,7 +2,7 @@ Getting Started with Distributed Data Parallel ================================================= **Author**: `Shen Li `_ -**Edited by**: `Joe Zhu `_ +**Edited by**: `Joe Zhu `_, `Chirag Pandya `__ .. note:: |edit| View and edit this tutorial in `github `__. @@ -15,24 +15,30 @@ Prerequisites: `DistributedDataParallel `__ -(DDP) implements data parallelism at the module level which can run across -multiple machines. Applications using DDP should spawn multiple processes and -create a single DDP instance per process. DDP uses collective communications in the +(DDP) is a powerful module in PyTorch that allows you to parallelize your model across +multiple machines, making it perfect for large-scale deep learning applications. +To use DDP, you'll need to spawn multiple processes and create a single instance of DDP per process. + +But how does it work? DDP uses collective communications from the `torch.distributed `__ -package to synchronize gradients and buffers. More specifically, DDP registers -an autograd hook for each parameter given by ``model.parameters()`` and the -hook will fire when the corresponding gradient is computed in the backward -pass. Then DDP uses that signal to trigger gradient synchronization across -processes. Please refer to -`DDP design note `__ for more details. +package to synchronize gradients and buffers across all processes. 
This means that each process will have +its own copy of the model, but they'll all work together to train the model as if it were on a single machine. + +To make this happen, DDP registers an autograd hook for each parameter in the model. +When the backward pass is run, this hook fires and triggers gradient synchronization across all processes. +This ensures that each process has the same gradients, which are then used to update the model. + +For more information on how DDP works and how to use it effectively, be sure to check out the +`DDP design note `__. +With DDP, you can train your models faster and more efficiently than ever before! + +The recommended way to use DDP is to spawn one process for each model replica. The model replica can span +multiple devices. DDP processes can be placed on the same machine or across machines. Note that GPU devices +cannot be shared across DDP processes (i.e. one GPU for one DDP process). -The recommended way to use DDP is to spawn one process for each model replica, -where a model replica can span multiple devices. DDP processes can be -placed on the same machine or across machines, but GPU devices cannot be -shared across processes. This tutorial starts from a basic DDP use case and -then demonstrates more advanced use cases including checkpointing models and -combining DDP with model parallel. +In this tutorial, we'll start with a basic DDP use case and then demonstrate more advanced use cases, +including checkpointing models and combining DDP with model parallel. .. note:: @@ -43,25 +49,22 @@ combining DDP with model parallel. 
Comparison between ``DataParallel`` and ``DistributedDataParallel`` ------------------------------------------------------------------- -Before we dive in, let's clarify why, despite the added complexity, you would -consider using ``DistributedDataParallel`` over ``DataParallel``: +Before we dive in, let's clarify why you would consider using ``DistributedDataParallel`` +over ``DataParallel``, despite its added complexity: -- First, ``DataParallel`` is single-process, multi-thread, and only works on a - single machine, while ``DistributedDataParallel`` is multi-process and works - for both single- and multi- machine training. ``DataParallel`` is usually - slower than ``DistributedDataParallel`` even on a single machine due to GIL - contention across threads, per-iteration replicated model, and additional - overhead introduced by scattering inputs and gathering outputs. +- First, ``DataParallel`` is single-process, multi-threaded, but it only works on a + single machine. In contrast, ``DistributedDataParallel`` is multi-process and supports + both single- and multi- machine training. + Due to GIL contention across threads, per-iteration replicated model, and additional overhead introduced by + scattering inputs and gathering outputs, ``DataParallel`` is usually + slower than ``DistributedDataParallel`` even on a single machine. - Recall from the `prior tutorial `__ that if your model is too large to fit on a single GPU, you must use **model parallel** to split it across multiple GPUs. ``DistributedDataParallel`` works with - **model parallel**; ``DataParallel`` does not at this time. When DDP is combined + **model parallel**, while ``DataParallel`` does not at this time. When DDP is combined with model parallel, each DDP process would use model parallel, and all processes collectively would use data parallel. 
-- If your model needs to span multiple machines or if your use case does not fit - into data parallelism paradigm, please see `the RPC API `__ - for more generic distributed training support. Basic Use Case -------------- @@ -141,6 +144,7 @@ different DDP processes starting from different initial model parameter values. optimizer.step() cleanup() + print(f"Finished running basic DDP example on rank {rank}.") def run_demo(demo_fn, world_size): @@ -154,7 +158,7 @@ provides a clean API as if it were a local model. Gradient synchronization communications take place during the backward pass and overlap with the backward computation. When the ``backward()`` returns, ``param.grad`` already contains the synchronized gradient tensor. For basic use cases, DDP only -requires a few more LoCs to set up the process group. When applying DDP to more +requires a few more lines of code to set up the process group. When applying DDP to more advanced use cases, some caveats require caution. Skewed Processing Speeds @@ -179,13 +183,14 @@ It's common to use ``torch.save`` and ``torch.load`` to checkpoint modules during training and recover from checkpoints. See `SAVING AND LOADING MODELS `__ for more details. When using DDP, one optimization is to save the model in -only one process and then load it to all processes, reducing write overhead. -This is correct because all processes start from the same parameters and +only one process and then load it on all processes, reducing write overhead. +This works because all processes start from the same parameters and gradients are synchronized in backward passes, and hence optimizers should keep -setting parameters to the same values. If you use this optimization, make sure no process starts +setting parameters to the same values. +If you use this optimization (i.e. save on one process but restore on all), make sure no process starts loading before the saving is finished. 
Additionally, when loading the module, you need to provide an appropriate ``map_location`` -argument to prevent a process from stepping into others' devices. If ``map_location`` +argument to prevent processes from stepping into others' devices. If ``map_location`` is missing, ``torch.load`` will first load the module to CPU and then copy each parameter to where it was saved, which would result in all processes on the same machine using the same set of devices. For more advanced failure recovery @@ -214,11 +219,11 @@ and elasticity support, please refer to `TorchElastic `__ command +One can then run a `torch elastic/torchrun `__ command on all nodes to initialize the DDP job created above: .. code:: bash torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:29400 elastic_ddp.py -We are running the DDP script on two hosts, and each host we run with 8 processes, aka, we -are running it on 16 GPUs. Note that ``$MASTER_ADDR`` must be the same across all nodes. +In the example above, we are running the DDP script on two hosts and we run with 8 processes on each host. That is, we +are running this job on 16 GPUs. Note that ``$MASTER_ADDR`` must be the same across all nodes. -Here torchrun will launch 8 process and invoke ``elastic_ddp.py`` -on each process on the node it is launched on, but user also needs to apply cluster +Here ``torchrun`` will launch 8 processes and invoke ``elastic_ddp.py`` +on each process on the node it is launched on, but user also needs to apply cluster management tools like slurm to actually run this command on 2 nodes. For example, on a SLURM enabled cluster, we can write a script to run the command above @@ -368,8 +375,8 @@ and set ``MASTER_ADDR`` as: Then we can just run this script using the SLURM command: ``srun --nodes=2 ./torchrun_script.sh``. -Of course, this is just an example; you can choose your own cluster scheduling tools -to initiate the torchrun job. 
-For more information about Elastic run, one can check this -`quick start document `__ to learn more. +This is just an example; you can choose your own cluster scheduling tools to initiate the ``torchrun`` job. + +For more information about Elastic run, please see the +`quick start document `__. diff --git a/intermediate_source/dist_pipeline_parallel_tutorial.rst b/intermediate_source/dist_pipeline_parallel_tutorial.rst index ecc64c2e1f0..ec3e3cf304a 100644 --- a/intermediate_source/dist_pipeline_parallel_tutorial.rst +++ b/intermediate_source/dist_pipeline_parallel_tutorial.rst @@ -1,353 +1,10 @@ Distributed Pipeline Parallelism Using RPC ========================================== -**Author**: `Shen Li `_ -.. note:: - |edit| View and edit this tutorial in `github `__. +This tutorial has been deprecated. -Prerequisites: - -- `PyTorch Distributed Overview <../beginner/dist_overview.html>`__ -- `Single-Machine Model Parallel Best Practices `__ -- `Getting started with Distributed RPC Framework `__ -- RRef helper functions: - `RRef.rpc_sync() `__, - `RRef.rpc_async() `__, and - `RRef.remote() `__ - - - -This tutorial uses a Resnet50 model to demonstrate implementing distributed -pipeline parallelism with `torch.distributed.rpc `__ -APIs. This can be viewed as the distributed counterpart of the multi-GPU -pipeline parallelism discussed in -`Single-Machine Model Parallel Best Practices `_. - -.. note:: This tutorial requires PyTorch v1.6.0 or above. - -.. note:: Full source code of this tutorial can be found at - `pytorch/examples `__. - -Basics ------- - - -The previous tutorial, `Getting Started with Distributed RPC Framework `_ -shows how to use `torch.distributed.rpc `_ -to implement distributed model parallelism for an RNN model. That tutorial uses -one GPU to host the ``EmbeddingTable``, and the provided code works fine. -However, if a model lives on multiple GPUs, it would require some extra steps to -increase the amortized utilization of all GPUs. 
Pipeline parallelism is one type -of paradigm that can help in this case. - -In this tutorial, we use ``ResNet50`` as an example model which is also used by -the `Single-Machine Model Parallel Best Practices `_ -tutorial. Similarly, the ``ResNet50`` model is divided into two shards and -the input batch is partitioned into multiple splits and fed into the two model -shards in a pipelined fashion. The difference is that, instead of parallelizing -the execution using CUDA streams, this tutorial invokes asynchronous RPCs. So, -the solution presented in this tutorial also works across machine boundaries. -The remainder of this tutorial presents the implementation in four steps. - - - -Step 1: Partition ResNet50 Model --------------------------------- - -This is the preparation step which implements ``ResNet50`` in two model shards. -The code below is borrowed from the -`ResNet implementation in torchvision `_. -The ``ResNetBase`` module contains the common building blocks and attributes for -the two ResNet shards. - - -.. 
code:: python - - import threading - - import torch - import torch.nn as nn - - from torchvision.models.resnet import Bottleneck - - num_classes = 1000 - - - def conv1x1(in_planes, out_planes, stride=1): - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - - class ResNetBase(nn.Module): - def __init__(self, block, inplanes, num_classes=1000, - groups=1, width_per_group=64, norm_layer=None): - super(ResNetBase, self).__init__() - - self._lock = threading.Lock() - self._block = block - self._norm_layer = nn.BatchNorm2d - self.inplanes = inplanes - self.dilation = 1 - self.groups = groups - self.base_width = width_per_group - - def _make_layer(self, planes, blocks, stride=1): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if stride != 1 or self.inplanes != planes * self._block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * self._block.expansion, stride), - norm_layer(planes * self._block.expansion), - ) - - layers = [] - layers.append(self._block(self.inplanes, planes, stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * self._block.expansion - for _ in range(1, blocks): - layers.append(self._block(self.inplanes, planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - def parameter_rrefs(self): - return [RRef(p) for p in self.parameters()] - - -Now, we are ready to define the two model shards. For the constructor, we -simply split all ResNet50 layers into two parts and move each part into the -provided device. The ``forward`` functions of both shards take an ``RRef`` of -the input data, fetch the data locally, and then move it to the expected device. -After applying all layers to the input, it moves the output to CPU and returns. 
-It is because the RPC API requires tensors to reside on CPU to avoid invalid -device errors when the numbers of devices in the caller and the callee do not -match. - - -.. code:: python - - class ResNetShard1(ResNetBase): - def __init__(self, device, *args, **kwargs): - super(ResNetShard1, self).__init__( - Bottleneck, 64, num_classes=num_classes, *args, **kwargs) - - self.device = device - self.seq = nn.Sequential( - nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False), - self._norm_layer(self.inplanes), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - self._make_layer(64, 3), - self._make_layer(128, 4, stride=2) - ).to(self.device) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def forward(self, x_rref): - x = x_rref.to_here().to(self.device) - with self._lock: - out = self.seq(x) - return out.cpu() - - - class ResNetShard2(ResNetBase): - def __init__(self, device, *args, **kwargs): - super(ResNetShard2, self).__init__( - Bottleneck, 512, num_classes=num_classes, *args, **kwargs) - - self.device = device - self.seq = nn.Sequential( - self._make_layer(256, 6, stride=2), - self._make_layer(512, 3, stride=2), - nn.AdaptiveAvgPool2d((1, 1)), - ).to(self.device) - - self.fc = nn.Linear(512 * self._block.expansion, num_classes).to(self.device) - - def forward(self, x_rref): - x = x_rref.to_here().to(self.device) - with self._lock: - out = self.fc(torch.flatten(self.seq(x), 1)) - return out.cpu() - - -Step 2: Stitch ResNet50 Model Shards Into One Module ----------------------------------------------------- - - -Then, we create a ``DistResNet50`` module to assemble the two shards and -implement the pipeline parallel logic. 
In the constructor, we use two -``rpc.remote`` calls to put the two shards on two different RPC workers -respectively and hold on to the ``RRef`` to the two model parts so that they -can be referenced in the forward pass. The ``forward`` function -splits the input batch into multiple micro-batches, and feeds these -micro-batches to the two model parts in a pipelined fashion. It first uses an -``rpc.remote`` call to apply the first shard to a micro-batch and then forwards -the returned intermediate output ``RRef`` to the second model shard. After that, -it collects the ``Future`` of all micro-outputs, and waits for all of them after -the loop. Note that both ``remote()`` and ``rpc_async()`` return immediately and -run asynchronously. Therefore, the entire loop is non-blocking, and will launch -multiple RPCs concurrently. The execution order of one micro-batch on two model -parts are preserved by intermediate output ``y_rref``. The execution order -across micro-batches does not matter. In the end, the forward function -concatenates outputs of all micro-batches into one single output tensor and -returns. The ``parameter_rrefs`` function is a helper to -simplify distributed optimizer construction, which will be used later. - - - -.. 
code:: python - - class DistResNet50(nn.Module): - def __init__(self, num_split, workers, *args, **kwargs): - super(DistResNet50, self).__init__() - - self.num_split = num_split - - # Put the first part of the ResNet50 on workers[0] - self.p1_rref = rpc.remote( - workers[0], - ResNetShard1, - args = ("cuda:0",) + args, - kwargs = kwargs - ) - - # Put the second part of the ResNet50 on workers[1] - self.p2_rref = rpc.remote( - workers[1], - ResNetShard2, - args = ("cuda:1",) + args, - kwargs = kwargs - ) - - def forward(self, xs): - out_futures = [] - for x in iter(xs.split(self.num_split, dim=0)): - x_rref = RRef(x) - y_rref = self.p1_rref.remote().forward(x_rref) - z_fut = self.p2_rref.rpc_async().forward(y_rref) - out_futures.append(z_fut) - - return torch.cat(torch.futures.wait_all(out_futures)) - - def parameter_rrefs(self): - remote_params = [] - remote_params.extend(self.p1_rref.remote().parameter_rrefs().to_here()) - remote_params.extend(self.p2_rref.remote().parameter_rrefs().to_here()) - return remote_params - - -Step 3: Define The Training Loop --------------------------------- - - -After defining the model, let us implement the training loop. We use a -dedicated "master" worker to prepare random inputs and labels, and control the -distributed backward pass and distributed optimizer step. It first creates an -instance of the ``DistResNet50`` module. It specifies the number of -micro-batches for each batch, and also provides the name of the two RPC workers -(i.e., "worker1", and "worker2"). Then it defines the loss function and creates -a ``DistributedOptimizer`` using the ``parameter_rrefs()`` helper to acquire a -list of parameter ``RRefs``. Then, the main training loop is very similar to -regular local training, except that it uses ``dist_autograd`` to launch -backward and provides the ``context_id`` for both backward and optimizer -``step()``. - - -.. 
code:: python - - import torch.distributed.autograd as dist_autograd - import torch.optim as optim - from torch.distributed.optim import DistributedOptimizer - - num_batches = 3 - batch_size = 120 - image_w = 128 - image_h = 128 - - - def run_master(num_split): - # put the two model parts on worker1 and worker2 respectively - model = DistResNet50(num_split, ["worker1", "worker2"]) - loss_fn = nn.MSELoss() - opt = DistributedOptimizer( - optim.SGD, - model.parameter_rrefs(), - lr=0.05, - ) - - one_hot_indices = torch.LongTensor(batch_size) \ - .random_(0, num_classes) \ - .view(batch_size, 1) - - for i in range(num_batches): - print(f"Processing batch {i}") - # generate random inputs and labels - inputs = torch.randn(batch_size, 3, image_w, image_h) - labels = torch.zeros(batch_size, num_classes) \ - .scatter_(1, one_hot_indices, 1) - - with dist_autograd.context() as context_id: - outputs = model(inputs) - dist_autograd.backward(context_id, [loss_fn(outputs, labels)]) - opt.step(context_id) - - -Step 4: Launch RPC Processes ----------------------------- - - -Finally, the code below shows the target function for all processes. The main -logic is defined in ``run_master``. The workers passively waiting for -commands from the master, and hence simply runs ``init_rpc`` and ``shutdown``, -where the ``shutdown`` by default will block until all RPC participants finish. - -.. 
code:: python - - import os - import time - - import torch.multiprocessing as mp - - - def run_worker(rank, world_size, num_split): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '29500' - options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=128) - - if rank == 0: - rpc.init_rpc( - "master", - rank=rank, - world_size=world_size, - rpc_backend_options=options - ) - run_master(num_split) - else: - rpc.init_rpc( - f"worker{rank}", - rank=rank, - world_size=world_size, - rpc_backend_options=options - ) - pass - - # block until all rpcs finish - rpc.shutdown() - - - if __name__=="__main__": - world_size = 3 - for num_split in [1, 2, 4, 8]: - tik = time.time() - mp.spawn(run_worker, args=(world_size, num_split), nprocs=world_size, join=True) - tok = time.time() - print(f"number of splits = {num_split}, execution time = {tok - tik}") +Redirecting to a newer tutorial in 3 seconds... +.. raw:: html + diff --git a/intermediate_source/dist_tuto.rst b/intermediate_source/dist_tuto.rst index 9a0ceb7a4a8..9a004aa67b4 100644 --- a/intermediate_source/dist_tuto.rst +++ b/intermediate_source/dist_tuto.rst @@ -38,7 +38,7 @@ simultaneously. If you have access to compute cluster you should check with your local sysadmin or use your favorite coordination tool (e.g., `pdsh `__, `clustershell `__, or -`others `__). For the purpose of this +`slurm `__). For the purpose of this tutorial, we will use a single machine and spawn multiple processes using the following template. @@ -47,6 +47,7 @@ the following template. """run.py:""" #!/usr/bin/env python import os + import sys import torch import torch.distributed as dist import torch.multiprocessing as mp @@ -64,11 +65,15 @@ the following template. 
if __name__ == "__main__": - size = 2 + world_size = 2 processes = [] - mp.set_start_method("spawn") + if "google.colab" in sys.modules: + print("Running in Google Colab") + mp.get_context("spawn") + else: + mp.set_start_method("spawn") for rank in range(size): - p = mp.Process(target=init_process, args=(rank, size, run)) + p = mp.Process(target=init_process, args=(rank, world_size, run)) p.start() processes.append(p) @@ -125,7 +130,7 @@ process 0 increments the tensor and sends it to process 1 so that they both end up with 1.0. Notice that process 1 needs to allocate memory in order to store the data it will receive. -Also notice that ``send``/``recv`` are **blocking**: both processes stop +Also notice that ``send/recv`` are **blocking**: both processes block until the communication is completed. On the other hand immediates are **non-blocking**; the script continues its execution and the methods return a ``Work`` object upon which we can choose to @@ -156,7 +161,8 @@ we should not modify the sent tensor nor access the received tensor before ``req In other words, - writing to ``tensor`` after ``dist.isend()`` will result in undefined behaviour. -- reading from ``tensor`` after ``dist.irecv()`` will result in undefined behaviour. +- reading from ``tensor`` after ``dist.irecv()`` will result in undefined + behaviour, until ``req.wait()`` has been executed. However, after ``req.wait()`` has been executed we are guaranteed that the communication took place, @@ -219,16 +225,23 @@ to obtain the sum of all tensors on all processes, we can use the Since we want the sum of all tensors in the group, we use ``dist.ReduceOp.SUM`` as the reduce operator. Generally speaking, any commutative mathematical operation can be used as an operator. 
-Out-of-the-box, PyTorch comes with 4 such operators, all working at the +Out-of-the-box, PyTorch comes with many such operators, all working at the element-wise level: - ``dist.ReduceOp.SUM``, - ``dist.ReduceOp.PRODUCT``, - ``dist.ReduceOp.MAX``, -- ``dist.ReduceOp.MIN``. +- ``dist.ReduceOp.MIN``, +- ``dist.ReduceOp.BAND``, +- ``dist.ReduceOp.BOR``, +- ``dist.ReduceOp.BXOR``, +- ``dist.ReduceOp.PREMUL_SUM``. + +The full list of supported operators is +`here `__. -In addition to ``dist.all_reduce(tensor, op, group)``, there are a total -of 6 collectives currently implemented in PyTorch. +In addition to ``dist.all_reduce(tensor, op, group)``, there are many additional collectives currently implemented in +PyTorch. Here are a few supported collectives. - ``dist.broadcast(tensor, src, group)``: Copies ``tensor`` from ``src`` to all other processes. @@ -244,6 +257,12 @@ of 6 collectives currently implemented in PyTorch. - ``dist.all_gather(tensor_list, tensor, group)``: Copies ``tensor`` from all processes to ``tensor_list``, on all processes. - ``dist.barrier(group)``: Blocks all processes in `group` until each one has entered this function. +- ``dist.all_to_all(output_tensor_list, input_tensor_list, group)``: Scatters list of input tensors to all processes in + a group and return gathered list of tensors in output list. + +The full list of supported collectives can be found by looking at the latest documentation for PyTorch Distributed +`(link) `__. + Distributed Training -------------------- @@ -275,7 +294,7 @@ gradients of their model on their batch of data and then average their gradients. In order to ensure similar convergence results when changing the number of processes, we will first have to partition our dataset. (You could also use -`tnt.dataset.SplitDataset `__, +`torch.utils.data.random_split `__, instead of the snippet below.) .. code:: python @@ -300,7 +319,7 @@ instead of the snippet below.) 
def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234): self.data = data self.partitions = [] - rng = Random() + rng = Random() # from random import Random rng.seed(seed) data_len = len(data) indexes = [x for x in range(0, data_len)] @@ -327,7 +346,7 @@ the following few lines: transforms.Normalize((0.1307,), (0.3081,)) ])) size = dist.get_world_size() - bsz = 128 / float(size) + bsz = 128 // size partition_sizes = [1.0 / size for _ in range(size)] partition = DataPartitioner(dataset, partition_sizes) partition = partition.use(dist.get_rank()) @@ -389,7 +408,7 @@ could train any model on a large computer cluster. lot more tricks `__ required to implement a production-level implementation of synchronous SGD. Again, use what `has been tested and -optimized `__. +optimized `__. Our Own Ring-Allreduce ~~~~~~~~~~~~~~~~~~~~~~ @@ -451,8 +470,9 @@ Communication Backends One of the most elegant aspects of ``torch.distributed`` is its ability to abstract and build on top of different backends. As mentioned before, -there are currently three backends implemented in PyTorch: Gloo, NCCL, and -MPI. They each have different specifications and tradeoffs, depending +there are multiple backends implemented in PyTorch. +Some of the most popular ones are Gloo, NCCL, and MPI. +They each have different specifications and tradeoffs, depending on the desired use case. A comparative table of supported functions can be found `here `__. @@ -544,15 +564,15 @@ NCCL backend is included in the pre-built binaries with CUDA support. Initialization Methods ~~~~~~~~~~~~~~~~~~~~~~ -To finish this tutorial, let's talk about the very first function we -called: ``dist.init_process_group(backend, init_method)``. In -particular, we will go over the different initialization methods which -are responsible for the initial coordination step between each process. -Those methods allow you to define how this coordination is done. 
-Depending on your hardware setup, one of these methods should be -naturally more suitable than the others. In addition to the following -sections, you should also have a look at the `official -documentation `__. +To conclude this tutorial, let's examine the initial function we invoked: +``dist.init_process_group(backend, init_method)``. Specifically, we will discuss the various +initialization methods responsible for the preliminary coordination step between each process. +These methods enable you to define how this coordination is accomplished. + +The choice of initialization method depends on your hardware setup, and one method may be more +suitable than others. In addition to the following sections, please refer to the `official +documentation `__ for further information. + **Environment Variable** @@ -569,7 +589,7 @@ finally handshake with them. - ``WORLD_SIZE``: The total number of processes, so that the master knows how many workers to wait for. - ``RANK``: Rank of each process, so they will know whether it is the - master of a worker. + master or a worker. **Shared File System** diff --git a/intermediate_source/dqn_with_rnn_tutorial.py b/intermediate_source/dqn_with_rnn_tutorial.py new file mode 100644 index 00000000000..6ea09559392 --- /dev/null +++ b/intermediate_source/dqn_with_rnn_tutorial.py @@ -0,0 +1,468 @@ +# -*- coding: utf-8 -*- + +""" +Recurrent DQN: Training recurrent policies +========================================== + +**Author**: `Vincent Moens `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to incorporating an RNN in an actor in TorchRL + * How to use that memory-based policy with a replay buffer and a loss module + + .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.0.0 + * gym[mujoco] + * tqdm +""" + +######################################################################### +# Overview +# -------- +# +# Memory-based policies are crucial not only when the observations are partially +# observable but also when the time dimension must be taken into account to +# make informed decisions. +# +# Recurrent neural network have long been a popular tool for memory-based +# policies. The idea is to keep a recurrent state in memory between two +# consecutive steps, and use this as an input to the policy along with the +# current observation. +# +# This tutorial shows how to incorporate an RNN in a policy using TorchRL. +# +# Key learnings: +# +# - Incorporating an RNN in an actor in TorchRL; +# - Using that memory-based policy with a replay buffer and a loss module. +# +# The core idea of using RNNs in TorchRL is to use TensorDict as a data carrier +# for the hidden states from one step to another. We'll build a policy that +# reads the previous recurrent state from the current TensorDict, and writes the +# current recurrent states in the TensorDict of the next state: +# +# .. figure:: /_static/img/rollout_recurrent.png +# :alt: Data collection with a recurrent policy +# +# As this figure shows, our environment populates the TensorDict with zeroed recurrent +# states which are read by the policy together with the observation to produce an +# action, and recurrent states that will be used for the next step. +# When the :func:`~torchrl.envs.utils.step_mdp` function is called, the recurrent states +# from the next state are brought to the current TensorDict. Let's see how this +# is implemented in practice. + +###################################################################### +# If you are running this in Google Colab, make sure you install the following dependencies: +# +# .. 
code-block:: bash +# +# !pip3 install torchrl +# !pip3 install gym[mujoco] +# !pip3 install tqdm +# +# Setup +# ----- +# + +# sphinx_gallery_start_ignore +import warnings + +warnings.filterwarnings("ignore") +from torch import multiprocessing + +# TorchRL prefers spawn method, that restricts creation of ``~torchrl.envs.ParallelEnv`` inside +# `__main__` method call, but for the easy of reading the code switch to fork +# which is also a default spawn method in Google's Colaboratory +try: + multiprocessing.set_start_method("fork") +except RuntimeError: + pass + +# sphinx_gallery_end_ignore + +import torch +import tqdm +from tensordict.nn import TensorDictModule as Mod, TensorDictSequential as Seq +from torch import nn +from torchrl.collectors import SyncDataCollector +from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer +from torchrl.envs import ( + Compose, + ExplorationType, + GrayScale, + InitTracker, + ObservationNorm, + Resize, + RewardScaling, + set_exploration_type, + StepCounter, + ToTensorImage, + TransformedEnv, +) +from torchrl.envs.libs.gym import GymEnv +from torchrl.modules import ConvNet, EGreedyModule, LSTMModule, MLP, QValueModule +from torchrl.objectives import DQNLoss, SoftUpdate + +is_fork = multiprocessing.get_start_method() == "fork" +device = ( + torch.device(0) + if torch.cuda.is_available() and not is_fork + else torch.device("cpu") +) + +###################################################################### +# Environment +# ----------- +# +# As usual, the first step is to build our environment: it helps us +# define the problem and build the policy network accordingly. For this tutorial, +# we'll be running a single pixel-based instance of the CartPole gym +# environment with some custom transforms: turning to grayscale, resizing to +# 84x84, scaling down the rewards and normalizing the observations. +# +# .. note:: +# The :class:`~torchrl.envs.transforms.StepCounter` transform is accessory. 
Since the CartPole +# task goal is to make trajectories as long as possible, counting the steps +# can help us track the performance of our policy. +# +# Two transforms are important for the purpose of this tutorial: +# +# - :class:`~torchrl.envs.transforms.InitTracker` will stamp the +# calls to :meth:`~torchrl.envs.EnvBase.reset` by adding a ``"is_init"`` +# boolean mask in the TensorDict that will track which steps require a reset +# of the RNN hidden states. +# - The :class:`~torchrl.envs.transforms.TensorDictPrimer` transform is a bit more +# technical. It is not required to use RNN policies. However, it +# instructs the environment (and subsequently the collector) that some extra +# keys are to be expected. Once added, a call to `env.reset()` will populate +# the entries indicated in the primer with zeroed tensors. Knowing that +# these tensors are expected by the policy, the collector will pass them on +# during collection. Eventually, we'll be storing our hidden states in the +# replay buffer, which will help us bootstrap the computation of the +# RNN operations in the loss module (which would otherwise be initiated +# with 0s). In summary: not including this transform will not impact hugely +# the training of our policy, but it will make the recurrent keys disappear +# from the collected data and the replay buffer, which will in turn lead to +# a slightly less optimal training. +# Fortunately, the :class:`~torchrl.modules.LSTMModule` we propose is +# equipped with a helper method to build just that transform for us, so +# we can wait until we build it! 
+# + +env = TransformedEnv( + GymEnv("CartPole-v1", from_pixels=True, device=device), + Compose( + ToTensorImage(), + GrayScale(), + Resize(84, 84), + StepCounter(), + InitTracker(), + RewardScaling(loc=0.0, scale=0.1), + ObservationNorm(standard_normal=True, in_keys=["pixels"]), + ), +) + +###################################################################### +# As always, we need to initialize manually our normalization constants: +# +env.transform[-1].init_stats(1000, reduce_dim=[0, 1, 2], cat_dim=0, keep_dims=[0]) +td = env.reset() + +###################################################################### +# Policy +# ------ +# +# Our policy will have 3 components: a :class:`~torchrl.modules.ConvNet` +# backbone, an :class:`~torchrl.modules.LSTMModule` memory layer and a shallow +# :class:`~torchrl.modules.MLP` block that will map the LSTM output onto the +# action values. +# +# Convolutional network +# ~~~~~~~~~~~~~~~~~~~~~ +# +# We build a convolutional network flanked with a :class:`torch.nn.AdaptiveAvgPool2d` +# that will squash the output in a vector of size 64. The :class:`~torchrl.modules.ConvNet` +# can assist us with this: +# + +feature = Mod( + ConvNet( + num_cells=[32, 32, 64], + squeeze_output=True, + aggregator_class=nn.AdaptiveAvgPool2d, + aggregator_kwargs={"output_size": (1, 1)}, + device=device, + ), + in_keys=["pixels"], + out_keys=["embed"], +) +###################################################################### +# we execute the first module on a batch of data to gather the size of the +# output vector: +# +n_cells = feature(env.reset())["embed"].shape[-1] + +###################################################################### +# LSTM Module +# ~~~~~~~~~~~ +# +# TorchRL provides a specialized :class:`~torchrl.modules.LSTMModule` class +# to incorporate LSTMs in your code-base. 
It is a :class:`~tensordict.nn.TensorDictModuleBase` +# subclass: as such, it has a set of ``in_keys`` and ``out_keys`` that indicate +# what values should be expected to be read and written/updated during the +# execution of the module. The class comes with customizable predefined +# values for these attributes to facilitate its construction. +# +# .. note:: +# *Usage limitations*: The class supports almost all LSTM features such as +# dropout or multi-layered LSTMs. +# However, to respect TorchRL's conventions, this LSTM must have the ``batch_first`` +# attribute set to ``True`` which is **not** the default in PyTorch. However, +# our :class:`~torchrl.modules.LSTMModule` changes this default +# behavior, so we're good with a native call. +# +# Also, the LSTM cannot have a ``bidirectional`` attribute set to ``True`` as +# this wouldn't be usable in online settings. In this case, the default value +# is the correct one. +# + +lstm = LSTMModule( + input_size=n_cells, + hidden_size=128, + device=device, + in_key="embed", + out_key="embed", +) + +###################################################################### +# Let us look at the LSTM Module class, specifically its in and out_keys: +print("in_keys", lstm.in_keys) +print("out_keys", lstm.out_keys) + +###################################################################### +# We can see that these values contain the key we indicated as the in_key (and out_key) +# as well as recurrent key names. The out_keys are preceded by a "next" prefix +# that indicates that they will need to be written in the "next" TensorDict. +# We use this convention (which can be overridden by passing the in_keys/out_keys +# arguments) to make sure that a call to :func:`~torchrl.envs.utils.step_mdp` will +# move the recurrent state to the root TensorDict, making it available to the +# RNN during the following call (see figure in the intro). 
+# +# As mentioned earlier, we have one more optional transform to add to our +# environment to make sure that the recurrent states are passed to the buffer. +# The :meth:`~torchrl.modules.LSTMModule.make_tensordict_primer` method does +# exactly that: +# +env.append_transform(lstm.make_tensordict_primer()) + +###################################################################### +# and that's it! We can print the environment to check that everything looks good now +# that we have added the primer: +print(env) + +###################################################################### +# MLP +# ~~~ +# +# We use a single-layer MLP to represent the action values we'll be using for +# our policy. +# +mlp = MLP( + out_features=2, + num_cells=[ + 64, + ], + device=device, +) +###################################################################### +# and fill the bias with zeros: + +mlp[-1].bias.data.fill_(0.0) +mlp = Mod(mlp, in_keys=["embed"], out_keys=["action_value"]) + +###################################################################### +# Using the Q-Values to select an action +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The last part of our policy is the Q-Value Module. +# The Q-Value module :class:`~torchrl.modules.tensordict_module.QValueModule` +# will read the ``"action_values"`` key that is produced by our MLP and +# from it, gather the action that has the maximum value. +# The only thing we need to do is to specify the action space, which can be done +# either by passing a string or an action-spec. This allows us to use +# Categorical (sometimes called "sparse") encoding or the one-hot version of it. +# +qval = QValueModule(spec=env.action_spec) + +###################################################################### +# .. note:: +# TorchRL also provides a wrapper class :class:`torchrl.modules.QValueActor` that +# wraps a module in a Sequential together with a :class:`~torchrl.modules.tensordict_module.QValueModule` +# like we are doing explicitly here. 
There is little advantage to do this +# and the process is less transparent, but the end results will be similar to +# what we do here. +# +# We can now put things together in a :class:`~tensordict.nn.TensorDictSequential` +# +stoch_policy = Seq(feature, lstm, mlp, qval) + +###################################################################### +# DQN being a deterministic algorithm, exploration is a crucial part of it. +# We'll be using an :math:`\epsilon`-greedy policy with an epsilon of 0.2 decaying +# progressively to 0. +# This decay is achieved via a call to :meth:`~torchrl.modules.EGreedyModule.step` +# (see training loop below). +# +exploration_module = EGreedyModule( + annealing_num_steps=1_000_000, spec=env.action_spec, eps_init=0.2 +) +stoch_policy = Seq( + stoch_policy, + exploration_module, +) + +###################################################################### +# Using the model for the loss +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The model as we've built it is well equipped to be used in sequential settings. +# However, the class :class:`torch.nn.LSTM` can use a cuDNN-optimized backend +# to run the RNN sequence faster on GPU device. We would not want to miss +# such an opportunity to speed up our training loop! +# To use it, we just need to tell the LSTM module to run on "recurrent-mode" +# when used by the loss. +# As we'll usually want to have two copies of the LSTM module, we do this by +# calling a :meth:`~torchrl.modules.LSTMModule.set_recurrent_mode` method that +# will return a new instance of the LSTM (with shared weights) that will +# assume that the input data is sequential in nature. +# +policy = Seq(feature, lstm.set_recurrent_mode(True), mlp, qval) + +###################################################################### +# Because we still have a couple of uninitialized parameters we should +# initialize them before creating an optimizer and such. 
+# +policy(env.reset()) + +###################################################################### +# DQN Loss +# -------- +# +# Out DQN loss requires us to pass the policy and, again, the action-space. +# While this may seem redundant, it is important as we want to make sure that +# the :class:`~torchrl.objectives.DQNLoss` and the :class:`~torchrl.modules.tensordict_module.QValueModule` +# classes are compatible, but aren't strongly dependent on each other. +# +# To use the Double-DQN, we ask for a ``delay_value`` argument that will +# create a non-differentiable copy of the network parameters to be used +# as a target network. +loss_fn = DQNLoss(policy, action_space=env.action_spec, delay_value=True) + +###################################################################### +# Since we are using a double DQN, we need to update the target parameters. +# We'll use a :class:`~torchrl.objectives.SoftUpdate` instance to carry out +# this work. +# +updater = SoftUpdate(loss_fn, eps=0.95) + +optim = torch.optim.Adam(policy.parameters(), lr=3e-4) + +###################################################################### +# Collector and replay buffer +# --------------------------- +# +# We build the simplest data collector there is. We'll try to train our algorithm +# with a million frames, extending the buffer with 50 frames at a time. The buffer +# will be designed to store 20 thousands trajectories of 50 steps each. +# At each optimization step (16 per data collection), we'll collect 4 items +# from our buffer, for a total of 200 transitions. +# We'll use a :class:`~torchrl.data.replay_buffers.LazyMemmapStorage` storage to keep the data +# on disk. +# +# .. note:: +# For the sake of efficiency, we're only running a few thousands iterations +# here. In a real setting, the total number of frames should be set to 1M. 
+# +collector = SyncDataCollector(env, stoch_policy, frames_per_batch=50, total_frames=200, device=device) +rb = TensorDictReplayBuffer( + storage=LazyMemmapStorage(20_000), batch_size=4, prefetch=10 +) + +###################################################################### +# Training loop +# ------------- +# +# To keep track of the progress, we will run the policy in the environment once +# every 50 data collection, and plot the results after training. +# + +utd = 16 +pbar = tqdm.tqdm(total=1_000_000) +longest = 0 + +traj_lens = [] +for i, data in enumerate(collector): + if i == 0: + print( + "Let us print the first batch of data.\nPay attention to the key names " + "which will reflect what can be found in this data structure, in particular: " + "the output of the QValueModule (action_values, action and chosen_action_value)," + "the 'is_init' key that will tell us if a step is initial or not, and the " + "recurrent_state keys.\n", + data, + ) + pbar.update(data.numel()) + # it is important to pass data that is not flattened + rb.extend(data.unsqueeze(0).to_tensordict().cpu()) + for _ in range(utd): + s = rb.sample().to(device, non_blocking=True) + loss_vals = loss_fn(s) + loss_vals["loss"].backward() + optim.step() + optim.zero_grad() + longest = max(longest, data["step_count"].max().item()) + pbar.set_description( + f"steps: {longest}, loss_val: {loss_vals['loss'].item(): 4.4f}, action_spread: {data['action'].sum(0)}" + ) + exploration_module.step(data.numel()) + updater.step() + + with set_exploration_type(ExplorationType.MODE), torch.no_grad(): + rollout = env.rollout(10000, stoch_policy) + traj_lens.append(rollout.get(("next", "step_count")).max().item()) + +###################################################################### +# Let's plot our results: +# +if traj_lens: + from matplotlib import pyplot as plt + + plt.plot(traj_lens) + plt.xlabel("Test collection") + plt.title("Test trajectory lengths") + 
+###################################################################### +# Conclusion +# ---------- +# +# We have seen how an RNN can be incorporated in a policy in TorchRL. +# You should now be able: +# +# - Create an LSTM module that acts as a :class:`~tensordict.nn.TensorDictModule` +# - Indicate to the LSTM module that a reset is needed via an :class:`~torchrl.envs.transforms.InitTracker` +# transform +# - Incorporate this module in a policy and in a loss module +# - Make sure that the collector is made aware of the recurrent state entries +# such that they can be stored in the replay buffer along with the rest of +# the data +# +# Further Reading +# --------------- +# +# - The TorchRL documentation can be found `here `_. diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst index dd76d08956f..786ef11f3b2 100644 --- a/intermediate_source/dynamic_quantization_bert_tutorial.rst +++ b/intermediate_source/dynamic_quantization_bert_tutorial.rst @@ -79,7 +79,7 @@ Mac: .. code:: shell - yes y | pip uninstall torch tochvision + yes y | pip uninstall torch torchvision yes y | pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html @@ -138,7 +138,7 @@ the following helper functions: one for converting the text examples into the feature vectors; The other one for measuring the F1 score of the predicted result. 
-The `glue_convert_examples_to_features `_ function converts the texts into input features: +The `glue_convert_examples_to_features `_ function converts the texts into input features: - Tokenize the input sequences; - Insert [CLS] in the beginning; @@ -147,7 +147,7 @@ The `glue_convert_examples_to_features `_ function has the compute metrics with +The `glue_compute_metrics `_ function has the compute metrics with the `F1 score `_, which can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The @@ -206,7 +206,7 @@ in `examples `_. +We provide the fine-tuned BERT model for MRPC task `here `_. To save time, you can download the model file (~400 MB) directly into your local folder ``$OUT_DIR``. 2.1 Set global configurations @@ -273,7 +273,7 @@ We load the tokenizer and fine-tuned BERT sequence classifier model 2.3 Define the tokenize and evaluation function ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We reuse the tokenize and evaluation function from `Huggingface `_. +We reuse the tokenize and evaluation function from `HuggingFace `_. .. 
code:: python @@ -414,7 +414,7 @@ We reuse the tokenize and evaluation function from `Huggingface /imagenet_class_index.json')) +# model = models.densenet121(weights='IMAGENET1K_V1') +# model.eval() # -# app = Flask(__name__) -# imagenet_class_index = json.load(open('/imagenet_class_index.json')) -# model = models.densenet121(weights='IMAGENET1K_V1') -# model.eval() # +# def transform_image(image_bytes): +# my_transforms = transforms.Compose([transforms.Resize(255), +# transforms.CenterCrop(224), +# transforms.ToTensor(), +# transforms.Normalize( +# [0.485, 0.456, 0.406], +# [0.229, 0.224, 0.225])]) +# image = Image.open(io.BytesIO(image_bytes)) +# return my_transforms(image).unsqueeze(0) # -# def transform_image(image_bytes): -# my_transforms = transforms.Compose([transforms.Resize(255), -# transforms.CenterCrop(224), -# transforms.ToTensor(), -# transforms.Normalize( -# [0.485, 0.456, 0.406], -# [0.229, 0.224, 0.225])]) -# image = Image.open(io.BytesIO(image_bytes)) -# return my_transforms(image).unsqueeze(0) # +# def get_prediction(image_bytes): +# tensor = transform_image(image_bytes=image_bytes) +# outputs = model.forward(tensor) +# _, y_hat = outputs.max(1) +# predicted_idx = str(y_hat.item()) +# return imagenet_class_index[predicted_idx] # -# def get_prediction(image_bytes): -# tensor = transform_image(image_bytes=image_bytes) -# outputs = model.forward(tensor) -# _, y_hat = outputs.max(1) -# predicted_idx = str(y_hat.item()) -# return imagenet_class_index[predicted_idx] +# +# @app.route('/predict', methods=['POST']) +# def predict(): +# if request.method == 'POST': +# file = request.files['file'] +# img_bytes = file.read() +# class_id, class_name = get_prediction(image_bytes=img_bytes) +# return jsonify({'class_id': class_id, 'class_name': class_name}) # # -# @app.route('/predict', methods=['POST']) -# def predict(): -# if request.method == 'POST': -# file = request.files['file'] -# img_bytes = file.read() -# class_id, class_name = 
get_prediction(image_bytes=img_bytes) -# return jsonify({'class_id': class_id, 'class_name': class_name}) +# if __name__ == '__main__': +# app.run() # # -# if __name__ == '__main__': -# app.run() - ###################################################################### # Let's test our web server! Run: # -# :: +# .. code-block:: sh +# +# FLASK_ENV=development FLASK_APP=app.py flask run # -# $ FLASK_ENV=development FLASK_APP=app.py flask run - ####################################################################### # We can use the # `requests `_ @@ -322,15 +286,15 @@ def get_prediction(image_bytes): # # resp = requests.post("http://localhost:5000/predict", # files={"file": open('/cat.jpg','rb')}) +# ####################################################################### # Printing `resp.json()` will now show the following: # -# :: +# .. code-block:: sh # # {"class_id": "n02124075", "class_name": "Egyptian_cat"} # - ###################################################################### # Next steps # -------------- @@ -368,3 +332,4 @@ def get_prediction(image_bytes): # # - Finally, we encourage you to check out our other tutorials on deploying PyTorch models # linked-to at the top of the page. +# diff --git a/intermediate_source/fx_conv_bn_fuser.py b/intermediate_source/fx_conv_bn_fuser.py index 90620ceba4e..547f93fb7f1 100644 --- a/intermediate_source/fx_conv_bn_fuser.py +++ b/intermediate_source/fx_conv_bn_fuser.py @@ -217,7 +217,7 @@ def fuse(model: torch.nn.Module) -> torch.nn.Module: ###################################################################### # Benchmarking our Fusion on ResNet18 -# ---------- +# ----------------------------------- # We can test our fusion pass on a larger model like ResNet18 and see how much # this pass improves inference performance. 
import torchvision.models as models diff --git a/intermediate_source/fx_profiling_tutorial.py b/intermediate_source/fx_profiling_tutorial.py index 18d8bc67cf4..8caaf7be39b 100644 --- a/intermediate_source/fx_profiling_tutorial.py +++ b/intermediate_source/fx_profiling_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ (beta) Building a Simple CPU Performance Profiler with FX -******************************************************* +********************************************************* **Author**: `James Reed `_ In this tutorial, we are going to use FX to do the following: diff --git a/intermediate_source/inductor_debug_cpu.py b/intermediate_source/inductor_debug_cpu.py index b534c432d88..4b6d62c0b0d 100644 --- a/intermediate_source/inductor_debug_cpu.py +++ b/intermediate_source/inductor_debug_cpu.py @@ -19,8 +19,8 @@ # # Meanwhile, you may also find related tutorials about ``torch.compile`` # around `basic usage `_, -# comprehensive `troubleshooting `_ -# and GPU-specific knowledge like `GPU performance profiling `_. +# comprehensive `troubleshooting `_ +# and GPU-specific knowledge like `GPU performance profiling `_. # # We will start debugging with a motivating example that triggers compilation issues and accuracy problems # by demonstrating the process of debugging to pinpoint the problems. @@ -64,7 +64,7 @@ def neg1(x): # # # Get more logging information -# ^^^^^^^^^^^^^^^^^ +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # No debugging information would be provided if you run this simple example by default. 
In order to get more useful debugging and logging information, we usually add a ``TORCH_COMPILE_DEBUG`` environment variable like below: # @@ -87,9 +87,9 @@ def neg1(x): # +-----------------------------+----------------------------------------------------------------+ # | ``fx_graph_transformed.py`` | Transformed FX graph, after pattern match | # +-----------------------------+----------------------------------------------------------------+ -# | ``ir_post_fusion.txt`` | Inductor IR before fusion | +# | ``ir_pre_fusion.txt`` | Inductor IR before fusion | # +-----------------------------+----------------------------------------------------------------+ -# | ``ir_pre_fusion.txt`` | Inductor IR after fusion | +# | ``ir_post_fusion.txt`` | Inductor IR after fusion | # +-----------------------------+----------------------------------------------------------------+ # | ``output_code.py`` | Generated Python code for graph, with C++/Triton kernels | # +-----------------------------+----------------------------------------------------------------+ @@ -98,6 +98,7 @@ def neg1(x): # Here are the main parts of code extracted from the files and we correlate the C++ generated line with the FX code line. # # ``fx_graph_runnable``: +# def forward1(self, arg0_1, arg1_1): neg = torch.ops.aten.neg.default(arg0_1); arg0_1 = None @@ -107,8 +108,10 @@ def forward1(self, arg0_1, arg1_1): ###################################################################### # C++ kernel in ``output_code``: +# -from torch._inductor.codecache import AsyncCompile +import torch +from torch._inductor.async_compile import AsyncCompile async_compile = AsyncCompile() cpp_fused_cat_maximum_neg_0 = async_compile.cpp(''' @@ -162,7 +165,7 @@ def forward1(self, arg0_1, arg1_1): # # As we know, the evolved chain of graph-level optimization is like: # -# :: +# .. 
code-block:: sh # # torch.neg (Python) -> torch.ops.aten.neg.default (within FX graph) -> ops.neg (within IR node) -> tmp2 = -tmp1 (within C++ kernel) # @@ -228,7 +231,7 @@ def neg2(x): ###################################################################### # IR node: # -# :: +# .. code-block:: sh # # buf0: SchedulerNode(ComputedBuffer) # buf0.writes = [MemoryDep('buf0', c0, {c0: 67120})] @@ -254,6 +257,7 @@ def neg2(x): # get_index_2 = self.get_index('index0') # store = ops.store('buf0', get_index_2, maximum, None) # return store +# ###################################################################### # According to the traceback logging, the compilation error is caused by the data type inconsistency of ``max_propagate_nan``'s inputs. @@ -304,7 +308,7 @@ def neg3(x): ###################################################################### # An accuracy problem would be raised as follows: # -# :: +# .. code-block:: sh # # torch._dynamo.utils: [ERROR] Accuracy failed: allclose not within tol=0.0001 # Traceback (most recent call last): @@ -314,13 +318,13 @@ def neg3(x): # # To debug an accuracy problem with Minifier, two environment variables are needed: # -# :: +# .. code-block:: sh # # TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4 python xx.py # # Which gives us logging information that demonstrates the steps of minifying: # -# :: +# .. code-block:: sh # # Started off with 6 nodes # @@ -339,7 +343,7 @@ def forward2(self, arg0_1): return (neg,) ###################################################################### -# For more usage details about Minifier, please refer to `Troubleshooting `_. +# For more usage details about Minifier, please refer to `Troubleshooting `_. 
###################################################################### diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py index e4bfd869160..03d6396a47e 100755 --- a/intermediate_source/mario_rl_tutorial.py +++ b/intermediate_source/mario_rl_tutorial.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ Train a Mario-playing RL Agent -================ +=============================== **Authors:** `Yuansong Feng `__, `Suraj Subramanian `__, `Howard Wang `__, `Steven Guo `__. @@ -32,6 +32,9 @@ # # %%bash # pip install gym-super-mario-bros==7.4.0 +# pip install tensordict==0.3.0 +# pip install torchrl==0.3.0 +# import torch from torch import nn @@ -40,7 +43,7 @@ import numpy as np from pathlib import Path from collections import deque -import random, datetime, os, copy +import random, datetime, os # Gym is an OpenAI toolkit for RL import gym @@ -196,7 +199,7 @@ def __init__(self, env, shape): def observation(self, observation): transforms = T.Compose( - [T.Resize(self.shape), T.Normalize(0, 255)] + [T.Resize(self.shape, antialias=True), T.Normalize(0, 255)] ) observation = transforms(observation).squeeze(0) return observation @@ -421,20 +424,10 @@ def __init__(self, input_dim, output_dim): if w != 84: raise ValueError(f"Expecting input width: 84, got: {w}") - self.online = nn.Sequential( - nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4), - nn.ReLU(), - nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), - nn.ReLU(), - nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), - nn.ReLU(), - nn.Flatten(), - nn.Linear(3136, 512), - nn.ReLU(), - nn.Linear(512, output_dim), - ) + self.online = self.__build_cnn(c, output_dim) - self.target = copy.deepcopy(self.online) + self.target = self.__build_cnn(c, output_dim) + self.target.load_state_dict(self.online.state_dict()) # Q_target parameters are frozen. 
for p in self.target.parameters(): @@ -446,6 +439,20 @@ def forward(self, input, model): elif model == "target": return self.target(input) + def __build_cnn(self, c, output_dim): + return nn.Sequential( + nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), + nn.ReLU(), + nn.Flatten(), + nn.Linear(3136, 512), + nn.ReLU(), + nn.Linear(512, output_dim), + ) + ###################################################################### # TD Estimate & TD Target @@ -771,7 +778,7 @@ def record(self, episode, epsilon, step): logger.log_episode() - if e % 20 == 0: + if (e % 20 == 0) or (e == episodes - 1): logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step) diff --git a/intermediate_source/model_parallel_tutorial.py b/intermediate_source/model_parallel_tutorial.py deleted file mode 100644 index d7a4da73371..00000000000 --- a/intermediate_source/model_parallel_tutorial.py +++ /dev/null @@ -1,357 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Single-Machine Model Parallel Best Practices -================================ -**Author**: `Shen Li `_ - -Model parallel is widely-used in distributed training -techniques. Previous posts have explained how to use -`DataParallel `_ -to train a neural network on multiple GPUs; this feature replicates the -same model to all GPUs, where each GPU consumes a different partition of the -input data. Although it can significantly accelerate the training process, it -does not work for some use cases where the model is too large to fit into a -single GPU. 
This post shows how to solve that problem by using **model parallel**, -which, in contrast to ``DataParallel``, splits a single model onto different GPUs, -rather than replicating the entire model on each GPU (to be concrete, say a model -``m`` contains 10 layers: when using ``DataParallel``, each GPU will have a -replica of each of these 10 layers, whereas when using model parallel on two GPUs, -each GPU could host 5 layers). - -The high-level idea of model parallel is to place different sub-networks of a -model onto different devices, and implement the ``forward`` method accordingly -to move intermediate outputs across devices. As only part of a model operates -on any individual device, a set of devices can collectively serve a larger -model. In this post, we will not try to construct huge models and squeeze them -into a limited number of GPUs. Instead, this post focuses on showing the idea -of model parallel. It is up to the readers to apply the ideas to real-world -applications. - -.. note:: - - For distributed model parallel training where a model spans multiple - servers, please refer to - `Getting Started With Distributed RPC Framework `__ - for examples and details. - -Basic Usage ------------ -""" - -###################################################################### -# Let us start with a toy model that contains two linear layers. To run this -# model on two GPUs, simply put each linear layer on a different GPU, and move -# inputs and intermediate outputs to match the layer devices accordingly. 
-# - -import torch -import torch.nn as nn -import torch.optim as optim - - -class ToyModel(nn.Module): - def __init__(self): - super(ToyModel, self).__init__() - self.net1 = torch.nn.Linear(10, 10).to('cuda:0') - self.relu = torch.nn.ReLU() - self.net2 = torch.nn.Linear(10, 5).to('cuda:1') - - def forward(self, x): - x = self.relu(self.net1(x.to('cuda:0'))) - return self.net2(x.to('cuda:1')) - -###################################################################### -# Note that, the above ``ToyModel`` looks very similar to how one would -# implement it on a single GPU, except the four ``to(device)`` calls which -# place linear layers and tensors on proper devices. That is the only place in -# the model that requires changes. The ``backward()`` and ``torch.optim`` will -# automatically take care of gradients as if the model is on one GPU. You only -# need to make sure that the labels are on the same device as the outputs when -# calling the loss function. - - -model = ToyModel() -loss_fn = nn.MSELoss() -optimizer = optim.SGD(model.parameters(), lr=0.001) - -optimizer.zero_grad() -outputs = model(torch.randn(20, 10)) -labels = torch.randn(20, 5).to('cuda:1') -loss_fn(outputs, labels).backward() -optimizer.step() - -###################################################################### -# Apply Model Parallel to Existing Modules -# ---------------------------------------- -# -# It is also possible to run an existing single-GPU module on multiple GPUs -# with just a few lines of changes. The code below shows how to decompose -# ``torchvision.models.resnet50()`` to two GPUs. The idea is to inherit from -# the existing ``ResNet`` module, and split the layers to two GPUs during -# construction. Then, override the ``forward`` method to stitch two -# sub-networks by moving the intermediate outputs accordingly. 
- - -from torchvision.models.resnet import ResNet, Bottleneck - -num_classes = 1000 - - -class ModelParallelResNet50(ResNet): - def __init__(self, *args, **kwargs): - super(ModelParallelResNet50, self).__init__( - Bottleneck, [3, 4, 6, 3], num_classes=num_classes, *args, **kwargs) - - self.seq1 = nn.Sequential( - self.conv1, - self.bn1, - self.relu, - self.maxpool, - - self.layer1, - self.layer2 - ).to('cuda:0') - - self.seq2 = nn.Sequential( - self.layer3, - self.layer4, - self.avgpool, - ).to('cuda:1') - - self.fc.to('cuda:1') - - def forward(self, x): - x = self.seq2(self.seq1(x).to('cuda:1')) - return self.fc(x.view(x.size(0), -1)) - - -###################################################################### -# The above implementation solves the problem for cases where the model is too -# large to fit into a single GPU. However, you might have already noticed that -# it will be slower than running it on a single GPU if your model fits. It is -# because, at any point in time, only one of the two GPUs are working, while -# the other one is sitting there doing nothing. The performance further -# deteriorates as the intermediate outputs need to be copied from ``cuda:0`` to -# ``cuda:1`` between ``layer2`` and ``layer3``. -# -# Let us run an experiment to get a more quantitative view of the execution -# time. In this experiment, we train ``ModelParallelResNet50`` and the existing -# ``torchvision.models.resnet50()`` by running random inputs and labels through -# them. After the training, the models will not produce any useful predictions, -# but we can get a reasonable understanding of the execution times. 
- - -import torchvision.models as models - -num_batches = 3 -batch_size = 120 -image_w = 128 -image_h = 128 - - -def train(model): - model.train(True) - loss_fn = nn.MSELoss() - optimizer = optim.SGD(model.parameters(), lr=0.001) - - one_hot_indices = torch.LongTensor(batch_size) \ - .random_(0, num_classes) \ - .view(batch_size, 1) - - for _ in range(num_batches): - # generate random inputs and labels - inputs = torch.randn(batch_size, 3, image_w, image_h) - labels = torch.zeros(batch_size, num_classes) \ - .scatter_(1, one_hot_indices, 1) - - # run forward pass - optimizer.zero_grad() - outputs = model(inputs.to('cuda:0')) - - # run backward pass - labels = labels.to(outputs.device) - loss_fn(outputs, labels).backward() - optimizer.step() - - -###################################################################### -# The ``train(model)`` method above uses ``nn.MSELoss`` as the loss function, -# and ``optim.SGD`` as the optimizer. It mimics training on ``128 X 128`` -# images which are organized into 3 batches where each batch contains 120 -# images. Then, we use ``timeit`` to run the ``train(model)`` method 10 times -# and plot the execution times with standard deviations. 
- - -import matplotlib.pyplot as plt -plt.switch_backend('Agg') -import numpy as np -import timeit - -num_repeat = 10 - -stmt = "train(model)" - -setup = "model = ModelParallelResNet50()" -mp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -mp_mean, mp_std = np.mean(mp_run_times), np.std(mp_run_times) - -setup = "import torchvision.models as models;" + \ - "model = models.resnet50(num_classes=num_classes).to('cuda:0')" -rn_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -rn_mean, rn_std = np.mean(rn_run_times), np.std(rn_run_times) - - -def plot(means, stds, labels, fig_name): - fig, ax = plt.subplots() - ax.bar(np.arange(len(means)), means, yerr=stds, - align='center', alpha=0.5, ecolor='red', capsize=10, width=0.6) - ax.set_ylabel('ResNet50 Execution Time (Second)') - ax.set_xticks(np.arange(len(means))) - ax.set_xticklabels(labels) - ax.yaxis.grid(True) - plt.tight_layout() - plt.savefig(fig_name) - plt.close(fig) - - -plot([mp_mean, rn_mean], - [mp_std, rn_std], - ['Model Parallel', 'Single GPU'], - 'mp_vs_rn.png') - - -###################################################################### -# -# .. figure:: /_static/img/model-parallel-images/mp_vs_rn.png -# :alt: -# -# The result shows that the execution time of model parallel implementation is -# ``4.02/3.75-1=7%`` longer than the existing single-GPU implementation. So we -# can conclude there is roughly 7% overhead in copying tensors back and forth -# across the GPUs. There are rooms for improvements, as we know one of the two -# GPUs is sitting idle throughout the execution. One option is to further -# divide each batch into a pipeline of splits, such that when one split reaches -# the second sub-network, the following split can be fed into the first -# sub-network. In this way, two consecutive splits can run concurrently on two -# GPUs. 
- -###################################################################### -# Speed Up by Pipelining Inputs -# ----------------------------- -# -# In the following experiments, we further divide each 120-image batch into -# 20-image splits. As PyTorch launches CUDA operations asynchronously, the -# implementation does not need to spawn multiple threads to achieve -# concurrency. - - -class PipelineParallelResNet50(ModelParallelResNet50): - def __init__(self, split_size=20, *args, **kwargs): - super(PipelineParallelResNet50, self).__init__(*args, **kwargs) - self.split_size = split_size - - def forward(self, x): - splits = iter(x.split(self.split_size, dim=0)) - s_next = next(splits) - s_prev = self.seq1(s_next).to('cuda:1') - ret = [] - - for s_next in splits: - # A. ``s_prev`` runs on ``cuda:1`` - s_prev = self.seq2(s_prev) - ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - - # B. ``s_next`` runs on ``cuda:0``, which can run concurrently with A - s_prev = self.seq1(s_next).to('cuda:1') - - s_prev = self.seq2(s_prev) - ret.append(self.fc(s_prev.view(s_prev.size(0), -1))) - - return torch.cat(ret) - - -setup = "model = PipelineParallelResNet50()" -pp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) -pp_mean, pp_std = np.mean(pp_run_times), np.std(pp_run_times) - -plot([mp_mean, rn_mean, pp_mean], - [mp_std, rn_std, pp_std], - ['Model Parallel', 'Single GPU', 'Pipelining Model Parallel'], - 'mp_vs_rn_vs_pp.png') - -###################################################################### -# Please note, device-to-device tensor copy operations are synchronized on -# current streams on the source and the destination devices. If you create -# multiple streams, you have to make sure that copy operations are properly -# synchronized. Writing the source tensor or reading/writing the destination -# tensor before finishing the copy operation can lead to undefined behavior. 
-# The above implementation only uses default streams on both source and -# destination devices, hence it is not necessary to enforce additional -# synchronizations. -# -# .. figure:: /_static/img/model-parallel-images/mp_vs_rn_vs_pp.png -# :alt: -# -# The experiment result shows that, pipelining inputs to model parallel -# ResNet50 speeds up the training process by roughly ``3.75/2.51-1=49%``. It is -# still quite far away from the ideal 100% speedup. As we have introduced a new -# parameter ``split_sizes`` in our pipeline parallel implementation, it is -# unclear how the new parameter affects the overall training time. Intuitively -# speaking, using small ``split_size`` leads to many tiny CUDA kernel launch, -# while using large ``split_size`` results to relatively long idle times during -# the first and last splits. Neither are optimal. There might be an optimal -# ``split_size`` configuration for this specific experiment. Let us try to find -# it by running experiments using several different ``split_size`` values. - - -means = [] -stds = [] -split_sizes = [1, 3, 5, 8, 10, 12, 20, 40, 60] - -for split_size in split_sizes: - setup = "model = PipelineParallelResNet50(split_size=%d)" % split_size - pp_run_times = timeit.repeat( - stmt, setup, number=1, repeat=num_repeat, globals=globals()) - means.append(np.mean(pp_run_times)) - stds.append(np.std(pp_run_times)) - -fig, ax = plt.subplots() -ax.plot(split_sizes, means) -ax.errorbar(split_sizes, means, yerr=stds, ecolor='red', fmt='ro') -ax.set_ylabel('ResNet50 Execution Time (Second)') -ax.set_xlabel('Pipeline Split Size') -ax.set_xticks(split_sizes) -ax.yaxis.grid(True) -plt.tight_layout() -plt.savefig("split_size_tradeoff.png") -plt.close(fig) - -###################################################################### -# -# .. 
figure:: /_static/img/model-parallel-images/split_size_tradeoff.png -# :alt: -# -# The result shows that setting ``split_size`` to 12 achieves the fastest -# training speed, which leads to ``3.75/2.43-1=54%`` speedup. There are -# still opportunities to further accelerate the training process. For example, -# all operations on ``cuda:0`` is placed on its default stream. It means that -# computations on the next split cannot overlap with the copy operation of the -# ``prev`` split. However, as ``prev`` and next splits are different tensors, there is -# no problem to overlap one's computation with the other one's copy. The -# implementation need to use multiple streams on both GPUs, and different -# sub-network structures require different stream management strategies. As no -# general multi-stream solution works for all model parallel use cases, we will -# not discuss it in this tutorial. -# -# **Note:** -# -# This post shows several performance measurements. You might see different -# numbers when running the same code on your own machine, because the result -# depends on the underlying hardware and software. To get the best performance -# for your environment, a proper approach is to first generate the curve to -# figure out the best split size, and then use that split size to pipeline -# inputs. -# diff --git a/intermediate_source/model_parallel_tutorial.rst b/intermediate_source/model_parallel_tutorial.rst new file mode 100644 index 00000000000..d687caf4634 --- /dev/null +++ b/intermediate_source/model_parallel_tutorial.rst @@ -0,0 +1,10 @@ +Single-Machine Model Parallel Best Practices +============================================ + +This tutorial has been deprecated. + +Redirecting to latest parallelism APIs in 3 seconds... + +.. 
raw:: html + + diff --git a/intermediate_source/nlp_from_scratch_index.rst b/intermediate_source/nlp_from_scratch_index.rst new file mode 100644 index 00000000000..95f70746cbc --- /dev/null +++ b/intermediate_source/nlp_from_scratch_index.rst @@ -0,0 +1,48 @@ +NLP from Scratch +================ + +In this three-part series you will build and train +a basic character-level Recurrent Neural Network (RNN) to classify words. + +You will learn: + +* How to construct Recurrent Neural Networks from scratch +* Essential data handling techniques for NLP +* How to train an RNN to identify the language origin of words. + +Before you begin, we recommend that you review the following: + +* `PyTorch Learn the Basics series `__ +* `How to install PyTorch `__ + +.. grid:: 3 + + .. grid-item-card:: :octicon:`file-code;1em` + NLP From Scratch - Part 1: Classifying Names with a Character-Level RNN + :link: https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html + :link-type: url + + Learn how to use an RNN to classify names into their language of origin. + +++ + :octicon:`code;1em` Code + + .. grid-item-card:: :octicon:`file-code;1em` + NLP From Scratch - Part 2: Generating Names with a Character-Level RNN + :link: https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html + :link-type: url + + Expand the RNN we created in Part 1 to generate names from languages. + +++ + :octicon:`code;1em` Code + + .. grid-item-card:: :octicon:`file-code;1em` + NLP From Scratch - Part 3: Translation with a Sequence to Sequence Network and Attention + :link: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html + :link-type: url + + Create a sequence-to-sequence model that can translate your text from French + to English. 
+ +++ + :octicon:`code;1em` Code + + diff --git a/intermediate_source/optimizer_step_in_backward_tutorial.py b/intermediate_source/optimizer_step_in_backward_tutorial.py index fd5fcb74fc2..fd72f733c50 100644 --- a/intermediate_source/optimizer_step_in_backward_tutorial.py +++ b/intermediate_source/optimizer_step_in_backward_tutorial.py @@ -147,7 +147,7 @@ def train(model, optimizer): # API on Tensor. # # ``Tensor.register_post_accumulate_grad_hook(hook)`` API and our technique -# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" # Our technique relies on not having to save the gradients during ``backward()``. Instead, # once a gradient has been accumulated, we will immediately apply the optimizer to # the corresponding parameter and drop that gradient entirely! This removes the need @@ -265,4 +265,4 @@ def train(model): # fusing the optimizer into the backward step through the new # ``Tensor.register_post_accumulate_grad_hook()`` API and *when* to apply this # technique (when gradients memory is significant). Along the way, we also learned -# about memory snapshots, which are generally useful in memory optimization. \ No newline at end of file +# about memory snapshots, which are generally useful in memory optimization. 
diff --git a/intermediate_source/parametrizations.py b/intermediate_source/parametrizations.py index 096836e933b..59cff1d241c 100644 --- a/intermediate_source/parametrizations.py +++ b/intermediate_source/parametrizations.py @@ -227,7 +227,7 @@ def __init__(self, n): def forward(self, X): # (I + X)(I - X)^{-1} - return torch.solve(self.Id + X, self.Id - X).solution + return torch.linalg.solve(self.Id - X, self.Id + X) layer = nn.Linear(3, 3) parametrize.register_parametrization(layer, "weight", Skew()) @@ -301,13 +301,13 @@ def __init__(self, n): def forward(self, X): # Assume X skew-symmetric # (I + X)(I - X)^{-1} - return torch.solve(self.Id + X, self.Id - X).solution + return torch.linalg.solve(self.Id - X, self.Id + X) def right_inverse(self, A): # Assume A orthogonal # See https://en.wikipedia.org/wiki/Cayley_transform#Matrix_map - # (X - I)(X + I)^{-1} - return torch.solve(X - self.Id, self.Id + X).solution + # (A - I)(A + I)^{-1} + return torch.linalg.solve(A + self.Id, self.Id - A) layer_orthogonal = nn.Linear(3, 3) parametrize.register_parametrization(layer_orthogonal, "weight", Skew()) diff --git a/intermediate_source/per_sample_grads.py b/intermediate_source/per_sample_grads.py index c423679229c..ece80d3f94f 100644 --- a/intermediate_source/per_sample_grads.py +++ b/intermediate_source/per_sample_grads.py @@ -42,7 +42,6 @@ def forward(self, x): x = F.relu(x) x = self.fc2(x) output = F.log_softmax(x, dim=1) - output = x return output def loss_fn(predictions, targets): diff --git a/intermediate_source/pinmem_nonblock.py b/intermediate_source/pinmem_nonblock.py new file mode 100644 index 00000000000..fa69507a0e7 --- /dev/null +++ b/intermediate_source/pinmem_nonblock.py @@ -0,0 +1,728 @@ +# -*- coding: utf-8 -*- +""" +A guide on good usage of ``non_blocking`` and ``pin_memory()`` in PyTorch +========================================================================= + +**Author**: `Vincent Moens `_ + +Introduction +------------ + +Transferring data from the 
CPU to the GPU is fundamental in many PyTorch applications. +It's crucial for users to understand the most effective tools and options available for moving data between devices. +This tutorial examines two key methods for device-to-device data transfer in PyTorch: +:meth:`~torch.Tensor.pin_memory` and :meth:`~torch.Tensor.to` with the ``non_blocking=True`` option. + +What you will learn +~~~~~~~~~~~~~~~~~~~ + +Optimizing the transfer of tensors from the CPU to the GPU can be achieved through asynchronous transfers and memory +pinning. However, there are important considerations: + +- Using ``tensor.pin_memory().to(device, non_blocking=True)`` can be up to twice as slow as a straightforward ``tensor.to(device)``. +- Generally, ``tensor.to(device, non_blocking=True)`` is an effective choice for enhancing transfer speed. +- While ``cpu_tensor.to("cuda", non_blocking=True).mean()`` executes correctly, attempting + ``cuda_tensor.to("cpu", non_blocking=True).mean()`` will result in erroneous outputs. + +Preamble +~~~~~~~~ + +The performance reported in this tutorial are conditioned on the system used to build the tutorial. +Although the conclusions are applicable across different systems, the specific observations may vary slightly +depending on the hardware available, especially on older hardware. +The primary objective of this tutorial is to offer a theoretical framework for understanding CPU to GPU data transfers. +However, any design decisions should be tailored to individual cases and guided by benchmarked throughput measurements, +as well as the specific requirements of the task at hand. + +""" + +import torch + +assert torch.cuda.is_available(), "A cuda device is required to run this tutorial" + + +###################################################################### +# +# This tutorial requires tensordict to be installed. If you don't have tensordict in your environment yet, install it +# by running the following command in a separate cell: +# +# .. 
code-block:: bash +# +# # Install tensordict with the following command +# !pip3 install tensordict +# +# We start by outlining the theory surrounding these concepts, and then move to concrete test examples of the features. +# +# +# Background +# ---------- +# +# .. _pinned_memory_background: +# +# Memory management basics +# ~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_memory: +# +# When one creates a CPU tensor in PyTorch, the content of this tensor needs to be placed +# in memory. The memory we talk about here is a rather complex concept worth looking at carefully. +# We distinguish two types of memory that are handled by the Memory Management Unit: the RAM (for simplicity) +# and the swap space on disk (which may or may not be the hard drive). Together, the available space in disk and RAM (physical memory) +# make up the virtual memory, which is an abstraction of the total resources available. +# In short, the virtual memory makes it so that the available space is larger than what can be found on RAM in isolation +# and creates the illusion that the main memory is larger than it actually is. +# +# In normal circumstances, a regular CPU tensor is pageable which means that it is divided in blocks called pages that +# can live anywhere in the virtual memory (both in RAM or on disk). As mentioned earlier, this has the advantage that +# the memory seems larger than what the main memory actually is. +# +# Typically, when a program accesses a page that is not in RAM, a "page fault" occurs and the operating system (OS) then brings +# back this page into RAM ("swap in" or "page in"). +# In turn, the OS may have to swap out (or "page out") another page to make room for the new page. +# +# In contrast to pageable memory, a pinned (or page-locked or non-pageable) memory is a type of memory that cannot +# be swapped out to disk. 
+# It allows for faster and more predictable access times, but has the downside that it is more limited than the +# pageable memory (aka the main memory). +# +# .. figure:: /_static/img/pinmem/pinmem.png +# :alt: +# +# CUDA and (non-)pageable memory +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_cuda_pageable_memory: +# +# To understand how CUDA copies a tensor from CPU to CUDA, let's consider the two scenarios above: +# +# - If the memory is page-locked, the device can access the memory directly in the main memory. The memory addresses are well +# defined and functions that need to read these data can be significantly accelerated. +# - If the memory is pageable, all the pages will have to be brought to the main memory before being sent to the GPU. +# This operation may take time and is less predictable than when executed on page-locked tensors. +# +# More precisely, when CUDA sends pageable data from CPU to GPU, it must first create a page-locked copy of that data +# before making the transfer. +# +# Asynchronous vs. Synchronous Operations with ``non_blocking=True`` (CUDA ``cudaMemcpyAsync``) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. _pinned_memory_async_sync: +# +# When executing a copy from a host (e.g., CPU) to a device (e.g., GPU), the CUDA toolkit offers modalities to do these +# operations synchronously or asynchronously with respect to the host. +# +# In practice, when calling :meth:`~torch.Tensor.to`, PyTorch always makes a call to +# `cudaMemcpyAsync `_. +# If ``non_blocking=False`` (default), a ``cudaStreamSynchronize`` will be called after each and every ``cudaMemcpyAsync``, making +# the call to :meth:`~torch.Tensor.to` blocking in the main thread. +# If ``non_blocking=True``, no synchronization is triggered, and the main thread on the host is not blocked. 
+# Therefore, from the host perspective, multiple tensors can be sent to the device simultaneously,
+# as the thread does not need to wait for one transfer to be completed to initiate the other.
+#
+# .. note:: In general, the transfer is blocking on the device side (even if it isn't on the host side):
+#   the copy on the device cannot occur while another operation is being executed.
+#   However, in some advanced scenarios, a copy and a kernel execution can be done simultaneously on the GPU side.
+#   As the following example will show, three requirements must be met to enable this:
+#
+#   1. The device must have at least one free DMA (Direct Memory Access) engine. Modern GPU architectures such as Volta,
+#      Tesla, or H100 devices have more than one DMA engine.
+#
+#   2. The transfer must be done on a separate, non-default CUDA stream. In PyTorch, CUDA streams can be handled using
+#      :class:`~torch.cuda.Stream`.
+#
+#   3. The source data must be in pinned memory.
+#
+# We demonstrate this by running profiles on the following script.
+#
+
+import contextlib
+
+from torch.cuda import Stream
+
+
+s = Stream()
+
+torch.manual_seed(42)
+t1_cpu_pinned = torch.randn(1024**2 * 5, pin_memory=True)
+t2_cpu_paged = torch.randn(1024**2 * 5, pin_memory=False)
+t3_cuda = torch.randn(1024**2 * 5, device="cuda:0")
+
+assert torch.cuda.is_available()
+device = torch.device("cuda", torch.cuda.current_device())
+
+
+# The function we want to profile
+def inner(pinned: bool, streamed: bool):
+    """Issue one non-blocking CPU-to-GPU copy, then an arithmetic kernel on ``t3_cuda``, and wait for both.
+
+    Args:
+        pinned: if ``True``, copy the pinned tensor ``t1_cpu_pinned``; otherwise copy the pageable ``t2_cpu_paged``.
+        streamed: if ``True``, issue the copy inside the side stream ``s``; otherwise no stream context is entered.
+    """
+    with torch.cuda.stream(s) if streamed else contextlib.nullcontext():
+        if pinned:
+            t1_cuda = t1_cpu_pinned.to(device, non_blocking=True)
+        else:
+            t2_cuda = t2_cpu_paged.to(device, non_blocking=True)
+        # The event is always recorded on ``s``, whether or not the copy was issued there
+        t_star_cuda_h2d_event = s.record_event()
+    # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
+    #  done in the other stream
+    t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda
+    t3_cuda_h2d_event = torch.cuda.current_stream().record_event()
+    # Block the host until both recorded events have completed
+    t_star_cuda_h2d_event.synchronize()
+    t3_cuda_h2d_event.synchronize()
+
+
+# Our profiler: profiles the `inner` function and stores the results in a .json file
+def benchmark_with_profiler(
+    pinned,
+    streamed,
+) -> None:
+    """Profile ``inner(pinned=..., streamed=...)`` and export a Chrome trace.
+
+    CPU and CUDA activities are recorded. The trace is written to
+    ``trace_streamed{0|1}_pinned{0|1}.json`` (relative path), where the digits are
+    ``int(streamed)`` and ``int(pinned)``.
+    """
+    torch._C._profiler._set_cuda_sync_enabled_val(True)
+    wait, warmup, active = 1, 1, 2
+    num_steps = wait + warmup + active
+    with torch.profiler.profile(
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.CUDA,
+        ],
+        schedule=torch.profiler.schedule(
+            wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1
+        ),
+    ) as prof:
+        # One profiler step per call to ``inner``
+        for _ in range(num_steps):
+            inner(streamed=streamed, pinned=pinned)
+            prof.step()
+    prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json")
+
+
+######################################################################
+# Loading these profile traces in Chrome (``chrome://tracing``) shows the following results: first, let's see
+# what happens if the arithmetic
operation on ``t3_cuda`` is executed after the pageable tensor is sent to GPU
+# in the main stream:
+#
+
+benchmark_with_profiler(streamed=False, pinned=False)
+
+######################################################################
+# .. figure:: /_static/img/pinmem/trace_streamed0_pinned0.png
+#    :alt:
+#
+# Using a pinned tensor doesn't change the trace much; both operations are still executed consecutively:
+
+benchmark_with_profiler(streamed=False, pinned=True)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed0_pinned1.png
+#    :alt:
+#
+# Sending a pageable tensor to GPU on a separate stream is also a blocking operation:
+
+benchmark_with_profiler(streamed=True, pinned=False)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed1_pinned0.png
+#    :alt:
+#
+# Only copies of pinned tensors to the GPU issued on a separate stream overlap with another CUDA kernel executed on
+# the main stream:
+
+benchmark_with_profiler(streamed=True, pinned=True)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed1_pinned1.png
+#    :alt:
+#
+# A PyTorch perspective
+# ---------------------
+#
+# .. _pinned_memory_pt_perspective:
+#
+# ``pin_memory()``
+# ~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_pinned:
+#
+# PyTorch offers the possibility to create and send tensors to page-locked memory through the
+# :meth:`~torch.Tensor.pin_memory` method and constructor arguments.
+# CPU tensors on a machine where CUDA is initialized can be cast to pinned memory through the :meth:`~torch.Tensor.pin_memory`
+# method. Importantly, ``pin_memory`` is blocking on the main thread of the host: it will wait for the tensor to be copied to
+# page-locked memory before executing the next operation.
+# New tensors can be directly created in pinned memory with functions like :func:`~torch.zeros`, :func:`~torch.ones` and other
+# constructors.
+#
+# Let us check the speed of pinning memory and sending tensors to CUDA:
+
+
+import torch
+import gc
+from torch.utils.benchmark import Timer
+import matplotlib.pyplot as plt
+
+
+def timer(cmd):
+    """Benchmark the statement ``cmd`` and return its median runtime, scaled by 1000 and reported as ms.
+
+    ``cmd`` is a string evaluated by ``torch.utils.benchmark.Timer`` against this module's globals, so it may
+    reference any module-level name. The median is also printed.
+    """
+    median = (
+        Timer(cmd, globals=globals())
+        .adaptive_autorange(min_run_time=1.0, max_run_time=20.0)
+        .median
+        * 1000
+    )
+    print(f"{cmd}: {median: 4.4f} ms")
+    return median
+
+
+# A tensor in pageable memory
+pageable_tensor = torch.randn(1_000_000)
+
+# A tensor in page-locked (pinned) memory
+pinned_tensor = torch.randn(1_000_000, pin_memory=True)
+
+# Runtimes:
+pageable_to_device = timer("pageable_tensor.to('cuda:0')")
+pinned_to_device = timer("pinned_tensor.to('cuda:0')")
+pin_mem = timer("pageable_tensor.pin_memory()")
+pin_mem_to_device = timer("pageable_tensor.pin_memory().to('cuda:0')")
+
+# Ratios (relative to the plain pageable copy):
+r1 = pinned_to_device / pageable_to_device
+r2 = pin_mem_to_device / pageable_to_device
+
+# Create a figure with the results
+fig, ax = plt.subplots()
+
+xlabels = [0, 1, 2]
+bar_labels = [
+    "pageable_tensor.to(device) (1x)",
+    f"pinned_tensor.to(device) ({r1:4.2f}x)",
+    f"pageable_tensor.pin_memory().to(device) ({r2:4.2f}x)"
+    f"\npin_memory()={100*pin_mem/pin_mem_to_device:.2f}% of runtime.",
+]
+values = [pageable_to_device, pinned_to_device, pin_mem_to_device]
+colors = ["tab:blue", "tab:red", "tab:orange"]
+ax.bar(xlabels, values, label=bar_labels, color=colors)
+
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Device casting runtime (pin-memory)")
+ax.set_xticks([])
+ax.legend()
+
+plt.show()
+
+# Clear tensors
+del pageable_tensor, pinned_tensor
+_ = gc.collect()
+
+######################################################################
+#
+# We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under
+# the hood, a pageable tensor must be copied to
pinned memory before being sent to GPU.
+#
+# However, contrary to a somewhat common belief, calling :meth:`~torch.Tensor.pin_memory` on a pageable tensor before
+# casting it to GPU should not bring any significant speed-up; on the contrary, this call is usually slower than just
+# executing the transfer. This makes sense, since we're actually asking Python to execute an operation that CUDA will
+# perform anyway before copying the data from host to device.
+#
+# .. note:: The PyTorch implementation of
+#   `pin_memory `_
+#   which relies on creating a brand new storage in pinned memory through `cudaHostAlloc `_
+#   could be, in rare cases, faster than transitioning data in chunks as ``cudaMemcpy`` does.
+#   Here too, the observation may vary depending on the available hardware, the size of the tensors being sent or
+#   the amount of available RAM.
+#
+# ``non_blocking=True``
+# ~~~~~~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_non_blocking:
+#
+# As mentioned earlier, many PyTorch operations have the option of being executed asynchronously with respect to the host
+# through the ``non_blocking`` argument.
+#
+# Here, to accurately account for the benefits of using ``non_blocking``, we will design a slightly more complex
+# experiment since we want to assess how fast it is to send multiple tensors to GPU with and without
+# ``non_blocking=True``.
+#
+
+
+# A simple loop that copies all tensors to cuda
+def copy_to_device(*tensors):
+    """Copy each tensor to ``cuda:0`` with the default (blocking) ``to`` and return the copies as a list."""
+    return [tensor.to("cuda:0") for tensor in tensors]
+
+
+# A loop that copies all tensors to cuda asynchronously
+def copy_to_device_nonblocking(*tensors):
+    """Queue a ``non_blocking=True`` copy of each tensor to ``cuda:0``, synchronize once, and return the copies."""
+    result = [tensor.to("cuda:0", non_blocking=True) for tensor in tensors]
+    # We need to synchronize
+    torch.cuda.synchronize()
+    return result
+
+
+# Create a list of tensors
+tensors = [torch.randn(1000) for _ in range(1000)]
+to_device = timer("copy_to_device(*tensors)")
+to_device_nonblocking = timer("copy_to_device_nonblocking(*tensors)")
+
+# Ratio
+r1 = to_device_nonblocking / to_device
+
+# Plot the results
+fig, ax = plt.subplots()
+
+xlabels = [0, 1]
+bar_labels = ["to(device) (1x)", f"to(device, non_blocking=True) ({r1:4.2f}x)"]
+colors = ["tab:blue", "tab:red"]
+values = [to_device, to_device_nonblocking]
+
+ax.bar(xlabels, values, label=bar_labels, color=colors)
+
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Device casting runtime (non-blocking)")
+ax.set_xticks([])
+ax.legend()
+
+plt.show()
+
+
+######################################################################
+# To get a better sense of what is happening here, let us profile these two functions:
+
+
+from torch.profiler import profile, ProfilerActivity
+
+
+def profile_mem(cmd):
+    """Execute the statement string ``cmd`` under a CPU-activity profiler and print a summary.
+
+    Prints ``cmd`` followed by the profiler's key-averages table (at most 10 rows). Returns ``None``.
+    """
+    with profile(activities=[ProfilerActivity.CPU]) as prof:
+        exec(cmd)
+    print(cmd)
+    print(prof.key_averages().table(row_limit=10))
+
+
+######################################################################
+# Let's see the call stack with a regular ``to(device)`` first:
+#
+
+# ``profile_mem`` prints its own report and returns ``None``, so the header is printed separately, before the call
+print("Call to `to(device)`")
+profile_mem("copy_to_device(*tensors)")
+
+######################################################################
+# and now the ``non_blocking`` version:
+#
+
+print("Call to `to(device, non_blocking=True)`")
+profile_mem("copy_to_device_nonblocking(*tensors)")
+
+
+######################################################################
+# The results are without any doubt better when using ``non_blocking=True``, as all transfers are initiated simultaneously
+# on the host side and only one synchronization is done.
+#
+# The benefit will vary depending on the number and the size of the tensors as well as depending on the hardware being
+# used.
+#
+# .. note:: Interestingly, the blocking ``to("cuda")`` actually performs the same asynchronous device casting operation
+#   (``cudaMemcpyAsync``) as the one with ``non_blocking=True`` with a synchronization point after each copy.
+#
+# Synergies
+# ~~~~~~~~~
+#
+# .. _pinned_memory_synergies:
+#
+# Now that we have made the point that data transfer of tensors already in pinned memory to GPU is faster than from
+# pageable memory, and that we know that doing these transfers asynchronously is also faster than synchronously, we can
+# benchmark combinations of these approaches. First, let's write a couple of new functions that will call ``pin_memory``
+# and ``to(device)`` on each tensor:
+#
+
+
+def pin_copy_to_device(*tensors):
+    """Pin each tensor, then copy it to ``cuda:0`` with the default (blocking) ``to``; return the copies as a list."""
+    return [cpu_tensor.pin_memory().to("cuda:0") for cpu_tensor in tensors]
+
+
+def pin_copy_to_device_nonblocking(*tensors):
+    """Pin each tensor and queue a ``non_blocking=True`` copy to ``cuda:0``; synchronize once and return the copies."""
+    on_device = [
+        cpu_tensor.pin_memory().to("cuda:0", non_blocking=True)
+        for cpu_tensor in tensors
+    ]
+    # A single synchronization once every copy has been queued
+    torch.cuda.synchronize()
+    return on_device
+
+
+######################################################################
+# The benefits of using :meth:`~torch.Tensor.pin_memory` are more pronounced for
+# somewhat large batches of large tensors:
+#
+
+tensors = [torch.randn(1_000_000) for _ in range(1000)]
+page_copy = timer("copy_to_device(*tensors)")
+page_copy_nb = timer("copy_to_device_nonblocking(*tensors)")
+
+tensors_pinned = [torch.randn(1_000_000, pin_memory=True) for _ in range(1000)]
+pinned_copy = timer("copy_to_device(*tensors_pinned)")
+pinned_copy_nb = 
timer("copy_to_device_nonblocking(*tensors_pinned)")
+
+# Pageable tensors that are pinned on the fly before each copy
+pin_and_copy = timer("pin_copy_to_device(*tensors)")
+pin_and_copy_nb = timer("pin_copy_to_device_nonblocking(*tensors)")
+
+# Plot
+strategies = ("pageable copy", "pinned copy", "pin and copy")
+blocking = {
+    "blocking": [page_copy, pinned_copy, pin_and_copy],
+    "non-blocking": [page_copy_nb, pinned_copy_nb, pin_and_copy_nb],
+}
+
+x = torch.arange(3)
+width = 0.25
+multiplier = 0
+
+
+fig, ax = plt.subplots(layout="constrained")
+
+# One bar series per dictionary entry; each series is shifted right by ``width`` relative to the previous one
+for attribute, runtimes in blocking.items():
+    offset = width * multiplier
+    rects = ax.bar(x + offset, runtimes, width, label=attribute)
+    ax.bar_label(rects, padding=3, fmt="%.2f")
+    multiplier += 1
+
+# Add some text for labels, title and custom x-axis tick labels, etc.
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Runtime (pin-mem and non-blocking)")
+ax.set_xticks([0, 1, 2])
+ax.set_xticklabels(strategies)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+ax.legend(loc="upper left", ncols=3)
+
+plt.show()
+
+# Release the benchmark tensors
+del tensors, tensors_pinned
+_ = gc.collect()
+
+
+######################################################################
+# Other copy directions (GPU -> CPU, CPU -> MPS)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_other_direction:
+#
+# Until now, we have operated under the assumption that asynchronous copies from the CPU to the GPU are safe.
+# This is generally true because CUDA automatically handles synchronization to ensure that the data being accessed is
+# valid at read time.
+# However, this guarantee does not extend to transfers in the opposite direction, from GPU to CPU.
+# Without explicit synchronization, these transfers offer no assurance that the copy will be complete at the time of
+# data access.
Consequently, the data on the host might be incomplete or incorrect, effectively rendering it garbage:
+#
+
+
+# 100 identical rows holding 1..999999, so the mean of the whole tensor is exactly 500000
+tensor = (
+    torch.arange(1, 1_000_000, dtype=torch.double, device="cuda")
+    .expand(100, 999999)
+    .clone()
+)
+# Sanity check on the device before any transfer
+torch.testing.assert_close(
+    tensor.mean(), torch.tensor(500_000, dtype=torch.double, device="cuda")
+)
+# GPU -> CPU copy read immediately, without synchronization
+try:
+    i = -1
+    for i in range(100):
+        cpu_tensor = tensor.to("cpu", non_blocking=True)
+        torch.testing.assert_close(
+            cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double)
+        )
+    print("No test failed with non_blocking")
+except AssertionError:
+    print(f"{i}th test failed with non_blocking. Skipping remaining tests")
+# Same copy, but with an explicit synchronization before the data is read
+try:
+    i = -1
+    for i in range(100):
+        cpu_tensor = tensor.to("cpu", non_blocking=True)
+        torch.cuda.synchronize()
+        torch.testing.assert_close(
+            cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double)
+        )
+    print("No test failed with synchronize")
+except AssertionError:
+    print(f"One test failed with synchronize: {i}th assertion!")
+
+
+######################################################################
+# The same considerations apply to copies from the CPU to non-CUDA devices, such as MPS.
+# Generally, asynchronous copies to a device are safe without explicit synchronization only when the target is a
+# CUDA-enabled device.
+#
+# In summary, copying data from CPU to GPU is safe when using ``non_blocking=True``, but for any other direction,
+# ``non_blocking=True`` can still be used but the user must make sure that a device synchronization is executed before
+# the data is accessed.
+#
+# Practical recommendations
+# -------------------------
+#
+# .. _pinned_memory_recommendations:
+#
+# We can now wrap up some early recommendations based on our observations:
+#
+# In general, ``non_blocking=True`` will provide good throughput, regardless of whether the original tensor is or
+# isn't in pinned memory.
+# If the tensor is already in pinned memory, the transfer can be accelerated, but sending it to
+# pinned memory manually from the Python main thread is a blocking operation on the host, and hence will annihilate much of
+# the benefit of using ``non_blocking=True`` (as CUDA does the ``pin_memory`` transfer anyway).
+#
+# One might now legitimately ask what use there is for the :meth:`~torch.Tensor.pin_memory` method.
+# In the following section, we will explore further how this can be used to accelerate the data transfer even more.
+#
+# Additional considerations
+# -------------------------
+#
+# .. _pinned_memory_considerations:
+#
+# PyTorch notably provides a :class:`~torch.utils.data.DataLoader` class whose constructor accepts a
+# ``pin_memory`` argument.
+# Considering our previous discussion on ``pin_memory``, you might wonder how the ``DataLoader`` manages to
+# accelerate data transfers if memory pinning is inherently blocking.
+#
+# The key lies in the DataLoader's use of a separate thread to handle the transfer of data from pageable to pinned
+# memory, thus preventing any blockage in the main thread.
+#
+# To illustrate this, we will use the TensorDict primitive from the homonymous library.
+# When invoking :meth:`~tensordict.TensorDict.to`, the default behavior is to send tensors to the device asynchronously,
+# followed by a single call to ``torch.cuda.synchronize()`` afterwards.
+#
+# Additionally, ``TensorDict.to()`` includes a ``non_blocking_pin`` option which initiates multiple threads to execute
+# ``pin_memory()`` before proceeding with ``to(device)``.
+# This approach can further accelerate data transfers, as demonstrated in the following example.
+#
+#
+
+from tensordict import TensorDict
+import torch
+from torch.utils.benchmark import Timer
+import matplotlib.pyplot as plt
+
+# Create the dataset
+td = TensorDict({str(i): torch.randn(1_000_000) for i in range(1000)})
+
+# Runtimes: explicit blocking copy, default ``to``, pin with ``num_threads=0``, pin with ``num_threads=4``
+copy_blocking = timer("td.to('cuda:0', non_blocking=False)")
+copy_non_blocking = timer("td.to('cuda:0')")
+copy_pin_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=0)")
+copy_pin_multithread_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=4)")
+
+# Ratios (relative to the blocking copy)
+r1 = copy_non_blocking / copy_blocking
+r2 = copy_pin_nb / copy_blocking
+r3 = copy_pin_multithread_nb / copy_blocking
+
+# Figure
+fig, ax = plt.subplots()
+
+xlabels = [0, 1, 2, 3]
+bar_labels = [
+    "Blocking copy (1x)",
+    f"Non-blocking copy ({r1:4.2f}x)",
+    f"Blocking pin, non-blocking copy ({r2:4.2f}x)",
+    f"Non-blocking pin, non-blocking copy ({r3:4.2f}x)",
+]
+values = [copy_blocking, copy_non_blocking, copy_pin_nb, copy_pin_multithread_nb]
+colors = ["tab:blue", "tab:red", "tab:orange", "tab:green"]
+
+ax.bar(xlabels, values, label=bar_labels, color=colors)
+
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Device casting runtime")
+ax.set_xticks([])
+ax.legend()
+
+plt.show()
+
+######################################################################
+# In this example, we are transferring many large tensors from the CPU to the GPU.
+# This scenario is ideal for utilizing multithreaded ``pin_memory()``, which can significantly enhance performance.
+# However, if the tensors are small, the overhead associated with multithreading may outweigh the benefits.
+# Similarly, if there are only a few tensors, the advantages of pinning tensors on separate threads become limited.
+#
+# As an additional note, while it might seem advantageous to create permanent buffers in pinned memory to shuttle
+# tensors from pageable memory before transferring them to the GPU, this strategy does not necessarily expedite
+# computation.
The inherent bottleneck caused by copying data into pinned memory remains a limiting factor.
+#
+# Moreover, transferring data that resides on disk (whether in shared memory or files) to the GPU typically requires an
+# intermediate step of copying the data into pinned memory (located in RAM).
+# Utilizing ``non_blocking`` for large data transfers in this context can significantly increase RAM consumption,
+# potentially leading to adverse effects.
+#
+# In practice, there is no one-size-fits-all solution.
+# The effectiveness of using multithreaded ``pin_memory`` combined with ``non_blocking`` transfers depends on a
+# variety of factors, including the specific system, operating system, hardware, and the nature of the tasks
+# being executed.
+# Here is a list of factors to check when trying to speed up data transfers between CPU and GPU, or comparing
+# throughputs across scenarios:
+#
+# - **Number of available cores**
+#
+#   How many CPU cores are available? Is the system shared with other users or processes that might compete for
+#   resources?
+#
+# - **Core utilization**
+#
+#   Are the CPU cores heavily utilized by other processes? Does the application perform other CPU-intensive tasks
+#   concurrently with data transfers?
+#
+# - **Memory utilization**
+#
+#   How much pageable and page-locked memory is currently being used? Is there sufficient free memory to allocate
+#   additional pinned memory without affecting system performance? Remember that nothing comes for free: for instance,
+#   ``pin_memory`` will consume RAM and may impact other tasks.
+#
+# - **CUDA Device Capabilities**
+#
+#   Does the GPU support multiple DMA engines for concurrent data transfers? What are the specific capabilities and
+#   limitations of the CUDA device being used?
+#
+# - **Number of tensors to be sent**
+#
+#   How many tensors are transferred in a typical operation?
+#
+# - **Size of the tensors to be sent**
+#
+#   What is the size of the tensors being transferred?
A few large tensors or many small tensors may not benefit from +# the same transfer program. +# +# - **System Architecture** +# +# How is the system's architecture influencing data transfer speeds (for example, bus speeds, network latency)? +# +# Additionally, allocating a large number of tensors or sizable tensors in pinned memory can monopolize a substantial +# portion of RAM. +# This reduces the available memory for other critical operations, such as paging, which can negatively impact the +# overall performance of an algorithm. +# +# Conclusion +# ---------- +# +# .. _pinned_memory_conclusion: +# +# Throughout this tutorial, we have explored several critical factors that influence transfer speeds and memory +# management when sending tensors from the host to the device. We've learned that using ``non_blocking=True`` generally +# accelerates data transfers, and that :meth:`~torch.Tensor.pin_memory` can also enhance performance if implemented +# correctly. However, these techniques require careful design and calibration to be effective. +# +# Remember that profiling your code and keeping an eye on the memory consumption are essential to optimize resource +# usage and achieve the best possible performance. +# +# Additional resources +# -------------------- +# +# .. _pinned_memory_resources: +# +# If you are dealing with issues with memory copies when using CUDA devices or want to learn more about +# what was discussed in this tutorial, check the following references: +# +# - `CUDA toolkit memory management doc `_; +# - `CUDA pin-memory note `_; +# - `How to Optimize Data Transfers in CUDA C/C++ `_; +# - `tensordict doc `_ and `repo `_. 
+# diff --git a/intermediate_source/pipeline_tutorial.py b/intermediate_source/pipeline_tutorial.py deleted file mode 100644 index 33561f60592..00000000000 --- a/intermediate_source/pipeline_tutorial.py +++ /dev/null @@ -1,420 +0,0 @@ -""" -Training Transformer models using Pipeline Parallelism -====================================================== - -**Author**: `Pritam Damania `_ - -This tutorial demonstrates how to train a large Transformer model across -multiple GPUs using pipeline parallelism. This tutorial is an extension of the -`Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ tutorial -and scales up the same model to demonstrate how pipeline parallelism can be -used to train Transformer models. - -Prerequisites: - - * `Pipeline Parallelism `__ - * `Sequence-to-Sequence Modeling with nn.Transformer and TorchText `__ -""" - - -###################################################################### -# Define the model -# ---------------- -# - - -###################################################################### -# In this tutorial, we will split a Transformer model across two GPUs and use -# pipeline parallelism to train the model. The model is exactly the same model -# used in the `Sequence-to-Sequence Modeling with nn.Transformer and TorchText -# `__ tutorial, -# but is split into two stages. The largest number of parameters belong to the -# `nn.TransformerEncoder `__ layer. -# The `nn.TransformerEncoder `__ -# itself consists of ``nlayers`` of `nn.TransformerEncoderLayer `__. -# As a result, our focus is on ``nn.TransformerEncoder`` and we split the model -# such that half of the ``nn.TransformerEncoderLayer`` are on one GPU and the -# other half are on another. To do this, we pull out the ``Encoder`` and -# ``Decoder`` sections into separate modules and then build an ``nn.Sequential`` -# representing the original Transformer module. 
- -import sys -import math -import torch -import torch.nn as nn -import torch.nn.functional as F -import tempfile -from torch.nn import TransformerEncoder, TransformerEncoderLayer - -if sys.platform == 'win32': - print('Windows platform is not supported for pipeline parallelism') - sys.exit(0) -if torch.cuda.device_count() < 2: - print('Need at least two GPU devices for this tutorial') - sys.exit(0) - -class Encoder(nn.Module): - def __init__(self, ntoken, ninp, dropout=0.5): - super(Encoder, self).__init__() - self.pos_encoder = PositionalEncoding(ninp, dropout) - self.encoder = nn.Embedding(ntoken, ninp) - self.ninp = ninp - self.init_weights() - - def init_weights(self): - initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, src): - # Need (S, N) format for encoder. - src = src.t() - src = self.encoder(src) * math.sqrt(self.ninp) - return self.pos_encoder(src) - -class Decoder(nn.Module): - def __init__(self, ntoken, ninp): - super(Decoder, self).__init__() - self.decoder = nn.Linear(ninp, ntoken) - self.init_weights() - - def init_weights(self): - initrange = 0.1 - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) - - def forward(self, inp): - # Need batch dimension first for output of pipeline. - return self.decoder(inp).permute(1, 0, 2) - - -###################################################################### -# ``PositionalEncoding`` module injects some information about the -# relative or absolute position of the tokens in the sequence. The -# positional encodings have the same dimension as the embeddings so that -# the two can be summed. Here, we use ``sine`` and ``cosine`` functions of -# different frequencies. 
- - -class PositionalEncoding(nn.Module): - - def __init__(self, d_model, dropout=0.1, max_len=5000): - super(PositionalEncoding, self).__init__() - self.dropout = nn.Dropout(p=dropout) - - pe = torch.zeros(max_len, d_model) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - pe = pe.unsqueeze(0).transpose(0, 1) - self.register_buffer('pe', pe) - - def forward(self, x): - x = x + self.pe[:x.size(0), :] - return self.dropout(x) - - - -###################################################################### -# Load and batch data -# ------------------- -# - - -###################################################################### -# The training process uses Wikitext-2 dataset from ``torchtext``. -# To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. -# -# The vocab object is built based on the train dataset and is used to numericalize -# tokens into tensors. Starting from sequential data, the ``batchify()`` -# function arranges the dataset into columns, trimming off any tokens remaining -# after the data has been divided into batches of size ``batch_size``. -# For instance, with the alphabet as the sequence (total length of 26) -# and a batch size of 4, we would divide the alphabet into 4 sequences of -# length 6: -# -# .. 
math:: -# -# \begin{bmatrix} -# \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z} -# \end{bmatrix} -# \Rightarrow -# \begin{bmatrix} -# \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} & -# \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} & -# \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} & -# \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix} -# \end{bmatrix} -# -# These columns are treated as independent by the model, which means that -# the dependence of ``G`` and ``F`` can not be learned, but allows more -# efficient batch processing. -# - -import torch -from torchtext.datasets import WikiText2 -from torchtext.data.utils import get_tokenizer -from torchtext.vocab import build_vocab_from_iterator - -train_iter = WikiText2(split='train') -tokenizer = get_tokenizer('basic_english') -vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=[""]) -vocab.set_default_index(vocab[""]) - -def data_process(raw_text_iter): - data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] - return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) - -train_iter, val_iter, test_iter = WikiText2() -train_data = data_process(train_iter) -val_data = data_process(val_iter) -test_data = data_process(test_iter) - -device = torch.device("cuda") - -def batchify(data, bsz): - # Divide the dataset into ``bsz`` parts. - nbatch = data.size(0) // bsz - # Trim off any extra elements that wouldn't cleanly fit (remainders). - data = data.narrow(0, 0, nbatch * bsz) - # Evenly divide the data across the ``bsz` batches. 
- data = data.view(bsz, -1).t().contiguous() - return data.to(device) - -batch_size = 20 -eval_batch_size = 10 -train_data = batchify(train_data, batch_size) -val_data = batchify(val_data, eval_batch_size) -test_data = batchify(test_data, eval_batch_size) - - -###################################################################### -# Functions to generate input and target sequence -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - - -###################################################################### -# ``get_batch()`` function generates the input and target sequence for -# the transformer model. It subdivides the source data into chunks of -# length ``bptt``. For the language modeling task, the model needs the -# following words as ``Target``. For example, with a ``bptt`` value of 2, -# we'd get the following two Variables for ``i`` = 0: -# -# .. image:: ../_static/img/transformer_input_target.png -# -# It should be noted that the chunks are along dimension 0, consistent -# with the ``S`` dimension in the Transformer model. The batch dimension -# ``N`` is along dimension 1. -# - -bptt = 25 -def get_batch(source, i): - seq_len = min(bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) - # Need batch dimension first for pipeline parallelism. - return data.t(), target - -###################################################################### -# Model scale and Pipe initialization -# ----------------------------------- -# - - -###################################################################### -# To demonstrate training large Transformer models using pipeline parallelism, -# we scale up the Transformer layers appropriately. We use an embedding -# dimension of 4096, hidden size of 4096, 16 attention heads and 12 total -# transformer layers (``nn.TransformerEncoderLayer``). This creates a model with -# **~1.4 billion** parameters. 
-# -# We need to initialize the `RPC Framework `__ -# since Pipe depends on the RPC framework via `RRef `__ -# which allows for future expansion to cross host pipelining. We need to -# initialize the RPC framework with only a single worker since we're using a -# single process to drive multiple GPUs. -# -# The pipeline is then initialized with 8 transformer layers on one GPU and 8 -# transformer layers on the other GPU. -# -# .. note:: -# For efficiency purposes we ensure that the ``nn.Sequential`` passed to -# ``Pipe`` only consists of two elements (corresponding to two GPUs), this -# allows the Pipe to work with only two partitions and avoid any -# cross-partition overheads. - -ntokens = len(vocab) # the size of vocabulary -emsize = 4096 # embedding dimension -nhid = 4096 # the dimension of the feedforward network model in ``nn.TransformerEncoder`` -nlayers = 12 # the number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder`` -nhead = 16 # the number of heads in the Multihead Attention models -dropout = 0.2 # the dropout value - -from torch.distributed import rpc -tmpfile = tempfile.NamedTemporaryFile() -rpc.init_rpc( - name="worker", - rank=0, - world_size=1, - rpc_backend_options=rpc.TensorPipeRpcBackendOptions( - init_method="file://{}".format(tmpfile.name), - # Specifying _transports and _channels is a workaround and we no longer - # will have to specify _transports and _channels for PyTorch - # versions >= 1.8.1 - _transports=["ibv", "uv"], - _channels=["cuda_ipc", "cuda_basic"], - ) -) - -num_gpus = 2 -partition_len = ((nlayers - 1) // num_gpus) + 1 - -# Add encoder in the beginning. -tmp_list = [Encoder(ntokens, emsize, dropout).cuda(0)] -module_list = [] - -# Add all the necessary transformer blocks. 
-for i in range(nlayers): - transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout) - if i != 0 and i % (partition_len) == 0: - module_list.append(nn.Sequential(*tmp_list)) - tmp_list = [] - device = i // (partition_len) - tmp_list.append(transformer_block.to(device)) - -# Add decoder in the end. -tmp_list.append(Decoder(ntokens, emsize).cuda(num_gpus - 1)) -module_list.append(nn.Sequential(*tmp_list)) - -from torch.distributed.pipeline.sync import Pipe - -# Build the pipeline. -chunks = 8 -model = Pipe(torch.nn.Sequential(*module_list), chunks = chunks) - - -def get_total_params(module: torch.nn.Module): - total_params = 0 - for param in module.parameters(): - total_params += param.numel() - return total_params - -print ('Total parameters in model: {:,}'.format(get_total_params(model))) - -###################################################################### -# Run the model -# ------------- -# - - -###################################################################### -# `CrossEntropyLoss `__ -# is applied to track the loss and -# `SGD `__ -# implements stochastic gradient descent method as the optimizer. The initial -# learning rate is set to 5.0. `StepLR `__ is -# applied to adjust the learn rate through epochs. During the -# training, we use -# `nn.utils.clip_grad_norm\_ `__ -# function to scale all the gradient together to prevent exploding. -# - -criterion = nn.CrossEntropyLoss() -lr = 5.0 # learning rate -optimizer = torch.optim.SGD(model.parameters(), lr=lr) -scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95) - -import time -def train(): - model.train() # Turn on the train mode - total_loss = 0. - start_time = time.time() - ntokens = len(vocab) - - # Train only for 50 batches to keep script execution time low. 
- nbatches = min(50 * bptt, train_data.size(0) - 1) - - for batch, i in enumerate(range(0, nbatches, bptt)): - data, targets = get_batch(train_data, i) - optimizer.zero_grad() - # Since the Pipe is only within a single host and process the ``RRef`` - # returned by forward method is local to this node and can simply - # retrieved via ``RRef.local_value()``. - output = model(data).local_value() - # Need to move targets to the device where the output of the - # pipeline resides. - loss = criterion(output.view(-1, ntokens), targets.cuda(1)) - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) - optimizer.step() - - total_loss += loss.item() - log_interval = 10 - if batch % log_interval == 0 and batch > 0: - cur_loss = total_loss / log_interval - elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | ' - 'lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, nbatches // bptt, scheduler.get_lr()[0], - elapsed * 1000 / log_interval, - cur_loss, math.exp(cur_loss))) - total_loss = 0 - start_time = time.time() - -def evaluate(eval_model, data_source): - eval_model.eval() # Turn on the evaluation mode - total_loss = 0. - ntokens = len(vocab) - # Evaluate only for 50 batches to keep script execution time low. - nbatches = min(50 * bptt, data_source.size(0) - 1) - with torch.no_grad(): - for i in range(0, nbatches, bptt): - data, targets = get_batch(data_source, i) - output = eval_model(data).local_value() - output_flat = output.view(-1, ntokens) - # Need to move targets to the device where the output of the - # pipeline resides. - total_loss += len(data) * criterion(output_flat, targets.cuda(1)).item() - return total_loss / (len(data_source) - 1) - -###################################################################### -# Loop over epochs. Save the model if the validation loss is the best -# we've seen so far. Adjust the learning rate after each epoch. 
- -best_val_loss = float("inf") -epochs = 3 # The number of epochs -best_model = None - -for epoch in range(1, epochs + 1): - epoch_start_time = time.time() - train() - val_loss = evaluate(model, val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) - - if val_loss < best_val_loss: - best_val_loss = val_loss - best_model = model - - scheduler.step() - - -###################################################################### -# Evaluate the model with the test dataset -# ------------------------------------- -# - - -###################################################################### -# Apply the best model to check the result with the test dataset. - -test_loss = evaluate(best_model, test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) diff --git a/intermediate_source/pipeline_tutorial.rst b/intermediate_source/pipeline_tutorial.rst new file mode 100644 index 00000000000..06f10a4a884 --- /dev/null +++ b/intermediate_source/pipeline_tutorial.rst @@ -0,0 +1,11 @@ +Training Transformer models using Pipeline Parallelism +====================================================== + +This tutorial has been deprecated. + +Redirecting to the latest parallelism APIs in 3 seconds... + +.. raw:: html + + + diff --git a/intermediate_source/pipelining_tutorial.rst b/intermediate_source/pipelining_tutorial.rst new file mode 100644 index 00000000000..36738011a40 --- /dev/null +++ b/intermediate_source/pipelining_tutorial.rst @@ -0,0 +1,240 @@ +Introduction to Distributed Pipeline Parallelism +================================================ +**Authors**: `Howard Huang `_ + +.. note:: + |edit| View and edit this tutorial in `github `__. 
+ +This tutorial uses a gpt-style transformer model to demonstrate implementing distributed +pipeline parallelism with `torch.distributed.pipelining `__ +APIs. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to use ``torch.distributed.pipelining`` APIs + * How to apply pipeline parallelism to a transformer model + * How to utilize different schedules on a set of microbatches + + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Familiarity with `basic distributed training `__ in PyTorch + +Setup +----- + +With ``torch.distributed.pipelining`` we will be partitioning the execution of a model and scheduling computation on micro-batches. We will be using a simplified version +of a transformer decoder model. The model architecture is for educational purposes and has multiple transformer decoder layers as we want to demonstrate how to split the model into different +chunks. First, let us define the model: + +.. code:: python + + import torch + import torch.nn as nn + from dataclasses import dataclass + + @dataclass + class ModelArgs: + dim: int = 512 + n_layers: int = 8 + n_heads: int = 8 + vocab_size: int = 10000 + + class Transformer(nn.Module): + def __init__(self, model_args: ModelArgs): + super().__init__() + + self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) + + # Using a ModuleDict lets us delete layers without affecting names, + # ensuring checkpoints will correctly save and load.
+ self.layers = torch.nn.ModuleDict() + for layer_id in range(model_args.n_layers): + self.layers[str(layer_id)] = nn.TransformerDecoderLayer(model_args.dim, model_args.n_heads) + + self.norm = nn.LayerNorm(model_args.dim) + self.output = nn.Linear(model_args.dim, model_args.vocab_size) + + def forward(self, tokens: torch.Tensor): + # Handling layers being 'None' at runtime enables easy pipeline splitting + h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens + + for layer in self.layers.values(): + h = layer(h, h) + + h = self.norm(h) if self.norm else h + output = self.output(h).clone() if self.output else h + return output + +Then, we need to import the necessary libraries in our script and initialize the distributed training process. In this case, we are defining some global variables to use +later in the script: + +.. code:: python + + import os + import torch.distributed as dist + from torch.distributed.pipelining import pipeline, SplitPoint, PipelineStage, ScheduleGPipe + + global rank, device, pp_group, stage_index, num_stages + def init_distributed(): + global rank, device, pp_group, stage_index, num_stages + rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + device = torch.device(f"cuda:{rank}") if torch.cuda.is_available() else torch.device("cpu") + dist.init_process_group() + + # This group can be a sub-group in the N-D parallel case + pp_group = dist.new_group() + stage_index = rank + num_stages = world_size + +The ``rank``, ``world_size``, and ``init_process_group()`` code should seem familiar to you as those are commonly used in +all distributed programs. The globals specific to pipeline parallelism include ``pp_group`` which is the process +group that will be used for send/recv communications, ``stage_index`` which, in this example, is a single rank +per stage so the index is equivalent to the rank, and ``num_stages`` which is equivalent to world_size. 
+ +The ``num_stages`` is used to set the number of stages that will be used in the pipeline parallelism schedule. For example, +for ``num_stages=4``, a microbatch will need to go through 4 forwards and 4 backwards before it is completed. The ``stage_index`` +is necessary for the framework to know how to communicate between stages. For example, for the first stage (``stage_index=0``), it will +use data from the dataloader and does not need to receive data from any previous peers to perform its computation. + + +Step 1: Partition the Transformer Model +--------------------------------------- + +There are two different ways of partitioning the model: + +First is the manual mode in which we can manually create two instances of the model by deleting portions of +attributes of the model. In this example for two stages (2 ranks), the model is cut in half. + +.. code:: python + + def manual_model_split(model) -> PipelineStage: + if stage_index == 0: + # prepare the first stage model + for i in range(4, 8): + del model.layers[str(i)] + model.norm = None + model.output = None + + elif stage_index == 1: + # prepare the second stage model + for i in range(4): + del model.layers[str(i)] + model.tok_embeddings = None + + stage = PipelineStage( + model, + stage_index, + num_stages, + device, + ) + return stage + +As we can see the first stage does not have the layer norm or the output layer, and it only includes the first four transformer blocks. +The second stage does not have the input embedding layers, but includes the output layers and the final four transformer blocks. The function +then returns the ``PipelineStage`` for the current rank. + +The second method is the tracer-based mode which automatically splits the model based on a ``split_spec`` argument. Using the pipeline specification, we can instruct +``torch.distributed.pipelining`` where to split the model. 
In the following code block, +we are splitting before the 4th transformer decoder layer, mirroring the manual split described above. Similarly, +we can retrieve a ``PipelineStage`` by calling ``build_stage`` after this splitting is done. + +.. code:: python + def tracer_model_split(model, example_input_microbatch) -> PipelineStage: + pipe = pipeline( + module=model, + mb_args=(example_input_microbatch,), + split_spec={ + "layers.4": SplitPoint.BEGINNING, + } + ) + stage = pipe.build_stage(stage_index, device, pp_group) + return stage + + +Step 2: Define The Main Execution +--------------------------------- + +In the main function we will create a particular pipeline schedule that the stages should follow. ``torch.distributed.pipelining`` +supports multiple schedules, including single-stage-per-rank schedules ``GPipe`` and ``1F1B``, +as well as multiple-stage-per-rank schedules such as ``Interleaved1F1B`` and ``LoopedBFS``. + +.. code:: python + + if __name__ == "__main__": + init_distributed() + num_microbatches = 4 + model_args = ModelArgs() + model = Transformer(model_args) + + # Dummy data + x = torch.ones(32, 500, dtype=torch.long) + y = torch.randint(0, model_args.vocab_size, (32, 500), dtype=torch.long) + example_input_microbatch = x.chunk(num_microbatches)[0] + + # Option 1: Manual model splitting + stage = manual_model_split(model) + + # Option 2: Tracer model splitting + # stage = tracer_model_split(model, example_input_microbatch) + + model.to(device) + x = x.to(device) + y = y.to(device) + + def tokenwise_loss_fn(outputs, targets): + loss_fn = nn.CrossEntropyLoss() + outputs = outputs.reshape(-1, model_args.vocab_size) + targets = targets.reshape(-1) + return loss_fn(outputs, targets) + + schedule = ScheduleGPipe(stage, n_microbatches=num_microbatches, loss_fn=tokenwise_loss_fn) + + if rank == 0: + schedule.step(x) + elif rank == 1: + losses = [] + output = schedule.step(target=y, losses=losses) +
print(f"losses: {losses}") + dist.destroy_process_group() + +In the example above, we are using the manual method to split the model, but the code can be uncommented to also try the +tracer-based model splitting function. In our schedule, we need to pass in the number of microbatches and +the loss function used to evaluate the targets. + +The ``.step()`` function processes the entire minibatch and automatically splits it into microbatches based +on the ``n_microbatches`` passed previously. The microbatches are then operated on according to the schedule class. +In the example above, we are using GPipe, which follows a simple all-forwards and then all-backwards schedule. The output +returned from rank 1 will be the same as if the model was on a single GPU and run with the entire batch. Similarly, +we can pass in a ``losses`` container to store the corresponding losses for each microbatch. + +Step 3: Launch the Distributed Processes +---------------------------------------- + +Finally, we are ready to run the script. We will use ``torchrun`` to create a single host, 2-process job. +Our script is already written in a way that rank 0 performs the required logic for pipeline stage 0, and rank 1 +performs the logic for pipeline stage 1. + +``torchrun --nnodes 1 --nproc_per_node 2 pipelining_tutorial.py`` + +Conclusion +---------- + +In this tutorial, we have learned how to implement distributed pipeline parallelism using PyTorch's ``torch.distributed.pipelining`` APIs. +We explored setting up the environment, defining a transformer model, and partitioning it for distributed training. +We discussed two methods of model partitioning, manual and tracer-based, and demonstrated how to schedule computations on +micro-batches across different stages. Finally, we covered the execution of the pipeline schedule and the launch of distributed +processes using ``torchrun``.
+ +Additional Resources +-------------------- + +We have successfully integrated ``torch.distributed.pipelining`` into the `torchtitan repository `__. TorchTitan is a clean, minimal code base for +large-scale LLM training using native PyTorch. For a production ready usage of pipeline +parallelism as well as composition with other distributed techniques, see +`TorchTitan end to end example of 3D parallelism `__. diff --git a/intermediate_source/process_group_cpp_extension_tutorial.rst b/intermediate_source/process_group_cpp_extension_tutorial.rst index 68c1afe4597..3c72a9e319b 100644 --- a/intermediate_source/process_group_cpp_extension_tutorial.rst +++ b/intermediate_source/process_group_cpp_extension_tutorial.rst @@ -1,7 +1,7 @@ Customize Process Group Backends Using Cpp Extensions ===================================================== -**Author**: `Howard Huang `, `Feng Tian `__, `Shen Li `__, `Min Si `__ +**Author**: `Howard Huang `__, `Feng Tian `__, `Shen Li `__, `Min Si `__ .. note:: |edit| View and edit this tutorial in `github `__. @@ -25,9 +25,8 @@ Basics PyTorch collective communications power several widely adopted distributed training features, including -`DistributedDataParallel `__, -`ZeroRedundancyOptimizer `__, -`FullyShardedDataParallel `__. +`DistributedDataParallel `__ and +`ZeroRedundancyOptimizer `__. In order to make the same collective communication API work with different communication backends, the distributed package abstracts collective communication operations into a @@ -100,7 +99,7 @@ repository for the full implementation. // The collective communication APIs without a custom implementation // will error out if invoked by application code. }; - + class WorkDummy : public Work { public: WorkDummy( @@ -266,8 +265,8 @@ After installation, you can conveniently use the ``dummy`` backend when calling `init_process_group `__ as if it is an builtin backend. 
-We can specify dispatching based on backend by changing the ``backend`` argument of ``init_process_group``. We -can dispatch collective with CPU tensor to ``gloo`` backend and dispatch collective with CUDA tensor to ``dummy`` backend by +We can specify dispatching based on backend by changing the ``backend`` argument of ``init_process_group``. We +can dispatch collective with CPU tensor to ``gloo`` backend and dispatch collective with CUDA tensor to ``dummy`` backend by specifying ``cpu:gloo,cuda:dummy`` as the backend argument. To send all tensors to ``dummy`` backend, we can simply specify ``dummy`` as the backend argument. diff --git a/intermediate_source/pruning_tutorial.py b/intermediate_source/pruning_tutorial.py index ba6701c8c35..346200502d5 100644 --- a/intermediate_source/pruning_tutorial.py +++ b/intermediate_source/pruning_tutorial.py @@ -44,9 +44,9 @@ class LeNet(nn.Module): def __init__(self): super(LeNet, self).__init__() - # 1 input image channel, 6 output channels, 3x3 square conv kernel - self.conv1 = nn.Conv2d(1, 6, 3) - self.conv2 = nn.Conv2d(6, 16, 3) + # 1 input image channel, 6 output channels, 5x5 square conv kernel + self.conv1 = nn.Conv2d(1, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5x5 image dimension self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) diff --git a/intermediate_source/realtime_rpi.rst b/intermediate_source/realtime_rpi.rst index 9b11f899a3b..bb1a576a2c2 100644 --- a/intermediate_source/realtime_rpi.rst +++ b/intermediate_source/realtime_rpi.rst @@ -312,7 +312,7 @@ Detecting a mug: Troubleshooting: Performance -~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyTorch by default will use all of the cores available. 
If you have anything running in the background on the Raspberry Pi it may cause contention with the diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py index 6501e98971e..ec2dc0a488d 100644 --- a/intermediate_source/reinforcement_ppo.py +++ b/intermediate_source/reinforcement_ppo.py @@ -104,6 +104,22 @@ # description and more about the algorithm itself. # +import warnings +warnings.filterwarnings("ignore") +from torch import multiprocessing + +# sphinx_gallery_start_ignore + +# TorchRL prefers spawn method, that restricts creation of ``~torchrl.envs.ParallelEnv`` inside +# `__main__` method call, but for the ease of reading the code switch to fork +# which is also a default spawn method in Google's Colaboratory +try: + multiprocessing.set_start_method("fork") +except RuntimeError: + pass + +# sphinx_gallery_end_ignore + from collections import defaultdict import matplotlib.pyplot as plt @@ -118,7 +134,7 @@ from torchrl.envs import (Compose, DoubleToFloat, ObservationNorm, StepCounter, TransformedEnv) from torchrl.envs.libs.gym import GymEnv -from torchrl.envs.utils import check_env_specs, set_exploration_mode +from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator from torchrl.objectives import ClipPPOLoss from torchrl.objectives.value import GAE @@ -137,7 +153,12 @@ # actually return ``frame_skip`` frames). # -device = "cpu" if not torch.has_cuda else "cuda:0" +is_fork = multiprocessing.get_start_method() == "fork" +device = ( + torch.device(0) + if torch.cuda.is_available() and not is_fork + else torch.device("cpu") +) num_cells = 256 # number of cells in each layer i.e. output dim. lr = 3e-4 max_grad_norm = 1.0 @@ -152,22 +173,10 @@ # use. In general, the goal of an RL algorithm is to learn to solve the task # as fast as it can in terms of environment interactions: the lower the ``total_frames`` # the better.
-# We also define a ``frame_skip``: in some contexts, repeating the same action -# multiple times over the course of a trajectory may be beneficial as it makes -# the behavior more consistent and less erratic. However, "skipping" -# too many frames will hamper training by reducing the reactivity of the actor -# to observation changes. -# -# When using ``frame_skip`` it is good practice to -# correct the other frame counts by the number of frames we are grouping -# together. If we configure a total count of X frames for training but -# use a ``frame_skip`` of Y, we will be actually collecting ``XY`` frames in total -# which exceeds our predefined budget. -# -frame_skip = 1 -frames_per_batch = 1000 // frame_skip +# +frames_per_batch = 1000 # For a complete training, bring the number of frames up to 1M -total_frames = 50_000 // frame_skip +total_frames = 50_000 ###################################################################### # PPO parameters @@ -196,14 +205,14 @@ # # In RL, an *environment* is usually the way we refer to a simulator or a # control system. Various libraries provide simulation environments for reinforcement -# learning, including Gymnasium (previously OpenAI Gym), DeepMind Control Suite, and +# learning, including Gymnasium (previously OpenAI Gym), DeepMind control suite, and # many others. # As a general library, TorchRL's goal is to provide an interchangeable interface # to a large panel of RL simulators, allowing you to easily swap one environment # with another. 
For example, creating a wrapped gym environment can be achieved with few characters: # -base_env = GymEnv("InvertedDoublePendulum-v4", device=device, frame_skip=frame_skip) +base_env = GymEnv("InvertedDoublePendulum-v4", device=device) ###################################################################### # There are a few things to notice in this code: first, we created @@ -262,7 +271,7 @@ Compose( # normalize observations ObservationNorm(in_keys=["observation"]), - DoubleToFloat(in_keys=["observation"]), + DoubleToFloat(), StepCounter(), ), ) @@ -410,8 +419,8 @@ in_keys=["loc", "scale"], distribution_class=TanhNormal, distribution_kwargs={ - "min": env.action_spec.space.minimum, - "max": env.action_spec.space.maximum, + "min": env.action_spec.space.low, + "max": env.action_spec.space.high, }, return_log_prob=True, # we'll need the log-prob for the numerator of the importance weights @@ -514,7 +523,7 @@ # replay_buffer = ReplayBuffer( - storage=LazyTensorStorage(frames_per_batch), + storage=LazyTensorStorage(max_size=frames_per_batch), sampler=SamplerWithoutReplacement(), ) @@ -546,16 +555,13 @@ ) loss_module = ClipPPOLoss( - actor=policy_module, - critic=value_module, - advantage_key="advantage", + actor_network=policy_module, + critic_network=value_module, clip_epsilon=clip_epsilon, entropy_bonus=bool(entropy_eps), entropy_coef=entropy_eps, # these keys match by default but we set this for completeness - value_target_key=advantage_module.value_target_key, critic_coef=1.0, - gamma=0.99, loss_critic_type="smooth_l1", ) @@ -586,7 +592,7 @@ logs = defaultdict(list) -pbar = tqdm(total=total_frames * frame_skip) +pbar = tqdm(total=total_frames) eval_str = "" # We iterate over the collector until it reaches the total number of frames it was @@ -618,7 +624,7 @@ optim.zero_grad() logs["reward"].append(tensordict_data["next", "reward"].mean().item()) - pbar.update(tensordict_data.numel() * frame_skip) + pbar.update(tensordict_data.numel()) cum_reward_str = ( f"average 
reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})" ) @@ -633,7 +639,7 @@ # number of steps (1000, which is our ``env`` horizon). # The ``rollout`` method of the ``env`` can take a policy as argument: # it will then execute this policy at each step. - with set_exploration_mode("mean"), torch.no_grad(): + with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad(): # execute a rollout with the trained policy eval_rollout = env.rollout(1000, policy_module) logs["eval reward"].append(eval_rollout["next", "reward"].mean().item()) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 78dc7e2fc6e..0ae3ea9a90c 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -7,7 +7,9 @@ This tutorial shows how to use PyTorch to train a Deep Q Learning (DQN) agent -on the CartPole-v1 task from `Gymnasium `__. +on the CartPole-v1 task from `Gymnasium `__. + +You might find it helpful to read the original `Deep Q Learning (DQN) `__ paper **Task** @@ -83,7 +85,11 @@ plt.ion() # if GPU is to be used -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +device = torch.device( + "cuda" if torch.cuda.is_available() else + "mps" if torch.backends.mps.is_available() else + "cpu" +) ###################################################################### @@ -227,7 +233,7 @@ def forward(self, x): # This cell instantiates our model and its optimizer, and defines some # utilities: # -# - ``select_action`` - will select an action accordingly to an epsilon +# - ``select_action`` - will select an action according to an epsilon # greedy policy. Simply put, we'll sometimes use our model for choosing # the action, and sometimes we'll just sample one uniformly. 
The # probability of choosing a random action will start at ``EPS_START`` @@ -283,7 +289,7 @@ def select_action(state): # t.max(1) will return the largest column value of each row. # second column on max result is index of where max element was # found, so we pick action with the larger expected reward. - return policy_net(state).max(1)[1].view(1, 1) + return policy_net(state).max(1).indices.view(1, 1) else: return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long) @@ -360,12 +366,12 @@ def optimize_model(): # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based - # on the "older" target_net; selecting their best reward with max(1)[0]. + # on the "older" target_net; selecting their best reward with max(1).values # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(BATCH_SIZE, device=device) with torch.no_grad(): - next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0] + next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch @@ -397,13 +403,13 @@ def optimize_model(): # can produce better results if convergence is not observed. 
# -if torch.cuda.is_available(): +if torch.cuda.is_available() or torch.backends.mps.is_available(): num_episodes = 600 else: num_episodes = 50 for i_episode in range(num_episodes): - # Initialize the environment and get it's state + # Initialize the environment and get its state state, info = env.reset() state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) for t in count(): diff --git a/intermediate_source/rpc_async_execution.rst b/intermediate_source/rpc_async_execution.rst index bd04fe33991..562d93bbbf8 100644 --- a/intermediate_source/rpc_async_execution.rst +++ b/intermediate_source/rpc_async_execution.rst @@ -199,7 +199,7 @@ speed. Batch-Processing CartPole Solver -------------------------------- -This section uses CartPole-v1 from `OpenAI Gym `__ as +This section uses CartPole-v1 from OpenAI Gym as an example to show the performance impact of batch processing RPC. Please note that since the goal is to demonstrate the usage of `@rpc.functions.async_execution `__ @@ -522,4 +522,3 @@ Learn More - `Batch-Updating Parameter Server Source Code `__ - `Batch-Processing CartPole Solver `__ - `Distributed Autograd `__ -- `Distributed Pipeline Parallelism `__ diff --git a/intermediate_source/rpc_param_server_tutorial.rst b/intermediate_source/rpc_param_server_tutorial.rst index 5531c51c0f8..324331646c5 100644 --- a/intermediate_source/rpc_param_server_tutorial.rst +++ b/intermediate_source/rpc_param_server_tutorial.rst @@ -310,12 +310,12 @@ We've now completed our trainer and parameter server specific code, and all that help="""Total number of participating processes. Should be the sum of master node and all training nodes.""") parser.add_argument( - "rank", + "--rank", type=int, default=None, help="Global rank of this process. 
Pass in 0 for master.") parser.add_argument( - "num_gpus", + "--num_gpus", type=int, default=0, help="""Number of GPUs to use for training, Currently supports between 0 diff --git a/intermediate_source/rpc_tutorial.rst b/intermediate_source/rpc_tutorial.rst index 835e6f0649f..dd8af47e62a 100644 --- a/intermediate_source/rpc_tutorial.rst +++ b/intermediate_source/rpc_tutorial.rst @@ -59,7 +59,7 @@ Distributed Reinforcement Learning using RPC and RRef ----------------------------------------------------- This section describes steps to build a toy distributed reinforcement learning -model using RPC to solve CartPole-v1 from `OpenAI Gym `__. +model using RPC to solve CartPole-v1 from `OpenAI Gym `__. The policy code is mostly borrowed from the existing single-thread `example `__ as shown below. We will skip details of the ``Policy`` design, and focus on RPC @@ -156,7 +156,7 @@ send commands. Applications don't need to worry about the lifetime of ``RRefs``. The owner of each ``RRef`` maintains a reference counting map to track its lifetime, and guarantees the remote data object will not be deleted as long as there is any live user of that ``RRef``. Please refer to the ``RRef`` -`design doc `__ for details. +`design doc `__ for details. .. code:: python @@ -531,7 +531,7 @@ the given arguments (i.e., ``lr=0.05``). In the training loop, it first creates a distributed autograd context, which will help the distributed autograd engine to find gradients and involved RPC send/recv functions. The design details of the distributed autograd engine can -be found in its `design note `__. +be found in its `design note `__. Then, it kicks off the forward pass as if it is a local model, and run the distributed backward pass. For the distributed backward, you only need to specify a list of roots, in this case, it is the loss ``Tensor``. 
diff --git a/intermediate_source/scaled_dot_product_attention_tutorial.py b/intermediate_source/scaled_dot_product_attention_tutorial.py index 2bfeb46b56c..35b1ba7be4e 100644 --- a/intermediate_source/scaled_dot_product_attention_tutorial.py +++ b/intermediate_source/scaled_dot_product_attention_tutorial.py @@ -86,29 +86,24 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs): print(f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds") # Lets explore the speed of each of the 3 implementations -from torch.backends.cuda import sdp_kernel, SDPBackend +from torch.nn.attention import SDPBackend, sdpa_kernel -# Helpful arguments mapper -backend_map = { - SDPBackend.MATH: {"enable_math": True, "enable_flash": False, "enable_mem_efficient": False}, - SDPBackend.FLASH_ATTENTION: {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False}, - SDPBackend.EFFICIENT_ATTENTION: { - "enable_math": False, "enable_flash": False, "enable_mem_efficient": True} -} -with sdp_kernel(**backend_map[SDPBackend.MATH]): - print(f"The math implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds") +with sdpa_kernel(SDPBackend.MATH): + math_time=benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value) + print(f"The math implementation runs in {math_time:.3f} microseconds") - -with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]): +with sdpa_kernel(SDPBackend.FLASH_ATTENTION): try: - print(f"The flash attention implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds") + flash_time=benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value) + print(f"The flash attention implementation runs in {flash_time:.3f} microseconds") except RuntimeError: 
print("FlashAttention is not supported. See warnings for reasons.") -with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]): +with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION): try: - print(f"The memory efficient implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds") + efficient_time=benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value) + print(f"The memory efficient implementation runs in {efficient_time:.3f} microseconds") except RuntimeError: print("EfficientAttention is not supported. See warnings for reasons.") @@ -119,7 +114,7 @@ def benchmark_torch_function_in_microseconds(f, *args, **kwargs): # # Depending on what machine you ran the above cell on and what hardware is # available, your results might be different. -# - If you don’t have a GPU and are running on CPU then the context manager +# - If you don’t have a GPU and are running on CPU then with FP32 the context manager # will have no effect and all three runs should return similar timings. # - Depending on what compute capability your graphics card supports # flash attention or memory efficient might have failed. 
@@ -239,7 +234,7 @@ def generate_rand_batch( # Currently the fused implementations don't support ``NestedTensor`` for training model.eval() -with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]): +with sdpa_kernel(SDPBackend.FLASH_ATTENTION): try: print(f"Random NT runs in {benchmark_torch_function_in_microseconds(model, random_nt):.3f} microseconds") print(f"Random Dense runs in {benchmark_torch_function_in_microseconds(model, random_dense):.3f} microseconds") @@ -249,7 +244,7 @@ def generate_rand_batch( ###################################################################### # Using SDPA with ``torch.compile`` -# ================================= +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # With the release of PyTorch 2.0, a new feature called # ``torch.compile()`` has been introduced, which can provide @@ -303,7 +298,8 @@ def generate_rand_batch( print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) # For even more insights, you can export the trace and use ``chrome://tracing`` to view the results -# :: +# +# .. code-block:: python # # prof.export_chrome_trace("compiled_causal_attention_trace.json"). @@ -327,14 +323,82 @@ def generate_rand_batch( # the Shakespeare dataset. # +###################################################################### +# Using SDPA with attn_bias subclasses +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# As of PyTorch 2.3, we have added a new submodule that contains tensor subclasses. +# Designed to be used with ``torch.nn.functional.scaled_dot_product_attention``. +# The module is named ``torch.nn.attention.bias`` and contains the following two +# utilities for generating causal attention variants: +# +# - ``torch.nn.attention.bias.causal_upper_left`` +# - ``torch.nn.attention.bias.causal_lower_right`` +# +# .. note:: +# The current argument ``is_causal`` in ``torch.nn.functional.scaled_dot_product_attention`` +# is the same as using ``torch.nn.attention.bias.causal_upper_left``. 
+# + +from torch.nn.attention.bias import causal_lower_right, causal_upper_left + +batch_size = 32 +sequence_length_q = 2 +sequence_length_kv = 10 +num_heads = 16 +embed_dimension = 32 + +dtype = torch.float16 + +query = torch.rand(batch_size, num_heads, sequence_length_q, embed_dimension, device=device, dtype=dtype) +key = torch.rand(batch_size, num_heads, sequence_length_kv, embed_dimension, device=device, dtype=dtype) +value = torch.rand(batch_size, num_heads, sequence_length_kv, embed_dimension, device=device, dtype=dtype) + +upper_left_bias = causal_upper_left(sequence_length_q, sequence_length_kv) +lower_right_bias = causal_lower_right(sequence_length_q, sequence_length_kv) + +print(type(upper_left_bias)) +print(type(lower_right_bias)) + +assert type(upper_left_bias) == type(lower_right_bias) +assert issubclass(type(upper_left_bias), torch.Tensor) + +# As you can see from the previous output, they are the same type ``torch.nn.attention.bias.CausalBias`` +# and subclass ``torch.Tensor`` + +# Let's see what these tensors look like +print(upper_left_bias) +print(lower_right_bias) + +# Upper Left Bias aligns the causal attention mask to the upper left corner of the attention scores matrix. +# This only has an impact when the attention scores matrix is not square, which is common for decoding use cases. +# Another way of thinking about this concept is that when you use upper left bias, +# the 0th token in the query is aligned to the 0th token in the key. +# Assuming the attention score matrix is two dimensional, ``attn_score[0][0]`` is the attention score +# between the 0th token in the query and the 0th token in the key. +# For lower right bias, the sequence of q is aligned so that the last token in q is aligned to the last token in k +# (for example, ``attn_score[-1][-1]`` is all True since the last token in q is at the same position as the last token in k) +# even if the sequence length of q and k are different. 
+ +# These objects are intended to be used with sdpa +out_upper_left = F.scaled_dot_product_attention(query, key, value, upper_left_bias) +out_lower_right = F.scaled_dot_product_attention(query, key, value, lower_right_bias) +out_is_causal = F.scaled_dot_product_attention(query, key, value, is_causal=True) + +assert torch.allclose(out_upper_left, out_is_causal) +assert not torch.allclose(out_upper_left, out_lower_right) + +# These attention biases should also be compatible with torch.compile +compiled_sdpa = torch.compile(F.scaled_dot_product_attention, fullgraph=True) +out_upper_left = compiled_sdpa(query, key, value, upper_left_bias) ###################################################################### # Conclusion -# ========== +# ~~~~~~~~~~~ # # In this tutorial, we have demonstrated the basic usage of # ``torch.nn.functional.scaled_dot_product_attention``. We have shown how -# the ``sdp_kernel`` context manager can be used to assert a certain +# the ``sdpa_kernel`` context manager can be used to assert a certain # implementation is used on GPU. As well, we built a simple # ``CausalSelfAttention`` module that works with ``NestedTensor`` and is torch # compilable. 
In the process we have shown how to the profiling tools can diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index 7e52f1eaea3..5de4bb4ca3e 100755 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -4,16 +4,20 @@ ******************************************************************************* **Author**: `Sean Robertson `_ -This is the third and final tutorial on doing "NLP From Scratch", where we +This tutorial is part of a three-part series: + +* `NLP From Scratch: Classifying Names with a Character-Level RNN `__ +* `NLP From Scratch: Generating Names with a Character-Level RNN `__ +* `NLP From Scratch: Translation with a Sequence to Sequence Network and Attention `__ + +This is the third and final tutorial on doing **NLP From Scratch**, where we write our own classes and functions to preprocess the data to do our NLP -modeling tasks. We hope after you complete this tutorial that you'll proceed to -learn how `torchtext` can handle much of this preprocessing for you in the -three tutorials immediately following this one. +modeling tasks. In this project we will be teaching a neural network to translate from French to English. -:: +.. code-block:: sh [KEY: > input, = target, < output] @@ -112,11 +116,11 @@ # download to ``data/eng-fra.txt`` before continuing. The file is a tab # separated list of translation pairs: # -# :: +# .. code-block:: sh # # I am cold. J'ai froid. # -# .. Note:: +# .. note:: # Download the data from # `here `_ # and extract it to the current directory. @@ -775,7 +779,7 @@ def evaluateRandomly(encoder, decoder, n=10): # single GRU layer. After about 40 minutes on a MacBook CPU we'll get some # reasonable results. # -# .. Note:: +# .. note:: # If you run this notebook you can train, interrupt the kernel, # evaluate, and continue training later. 
Comment out the lines where the # encoder and decoder are initialized and run ``trainIters`` again. diff --git a/intermediate_source/spatial_transformer_tutorial.py b/intermediate_source/spatial_transformer_tutorial.py index 49b6b0f0a2b..99efe41b39b 100644 --- a/intermediate_source/spatial_transformer_tutorial.py +++ b/intermediate_source/spatial_transformer_tutorial.py @@ -84,7 +84,7 @@ # # .. figure:: /_static/img/stn/stn-arch.png # -# .. Note:: +# .. note:: # We need the latest version of PyTorch that contains # affine_grid and grid_sample modules. # diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py index 2b241071b7f..3782ced18d9 100644 --- a/intermediate_source/tensorboard_profiler_tutorial.py +++ b/intermediate_source/tensorboard_profiler_tutorial.py @@ -4,6 +4,14 @@ This tutorial demonstrates how to use TensorBoard plugin with PyTorch Profiler to detect performance bottlenecks of the model. +.. warning:: + The TensorBoard integration with the PyTorch profiler is now + deprecated. Instead, use Perfetto or the Chrome trace to + view ``trace.json`` files. After + `generating a trace `__, + simply drag the ``trace.json`` into `Perfetto UI `__ + or ``chrome://tracing`` to visualize your profile. + Introduction ------------ PyTorch 1.8 includes an updated profiler API capable of @@ -36,6 +44,7 @@ # 4. Use TensorBoard to view results and analyze model performance # 5. Improve performance with the help of profiler # 6. Analyze performance with other advanced features +# 7. Additional Practices: Profiling PyTorch on AMD GPUs # # 1. Prepare the data and model # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -96,12 +105,12 @@ def train(data): # - ``schedule`` - callable that takes step (int) as a single parameter # and returns the profiler action to perform at each step. 
# -# In this example with ``wait=1, warmup=1, active=3, repeat=2``, +# In this example with ``wait=1, warmup=1, active=3, repeat=1``, # profiler will skip the first step/iteration, # start warming up on the second, # record the following three iterations, # after which the trace will become available and on_trace_ready (when set) is called. -# In total, the cycle repeats twice. Each cycle is called a "span" in TensorBoard plugin. +# In total, the cycle repeats once. Each cycle is called a "span" in TensorBoard plugin. # # During ``wait`` steps, the profiler is disabled. # During ``warmup`` steps, the profiler starts tracing but the results are discarded. @@ -120,31 +129,31 @@ def train(data): # clicking a stack frame will navigate to the specific code line. with torch.profiler.profile( - schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2), + schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'), record_shapes=True, profile_memory=True, with_stack=True ) as prof: for step, batch_data in enumerate(train_loader): - if step >= (1 + 1 + 3) * 2: + prof.step() # Need to call this at each step to notify profiler of steps' boundary. + if step >= 1 + 1 + 3: break train(batch_data) - prof.step() # Need to call this at the end of each step to notify profiler of steps' boundary. ###################################################################### # Alternatively, the following non-context manager start/stop is supported as well. 
prof = torch.profiler.profile( - schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2), + schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'), record_shapes=True, with_stack=True) prof.start() for step, batch_data in enumerate(train_loader): - if step >= (1 + 1 + 3) * 2: + prof.step() + if step >= 1 + 1 + 3: break train(batch_data) - prof.step() prof.stop() ###################################################################### @@ -156,7 +165,11 @@ def train(data): ###################################################################### # 4. Use TensorBoard to view results and analyze model performance -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. note:: +# TensorBoard Plugin support has been deprecated, so some of these functions may not +# work as previously. Please take a look at the replacement, `HTA `_. # # Install PyTorch Profiler TensorBoard Plugin. # @@ -174,7 +187,7 @@ def train(data): # ###################################################################### -# Open the TensorBoard profile URL in Google Chrome browser or Microsoft Edge browser. +# Open the TensorBoard profile URL in Google Chrome browser or Microsoft Edge browser (**Safari is not supported**). # # .. code-block:: # @@ -388,6 +401,102 @@ def train(data): # # The "Communication Operations Stats" summarizes the detailed statistics of all communication ops in each worker. +###################################################################### +# 7. Additional Practices: Profiling PyTorch on AMD GPUs +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# +# The AMD ROCm Platform is an open-source software stack designed for GPU computation, consisting of drivers, development tools, and APIs. +# We can run the above mentioned steps on AMD GPUs. 
In this section, we will use Docker to install the ROCm base development image +# before installing PyTorch. + + +###################################################################### +# For the purpose of example, let's create a directory called ``profiler_tutorial``, and save the code in **Step 1** as ``test_cifar10.py`` in this directory. +# +# .. code-block:: +# +# mkdir ~/profiler_tutorial +# cd profiler_tutorial +# vi test_cifar10.py + + +###################################################################### +# At the time of this writing, the Stable(``2.1.1``) Linux version of PyTorch on ROCm Platform is `ROCm 5.6 `_. +# +# +# - Obtain a base Docker image with the correct user-space ROCm version installed from `Docker Hub `_. +# +# It is ``rocm/dev-ubuntu-20.04:5.6``. +# +# - Start the ROCm base Docker container: +# +# +# .. code-block:: +# +# docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size 8G -v ~/profiler_tutorial:/profiler_tutorial rocm/dev-ubuntu-20.04:5.6 +# +# +# - Inside the container, install any dependencies needed for installing the wheels package. +# +# .. code-block:: +# +# sudo apt update +# sudo apt install libjpeg-dev python3-dev -y +# pip3 install wheel setuptools +# sudo apt install python-is-python3 +# +# +# - Install the wheels: +# +# .. code-block:: +# +# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6 +# +# +# - Install the ``torch_tb_profiler``, and then, run the Python file ``test_cifar10.py``: +# +# .. code-block:: +# +# pip install torch_tb_profiler +# cd /profiler_tutorial +# python test_cifar10.py +# +# +# Now, we have all the data needed to view in TensorBoard: +# +# .. code-block:: +# +# tensorboard --logdir=./log +# +# Choose different views as described in **Step 4**. For example, below is the **Operator** View: +# +# .. 
image:: ../../_static/img/profiler_rocm_tensorboard_operartor_view.png +# :scale: 25 % + + +###################################################################### +# At the time this section is written, **Trace** view does not work and it displays nothing. You can work around by typing ``chrome://tracing`` in your Chrome Browser. +# +# +# - Copy the ``trace.json`` file under ``~/profiler_tutorial/log/resnet18`` directory to the Windows. +# You may need to copy the file by using ``scp`` if the file is located in a remote location. +# +# - Click **Load** button to load the trace JSON file from the ``chrome://tracing`` page in the browser. +# +# .. image:: ../../_static/img/profiler_rocm_chrome_trace_view.png +# :scale: 25 % + + +###################################################################### +# As mentioned previously, you can move the graph and zoom in and out. +# You can also use keyboard to zoom and move around inside the timeline. +# The ``w`` and ``s`` keys zoom in centered around the mouse, +# and the ``a`` and ``d`` keys move the timeline left and right. +# You can hit these keys multiple times until you see a readable representation. + + + ###################################################################### # Learn More # ---------- @@ -395,5 +504,6 @@ def train(data): # Take a look at the following documents to continue your learning, # and feel free to open an issue `here `_. # -# - `Pytorch TensorBoard Profiler github `_ +# - `PyTorch TensorBoard Profiler Github `_ # - `torch.profiler API `_ +# - `HTA `_ diff --git a/intermediate_source/tiatoolbox_tutorial.rst b/intermediate_source/tiatoolbox_tutorial.rst new file mode 100644 index 00000000000..de9b3031330 --- /dev/null +++ b/intermediate_source/tiatoolbox_tutorial.rst @@ -0,0 +1,994 @@ +Whole Slide Image Classification Using PyTorch and TIAToolbox +============================================================= + +.. 
tip:: + To get the most out of this tutorial, we suggest using this + `Colab Version `_. This will allow you to experiment with the information presented below. + + +Introduction +------------ + +In this tutorial, we will show how to classify Whole Slide Images (WSIs) +using PyTorch deep learning models with help from TIAToolbox. A WSI +is an image of a sample of human tissue taken through a surgery or biopsy and +scanned using specialized scanners. They are used by pathologists and +computational pathology researchers to `study diseases such as cancer at the microscopic +level `__ in +order to understand for example tumor growth and help improve treatment +for patients. + +What makes WSIs challenging to process is their enormous size. For +example, a typical slide image has on the order of `100,000x100,000 +pixels `__ where each pixel can +correspond to about 0.25x0.25 microns on the slide. This introduces +challenges in loading and processing such images, not to mention +hundreds or even thousands of WSIs in a single study (larger studies +produce better results)! + +Conventional image processing pipelines are not suitable for WSI +processing so we need better tools. This is where +`TIAToolbox `__ can +help as it brings a set of useful tools to import and process tissue +slides in a fast and computationally efficient manner. Typically, WSIs +are saved in a pyramid structure with multiple copies of the same image +at various magnification levels optimized for visualization. The level 0 +(or the bottom level) of the pyramid contains the image at the highest +magnification or zoom level, whereas the higher levels in the pyramid +have a lower resolution copy of the base image. The pyramid structure is +sketched below. + +|WSI pyramid stack| *WSI pyramid stack +(*\ `source `__\ *)* + +TIAToolbox allows us to automate common downstream analysis tasks such +as `tissue +classification `__. In this +tutorial we show how you can: 1. Load WSI images using +TIAToolbox; and 2. 
Use different PyTorch models to classify slides at +the patch-level. In this tutorial, we will provide an example of using +TorchVision ``ResNet18`` model and custom +`HistoEncoder` `__ model. + +Let’s get started! + +.. |WSI pyramid stack| image:: ../_static/img/tiatoolbox_tutorial/read_bounds_tissue.webp + + +Setting up the environment +-------------------------- + +To run the examples provided in this tutorial, the following packages +are required as prerequisites. + +1. OpenJpeg +2. OpenSlide +3. Pixman +4. TIAToolbox +5. HistoEncoder (for a custom model example) + +Please run the following command in your terminal to install these +packages: + + +`apt-get -y -qq install libopenjp2-7-dev libopenjp2-tools openslide-tools libpixman-1-dev` +`pip install -q 'tiatoolbox<1.5' histoencoder && echo "Installation is done."` + + +Alternatively, you can run ``brew install openjpeg openslide`` to +install the prerequisite packages on MacOS instead of ``apt-get``. +Further information on installation can be `found +here `__. + + + +Importing related libraries +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code-block:: python + + + """Import modules required to run the Jupyter notebook.""" + from __future__ import annotations + + # Configure logging + import logging + import warnings + if logging.getLogger().hasHandlers(): + logging.getLogger().handlers.clear() + warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") + + # Downloading data and files + import shutil + from pathlib import Path + from zipfile import ZipFile + + # Data processing and visualization + import matplotlib as mpl + import matplotlib.pyplot as plt + import numpy as np + import pandas as pd + from matplotlib import cm + import PIL + import contextlib + import io + from sklearn.metrics import accuracy_score, confusion_matrix + + # TIAToolbox for WSI loading and processing + from tiatoolbox import logger + from tiatoolbox.models.architecture import vanilla + from tiatoolbox.models.engine.patch_predictor import ( + IOPatchPredictorConfig, + PatchPredictor, + ) + from tiatoolbox.utils.misc import download_data, grab_files_from_dir + from tiatoolbox.utils.visualization import overlay_prediction_mask + from tiatoolbox.wsicore.wsireader import WSIReader + + # Torch-related + import torch + from torchvision import transforms + + # Configure plotting + mpl.rcParams["figure.dpi"] = 160 # for high resolution figure in notebook + mpl.rcParams["figure.facecolor"] = "white" # To make sure text is visible in dark mode + + # If you are not using GPU, change ON_GPU to False + ON_GPU = True + + # Function to suppress console output for overly verbose code blocks + def suppress_console_output(): + return contextlib.redirect_stderr(io.StringIO()) + + + +Clean-up before a run +~~~~~~~~~~~~~~~~~~~~~ + +To ensure proper clean-up (for example in abnormal termination), all +files downloaded or created in this run are saved in a single directory +``global_save_dir``, which we set equal to “./tmp/”. 
To simplify +maintenance, the name of the directory occurs only at this one place, so +that it can easily be changed, if desired. + + + +.. code-block:: python + + + warnings.filterwarnings("ignore") + global_save_dir = Path("./tmp/") + + + def rmdir(dir_path: str | Path) -> None: + """Helper function to delete directory.""" + if Path(dir_path).is_dir(): + shutil.rmtree(dir_path) + logger.info("Removing directory %s", dir_path) + + + rmdir(global_save_dir) # remove directory if it exists from previous runs + global_save_dir.mkdir() + logger.info("Creating new directory %s", global_save_dir) + + + +Downloading the data +~~~~~~~~~~~~~~~~~~~~ + +For our sample data, we will use one whole-slide image, and patches from +the validation subset of `Kather +100k `__ dataset. + + + +.. code-block:: python + + + wsi_path = global_save_dir / "sample_wsi.svs" + patches_path = global_save_dir / "kather100k-validation-sample.zip" + weights_path = global_save_dir / "resnet18-kather100k.pth" + + logger.info("Download has started. Please wait...") + + # Downloading and unzip a sample whole-slide image + download_data( + "https://tiatoolbox.dcs.warwick.ac.uk/sample_wsis/TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5A-FCBEDFC2394F.svs", + wsi_path, + ) + + # Download and unzip a sample of the validation set used to train the Kather 100K dataset + download_data( + "https://tiatoolbox.dcs.warwick.ac.uk/datasets/kather100k-validation-sample.zip", + patches_path, + ) + with ZipFile(patches_path, "r") as zipfile: + zipfile.extractall(path=global_save_dir) + + # Download pretrained model weights for WSI classification using ResNet18 architecture + download_data( + "https://tiatoolbox.dcs.warwick.ac.uk/models/pc/resnet18-kather100k.pth", + weights_path, + ) + + logger.info("Download is complete.") + + + +Reading the data +---------------- + +We create a list of patches and a list of corresponding labels. 
For +example, the first label in ``label_list`` will indicate the class of +the first image patch in ``patch_list``. + + + +.. code-block:: python + + + # Read the patch data and create a list of patches and a list of corresponding labels + dataset_path = global_save_dir / "kather100k-validation-sample" + + # Set the path to the dataset + image_ext = ".tif" # file extension of each image + + # Obtain the mapping between the label ID and the class name + label_dict = { + "BACK": 0, # Background (empty glass region) + "NORM": 1, # Normal colon mucosa + "DEB": 2, # Debris + "TUM": 3, # Colorectal adenocarcinoma epithelium + "ADI": 4, # Adipose + "MUC": 5, # Mucus + "MUS": 6, # Smooth muscle + "STR": 7, # Cancer-associated stroma + "LYM": 8, # Lymphocytes + } + + class_names = list(label_dict.keys()) + class_labels = list(label_dict.values()) + + # Generate a list of patches and generate the label from the filename + patch_list = [] + label_list = [] + for class_name, label in label_dict.items(): + dataset_class_path = dataset_path / class_name + patch_list_single_class = grab_files_from_dir( + dataset_class_path, + file_types="*" + image_ext, + ) + patch_list.extend(patch_list_single_class) + label_list.extend([label] * len(patch_list_single_class)) + + # Show some dataset statistics + plt.bar(class_names, [label_list.count(label) for label in class_labels]) + plt.xlabel("Patch types") + plt.ylabel("Number of patches") + + # Count the number of examples per class + for class_name, label in label_dict.items(): + logger.info( + "Class ID: %d -- Class Name: %s -- Number of images: %d", + label, + class_name, + label_list.count(label), + ) + + # Overall dataset statistics + logger.info("Total number of patches: %d", (len(patch_list))) + + + + + +.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png + :alt: tiatoolbox tutorial + :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_001.png + :class: sphx-glr-single-img + + +.. 
rst-class:: sphx-glr-script-out + + .. code-block:: none + + |2023-11-14|13:15:59.299| [INFO] Class ID: 0 -- Class Name: BACK -- Number of images: 211 + |2023-11-14|13:15:59.299| [INFO] Class ID: 1 -- Class Name: NORM -- Number of images: 176 + |2023-11-14|13:15:59.299| [INFO] Class ID: 2 -- Class Name: DEB -- Number of images: 230 + |2023-11-14|13:15:59.299| [INFO] Class ID: 3 -- Class Name: TUM -- Number of images: 286 + |2023-11-14|13:15:59.299| [INFO] Class ID: 4 -- Class Name: ADI -- Number of images: 208 + |2023-11-14|13:15:59.299| [INFO] Class ID: 5 -- Class Name: MUC -- Number of images: 178 + |2023-11-14|13:15:59.299| [INFO] Class ID: 6 -- Class Name: MUS -- Number of images: 270 + |2023-11-14|13:15:59.299| [INFO] Class ID: 7 -- Class Name: STR -- Number of images: 209 + |2023-11-14|13:15:59.299| [INFO] Class ID: 8 -- Class Name: LYM -- Number of images: 232 + |2023-11-14|13:15:59.299| [INFO] Total number of patches: 2000 + + + +As you can see for this patch dataset, we have 9 classes/labels with IDs +0-8 and associated class names describing the dominant tissue type in +the patch: + +- BACK ⟶ Background (empty glass region) +- LYM ⟶ Lymphocytes +- NORM ⟶ Normal colon mucosa +- DEB ⟶ Debris +- MUS ⟶ Smooth muscle +- STR ⟶ Cancer-associated stroma +- ADI ⟶ Adipose +- MUC ⟶ Mucus +- TUM ⟶ Colorectal adenocarcinoma epithelium + + + +Classify image patches +---------------------- + +We demonstrate how to obtain a prediction for each patch within a +digital slide first with the ``patch`` mode and then with a large slide +using ``wsi`` mode. + + +Define ``PatchPredictor`` model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The PatchPredictor class runs a CNN-based classifier written in PyTorch. + +- ``model`` can be any trained PyTorch model with the constraint that + it should follow the + ``tiatoolbox.models.abc.ModelABC`` `(docs)` `__ + class structure. For more information on this matter, please refer to + `our example notebook on advanced model + techniques `__.
+ In order to load a custom model, you need to write a small + preprocessing function, as in ``preproc_func(img)``, which makes sure + the input tensors are in the right format for the loaded network. +- Alternatively, you can pass ``pretrained_model`` as a string + argument. This specifies the CNN model that performs the prediction, + and it must be one of the models listed + `here `__. + The command will look like this: + ``predictor = PatchPredictor(pretrained_model='resnet18-kather100k', pretrained_weights=weights_path, batch_size=32)``. +- ``pretrained_weights``: When using a ``pretrained_model``, the + corresponding pretrained weights will also be downloaded by default. + You can override the default with your own set of weights via the + ``pretrained_weights`` argument. +- ``batch_size``: Number of images fed into the model each time. Higher + values for this parameter require a larger (GPU) memory capacity. + + + +.. code-block:: python + + + # Importing a pretrained PyTorch model from TIAToolbox + predictor = PatchPredictor(pretrained_model='resnet18-kather100k', batch_size=32) + + # Users can load any PyTorch model architecture instead using the following script + model = vanilla.CNNModel(backbone="resnet18", num_classes=9) # Importing model from torchvision.models.resnet18 + model.load_state_dict(torch.load(weights_path, map_location="cpu", weights_only=True), strict=True) + def preproc_func(img): + img = PIL.Image.fromarray(img) + img = transforms.ToTensor()(img) + return img.permute(1, 2, 0) + model.preproc_func = preproc_func + predictor = PatchPredictor(model=model, batch_size=32) + + + +Predict patch labels +~~~~~~~~~~~~~~~~~~~~ + +We create a predictor object and then call the ``predict`` method using +the ``patch`` mode. We then compute the classification accuracy and +confusion matrix. + + + +..
code-block:: python + + + with suppress_console_output(): + output = predictor.predict(imgs=patch_list, mode="patch", on_gpu=ON_GPU) + + acc = accuracy_score(label_list, output["predictions"]) + logger.info("Classification accuracy: %f", acc) + + # Creating and visualizing the confusion matrix for patch classification results + conf = confusion_matrix(label_list, output["predictions"], normalize="true") + df_cm = pd.DataFrame(conf, index=class_names, columns=class_names) + df_cm + + + + + + +.. rst-class:: sphx-glr-script-out + + .. code-block:: none + + |2023-11-14|13:16:03.215| [INFO] Classification accuracy: 0.993000 + + +.. raw:: html + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    BACKNORMDEBTUMADIMUCMUSSTRLYM
    BACK1.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00000
    NORM0.0000000.9886360.0000000.0113640.0000000.0000000.0000000.0000000.00000
    DEB0.0000000.0000000.9913040.0000000.0000000.0000000.0000000.0086960.00000
    TUM0.0000000.0000000.0000000.9965030.0000000.0034970.0000000.0000000.00000
    ADI0.0048080.0000000.0000000.0000000.9903850.0000000.0048080.0000000.00000
    MUC0.0000000.0000000.0000000.0000000.0000000.9887640.0000000.0112360.00000
    MUS0.0000000.0000000.0000000.0000000.0000000.0000000.9962960.0037040.00000
    STR0.0000000.0000000.0047850.0000000.0000000.0047850.0047850.9856460.00000
    LYM0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0043100.99569
    +
    +
    +
    +
    + + +Predict patch labels for a whole slide +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We now introduce ``IOPatchPredictorConfig``, a class that specifies the +configuration of image reading and prediction writing for the model +prediction engine. This is required to inform the classifier which level +of the WSI pyramid the classifier should read, process data and generate +output. + +Parameters of ``IOPatchPredictorConfig`` are defined as: + +- ``input_resolutions``: A list, in the form of a dictionary, + specifying the resolution of each input. List elements must be in the + same order as in the target ``model.forward()``. If your model + accepts only one input, you just need to put one dictionary + specifying ``'units'`` and ``'resolution'``. Note that TIAToolbox + supports a model with more than one input. For more information on + units and resolution, please see `TIAToolbox + documentation `__. +- ``patch_input_shape``: Shape of the largest input in (height, width) + format. +- ``stride_shape``: The size of a stride (steps) between two + consecutive patches, used in the patch extraction process. If the + user sets ``stride_shape`` equal to ``patch_input_shape``, patches + will be extracted and processed without any overlap. + + + +.. code-block:: python + + + wsi_ioconfig = IOPatchPredictorConfig( + input_resolutions=[{"units": "mpp", "resolution": 0.5}], + patch_input_shape=[224, 224], + stride_shape=[224, 224], + ) + + + +The ``predict`` method applies the CNN on the input patches and gets the +results. Here are the arguments and their descriptions: + +- ``mode``: Type of input to be processed. Choose from ``patch``, + ``tile`` or ``wsi`` according to your application. +- ``imgs``: List of inputs, which should be a list of paths to the + input tiles or WSIs. +- ``return_probabilities``: Set to **True** to get per class + probabilities alongside predicted labels of input patches.
If you + wish to merge the predictions to generate prediction maps for + ``tile`` or ``wsi`` modes, you can set ``return_probabilities=True``. +- ``ioconfig``: set the IO configuration information using the + ``IOPatchPredictorConfig`` class. +- ``resolution`` and ``unit`` (not shown below): These arguments + specify the level or micron-per-pixel resolution of the WSI levels + from which we plan to extract patches and can be used instead of + ``ioconfig``. Here we specify the WSI level as ``'baseline'``, + which is equivalent to level 0. In general, this is the level of + greatest resolution. In this particular case, the image has only one + level. More information can be found in the + `documentation `__. +- ``masks``: A list of paths corresponding to the masks of WSIs in the + ``imgs`` list. These masks specify the regions in the original WSIs + from which we want to extract patches. If the mask of a particular + WSI is specified as ``None``, then the labels for all patches of that + WSI (even background regions) would be predicted. This could cause + unnecessary computation. +- ``merge_predictions``: You can set this parameter to ``True`` if it’s + required to generate a 2D map of patch classification results. + However, for large WSIs this will require large available memory. An + alternative (default) solution is to set ``merge_predictions=False``, + and then generate the 2D prediction maps using the + ``merge_predictions`` function as you will see later on. + +Since we are using a large WSI the patch extraction and prediction +processes may take some time (make sure to set the ``ON_GPU=True`` if +you have access to Cuda enabled GPU and PyTorch+Cuda). + + + +.. 
code-block:: python + + + with suppress_console_output(): + wsi_output = predictor.predict( + imgs=[wsi_path], + masks=None, + mode="wsi", + merge_predictions=False, + ioconfig=wsi_ioconfig, + return_probabilities=True, + save_dir=global_save_dir / "wsi_predictions", + on_gpu=ON_GPU, + ) + + + + +We see how the prediction model works on our whole-slide images by +visualizing the ``wsi_output``. We first need to merge patch prediction +outputs and then visualize them as an overlay on the original image. As +before, the ``merge_predictions`` method is used to merge the patch +predictions. Here we set the parameters +``resolution=1.25, units='power'`` to generate the prediction map at +1.25x magnification. If you would like to have higher/lower resolution +(bigger/smaller) prediction maps, you need to change these parameters +accordingly. When the predictions are merged, use the +``overlay_patch_prediction`` function to overlay the prediction map on +the WSI thumbnail, which should be extracted at the resolution used for +prediction merging. + + +.. code-block:: python + + + overview_resolution = ( + 4 # the resolution in which we desire to merge and visualize the patch predictions + ) + # the unit of the `resolution` parameter. Can be "power", "level", "mpp", or "baseline" + overview_unit = "mpp" + wsi = WSIReader.open(wsi_path) + wsi_overview = wsi.slide_thumbnail(resolution=overview_resolution, units=overview_unit) + plt.figure(), plt.imshow(wsi_overview) + plt.axis("off") + + + + + +.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png + :alt: tiatoolbox tutorial + :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_002.png + :class: sphx-glr-single-img + + + +Overlaying the prediction map on this image as below gives: + + + +.. 
code-block:: python + + + # Visualization of whole-slide image patch-level prediction + # first set up a label to color mapping + label_color_dict = {} + label_color_dict[0] = ("empty", (0, 0, 0)) + colors = cm.get_cmap("Set1").colors + for class_name, label in label_dict.items(): + label_color_dict[label + 1] = (class_name, 255 * np.array(colors[label])) + + pred_map = predictor.merge_predictions( + wsi_path, + wsi_output[0], + resolution=overview_resolution, + units=overview_unit, + ) + overlay = overlay_prediction_mask( + wsi_overview, + pred_map, + alpha=0.5, + label_info=label_color_dict, + return_ax=True, + ) + plt.show() + + + + + +.. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png + :alt: tiatoolbox tutorial + :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_003.png + :class: sphx-glr-single-img + + + +Feature extraction with a pathology-specific model +-------------------------------------------------- + +In this section, we will show how to extract features from a pretrained +PyTorch model that exists outside TIAToolbox, using the WSI inference +engines provided by TIAToolbox. To illustrate this we will use +HistoEncoder, a computational-pathology specific model that has been +trained in a self-supervised fashion to extract features from histology +images. The model has been made available here: + +‘HistoEncoder: Foundation models for digital pathology’ +(https://github.com/jopo666/HistoEncoder) by Pohjonen, Joona and team at +the University of Helsinki. + +We will plot a umap reduction into 3D (RGB) of the feature map to +visualize how the features capture the differences between some of the +above mentioned tissue types. + + + +.. 
code-block:: python + + + # Import some extra modules + import histoencoder.functional as F + import torch.nn as nn + + from tiatoolbox.models.engine.semantic_segmentor import DeepFeatureExtractor, IOSegmentorConfig + from tiatoolbox.models.models_abc import ModelABC + import umap + + + +TIAToolbox defines a ModelABC which is a class inheriting PyTorch +`nn.Module `__ +and specifies how a model should look in order to be used in the +TIAToolbox inference engines. The histoencoder model doesn’t follow this +structure, so we need to wrap it in a class whose output and methods are +those that the TIAToolbox engine expects. + + + +.. code-block:: python + + + class HistoEncWrapper(ModelABC): + """Wrapper for HistoEnc model that conforms to tiatoolbox ModelABC interface.""" + + def __init__(self: HistoEncWrapper, encoder) -> None: + super().__init__() + self.feat_extract = encoder + + def forward(self: HistoEncWrapper, imgs: torch.Tensor) -> torch.Tensor: + """Pass input data through the model. + + Args: + imgs (torch.Tensor): + Model input. + + """ + out = F.extract_features(self.feat_extract, imgs, num_blocks=2, avg_pool=True) + return out + + @staticmethod + def infer_batch( + model: nn.Module, + batch_data: torch.Tensor, + *, + on_gpu: bool, + ) -> list[np.ndarray]: + """Run inference on an input batch. + + Contains logic for forward operation as well as i/o aggregation. + + Args: + model (nn.Module): + PyTorch defined model. + batch_data (torch.Tensor): + A batch of data generated by + `torch.utils.data.DataLoader`. + on_gpu (bool): + Whether to run inference on a GPU. 
+ + """ + img_patches_device = batch_data.to('cuda') if on_gpu else batch_data + model.eval() + # Do not compute the gradient (not training) + with torch.inference_mode(): + output = model(img_patches_device) + return [output.cpu().numpy()] + + + + +Now that we have our wrapper, we will create our feature extraction +model and instantiate a +`DeepFeatureExtractor `__ +to allow us to use this model over a WSI. We will use the same WSI as +above, but this time we will extract features from the patches of the +WSI using the HistoEncoder model, rather than predicting some label for +each patch. + + + +.. code-block:: python + + + # create the model + encoder = F.create_encoder("prostate_medium") + model = HistoEncWrapper(encoder) + + # set the pre-processing function + norm=transforms.Normalize(mean=[0.662, 0.446, 0.605],std=[0.169, 0.190, 0.155]) + trans = [ + transforms.ToTensor(), + norm, + ] + model.preproc_func = transforms.Compose(trans) + + wsi_ioconfig = IOSegmentorConfig( + input_resolutions=[{"units": "mpp", "resolution": 0.5}], + patch_input_shape=[224, 224], + output_resolutions=[{"units": "mpp", "resolution": 0.5}], + patch_output_shape=[224, 224], + stride_shape=[224, 224], + ) + + + +When we create the ``DeepFeatureExtractor``, we will pass the +``auto_generate_mask=True`` argument. This will automatically create a +mask of the tissue region using otsu thresholding, so that the extractor +processes only those patches containing tissue. + + + +.. 
code-block:: python + + + # create the feature extractor and run it on the WSI + extractor = DeepFeatureExtractor(model=model, auto_generate_mask=True, batch_size=32, num_loader_workers=4, num_postproc_workers=4) + with suppress_console_output(): + out = extractor.predict(imgs=[wsi_path], mode="wsi", ioconfig=wsi_ioconfig, save_dir=global_save_dir / "wsi_features",) + + + + +These features could be used to train a downstream model, but here in +order to get some intuition for what the features represent, we will use +a UMAP reduction to visualize the features in RGB space. The points +labeled in a similar color should have similar features, so we can check +if the features naturally separate out into the different tissue regions +when we overlay the UMAP reduction on the WSI thumbnail. We will plot it +along with the patch-level prediction map from above to see how the +features compare to the patch-level predictions in the following cells. + + + +.. code-block:: python + + + # First we define a function to calculate the umap reduction + def umap_reducer(x, dims=3, nns=10): + """UMAP reduction of the input data.""" + reducer = umap.UMAP(n_neighbors=nns, n_components=dims, metric="manhattan", spread=0.5, random_state=2) + reduced = reducer.fit_transform(x) + reduced -= reduced.min(axis=0) + reduced /= reduced.max(axis=0) + return reduced + + # load the features output by our feature extractor + pos = np.load(global_save_dir / "wsi_features" / "0.position.npy") + feats = np.load(global_save_dir / "wsi_features" / "0.features.0.npy") + pos = pos / 8 # as we extracted at 0.5mpp, and we are overlaying on a thumbnail at 4mpp + + # reduce the features into 3 dimensional (rgb) space + reduced = umap_reducer(feats) + + # plot the prediction map the classifier again + overlay = overlay_prediction_mask( + wsi_overview, + pred_map, + alpha=0.5, + label_info=label_color_dict, + return_ax=True, + ) + + # plot the feature map reduction + plt.figure() + plt.imshow(wsi_overview) + 
plt.scatter(pos[:,0], pos[:,1], c=reduced, s=1, alpha=0.5) + plt.axis("off") + plt.title("UMAP reduction of HistoEnc features") + plt.show() + + + + + +.. rst-class:: sphx-glr-horizontal + + + * + + .. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png + :alt: tiatoolbox tutorial + :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_004.png + :class: sphx-glr-multi-img + + * + + .. image-sg:: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png + :alt: UMAP reduction of HistoEnc features + :srcset: ../_static/img/tiatoolbox_tutorial/tiatoolbox_tutorial_005.png + :class: sphx-glr-multi-img + + + + +We see that the prediction map from our patch-level predictor, and the +feature map from our self-supervised feature encoder, capture similar +information about the tissue types in the WSI. This is a good sanity +check that our models are working as expected. It also shows that the +features extracted by the HistoEncoder model are capturing the +differences between the tissue types, and so that they are encoding +histologically relevant information. + + +Where to Go From Here +--------------------- + +In this notebook, we show how we can use the ``PatchPredictor`` and +``DeepFeatureExtractor`` classes and their ``predict`` method to predict +the label, or extract features, for patches of big tiles and WSIs. We +introduce ``merge_predictions`` and ``overlay_prediction_mask`` helper +functions that merge the patch prediction outputs and visualize the +resulting prediction map as an overlay on the input image/WSI. + +All the processes take place within TIAToolbox and we can easily put the +pieces together, following our example code. Please make sure to set +inputs and options correctly. We encourage you to further investigate +the effect on the prediction output of changing ``predict`` function +parameters. 
We have demonstrated how to use your own pretrained model or +one provided by the research community for a specific task in the +TIAToolbox framework to do inference on large WSIs even if the model +structure is not defined in the TIAToolbox model class. + +You can learn more through the following resources: + +- `Advanced model handling with PyTorch and + TIAToolbox `__ +- `Creating slide graphs for WSI with a custom PyTorch graph neural + network `__ + diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py index aff14f19674..67b055d9ff2 100644 --- a/intermediate_source/torch_compile_tutorial.py +++ b/intermediate_source/torch_compile_tutorial.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- """ -torch.compile Tutorial -================ +Introduction to ``torch.compile`` +================================= **Author:** William Wen """ @@ -20,11 +20,8 @@ # # **Contents** # -# - Basic Usage -# - Demonstrating Speedups -# - Comparison to TorchScript and FX Tracing -# - TorchDynamo and FX Graphs -# - Conclusion +# .. contents:: +# :local: # # **Required pip Dependencies** # @@ -57,7 +54,7 @@ # Basic Usage # ------------ # -# ``torch.compile`` is included in the latest PyTorch.. +# ``torch.compile`` is included in the latest PyTorch. # Running TorchInductor on GPU requires Triton, which is included with the PyTorch 2.0 nightly # binary. If Triton is still missing, try installing ``torchtriton`` via pip # (``pip install torchtriton --extra-index-url "https://download.pytorch.org/whl/nightly/cu117"`` @@ -76,17 +73,21 @@ def foo(x, y): ###################################################################### # Alternatively, we can decorate the function. 
+t1 = torch.randn(10, 10) +t2 = torch.randn(10, 10) @torch.compile def opt_foo2(x, y): a = torch.sin(x) b = torch.cos(y) return a + b -print(opt_foo2(torch.randn(10, 10), torch.randn(10, 10))) +print(opt_foo2(t1, t2)) ###################################################################### # We can also optimize ``torch.nn.Module`` instances. +t = torch.randn(10, 100) + class MyModule(torch.nn.Module): def __init__(self): super().__init__() @@ -97,7 +98,101 @@ def forward(self, x): mod = MyModule() opt_mod = torch.compile(mod) -print(opt_mod(torch.randn(10, 100))) +print(opt_mod(t)) + +###################################################################### +# torch.compile and Nested Calls +# ------------------------------ +# Nested function calls within the decorated function will also be compiled. + +def nested_function(x): + return torch.sin(x) + +@torch.compile +def outer_function(x, y): + a = nested_function(x) + b = torch.cos(y) + return a + b + +print(outer_function(t1, t2)) + +###################################################################### +# In the same fashion, when compiling a module, all sub-modules and methods +# within it that are not in a skip list are also compiled. + +class OuterModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.inner_module = MyModule() + self.outer_lin = torch.nn.Linear(10, 2) + + def forward(self, x): + x = self.inner_module(x) + return torch.nn.functional.relu(self.outer_lin(x)) + +outer_mod = OuterModule() +opt_outer_mod = torch.compile(outer_mod) +print(opt_outer_mod(t)) + +###################################################################### +# We can also disable some functions from being compiled by using +# ``torch.compiler.disable``. Suppose you want to disable the tracing on just +# the ``complex_function`` function, but want to continue the tracing back in +# ``complex_conjugate``. In this case, you can use the +# ``torch.compiler.disable(recursive=False)`` option.
Otherwise, the default is +# ``recursive=True``. + +def complex_conjugate(z): + return torch.conj(z) + +@torch.compiler.disable(recursive=False) +def complex_function(real, imag): + # Assuming this function causes problems in the compilation + z = torch.complex(real, imag) + return complex_conjugate(z) + +def outer_function(): + real = torch.tensor([2, 3], dtype=torch.float32) + imag = torch.tensor([4, 5], dtype=torch.float32) + z = complex_function(real, imag) + return torch.abs(z) + +# Try to compile the outer_function +try: + opt_outer_function = torch.compile(outer_function) + print(opt_outer_function()) +except Exception as e: + print("Compilation of outer_function failed:", e) + +###################################################################### +# Best Practices and Recommendations +# ---------------------------------- +# +# Behavior of ``torch.compile`` with Nested Modules and Function Calls +# +# When you use ``torch.compile``, the compiler will try to recursively compile +# every function call inside the target function or module +# that is not in a skip list (such as built-ins, some functions in +# the ``torch.*`` namespace). +# +# **Best Practices:** +# +# 1. **Top-Level Compilation:** One approach is to compile at the highest level +# possible (i.e., when the top-level module is initialized/called) and +# selectively disable compilation when encountering excessive graph breaks or +# errors. If there are still many compile issues, compile individual +# subcomponents instead. +# +# 2. **Modular Testing:** Test individual functions and modules with ``torch.compile`` +# before integrating them into larger models to isolate potential issues. +# +# 3. **Disable Compilation Selectively:** If certain functions or sub-modules +# cannot be handled by ``torch.compile``, use the ``torch.compiler.disable`` context +# managers to recursively exclude them from compilation. +# +# 4.
**Compile Leaf Functions First:** In complex models with multiple nested +# functions and modules, start by compiling the leaf functions or modules first. +# For more information see `TorchDynamo APIs for fine-grained tracing `__. ###################################################################### # Demonstrating Speedups @@ -138,24 +233,21 @@ def init_model(): ###################################################################### # First, let's compare inference. # -# Note that in the call to ``torch.compile``, we have have the additional +# Note that in the call to ``torch.compile``, we have the additional # ``mode`` argument, which we will discuss below. -def evaluate(mod, inp): - with torch.no_grad(): - return mod(inp) - model = init_model() # Reset since we are using a different mode. import torch._dynamo torch._dynamo.reset() -evaluate_opt = torch.compile(evaluate, mode="reduce-overhead") +model_opt = torch.compile(model, mode="reduce-overhead") inp = generate_data(16)[0] -print("eager:", timed(lambda: evaluate(model, inp))[1]) -print("compile:", timed(lambda: evaluate_opt(model, inp))[1]) +with torch.no_grad(): + print("eager:", timed(lambda: model(inp))[1]) + print("compile:", timed(lambda: model_opt(inp))[1]) ###################################################################### # Notice that ``torch.compile`` takes a lot longer to complete @@ -168,7 +260,8 @@ def evaluate(mod, inp): eager_times = [] for i in range(N_ITERS): inp = generate_data(16)[0] - _, eager_time = timed(lambda: evaluate(model, inp)) + with torch.no_grad(): + _, eager_time = timed(lambda: model(inp)) eager_times.append(eager_time) print(f"eager eval time {i}: {eager_time}") @@ -177,7 +270,8 @@ def evaluate(mod, inp): compile_times = [] for i in range(N_ITERS): inp = generate_data(16)[0] - _, compile_time = timed(lambda: evaluate_opt(model, inp)) + with torch.no_grad(): + _, compile_time = timed(lambda: model_opt(inp)) compile_times.append(compile_time) print(f"compile eval time {i}: 
{compile_time}") print("~" * 10) @@ -186,6 +280,7 @@ def evaluate(mod, inp): eager_med = np.median(eager_times) compile_med = np.median(compile_times) speedup = eager_med / compile_med +assert(speedup > 1) print(f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x") print("~" * 10) @@ -198,11 +293,15 @@ def evaluate(mod, inp): # GPU compute and the observed speedup may be less significant. # # You may also see different speedup results depending on the chosen ``mode`` -# argument. Since our model and data are small, we want to reduce overhead as -# much as possible, and so we chose ``"reduce-overhead"``. For your own models, +# argument. The ``"reduce-overhead"`` mode uses CUDA graphs to further reduce +# the overhead of Python. For your own models, # you may need to experiment with different modes to maximize speedup. You can # read more about modes `here `__. # +# You might also notice that the second time we run our model with ``torch.compile`` is significantly +# slower than the other runs, although it is much faster than the first run. This is because the ``"reduce-overhead"`` +# mode runs a few warm-up iterations for CUDA graphs. +# # For general PyTorch benchmarking, you can try using ``torch.utils.benchmark`` instead of the ``timed`` # function we defined above. We wrote our own timing function in this tutorial to show # ``torch.compile``'s compilation latency. @@ -242,6 +341,7 @@ def train(mod, data): eager_med = np.median(eager_times) compile_med = np.median(compile_times) speedup = eager_med / compile_med +assert(speedup > 1) print(f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x") print("~" * 10) @@ -249,6 +349,10 @@ def train(mod, data): # Again, we can see that ``torch.compile`` takes longer in the first # iteration, as it must compile the model, but in subsequent iterations, we see # significant speedups compared to eager.
+# +# We remark that the speedup numbers presented in this tutorial are for +# demonstration purposes only. Official speedup values can be seen at the +# `TorchInductor performance dashboard `__. ###################################################################### # Comparison to TorchScript and FX Tracing @@ -485,19 +589,12 @@ def bar(a, b): print(opt_model(generate_data(16)[0])) ###################################################################### -# -# -# Finally, if we simply want TorchDynamo to output the FX graph for export, -# we can use ``torch._dynamo.export``. Note that ``torch._dynamo.export``, like -# ``fullgraph=True``, raises an error if TorchDynamo breaks the graph. - -try: - torch._dynamo.export(bar)(torch.randn(10), torch.randn(10)) -except: - tb.print_exc() - -model_exp = torch._dynamo.export(init_model())(generate_data(16)[0]) -print(model_exp[0](generate_data(16)[0])) +# We can use ``torch.export`` (from PyTorch 2.1+) to extract a single, exportable +# FX graph from the input PyTorch program. The exported graph is intended to be +# run on different (i.e. Python-less) environments. One important restriction +# is that the ``torch.export`` does not support graph breaks. Please check +# `this tutorial `__ +# for more details on ``torch.export``. ###################################################################### # Conclusion diff --git a/intermediate_source/torch_export_nightly_tutorial.rst b/intermediate_source/torch_export_nightly_tutorial.rst new file mode 100644 index 00000000000..e7ef2e88153 --- /dev/null +++ b/intermediate_source/torch_export_nightly_tutorial.rst @@ -0,0 +1,10 @@ +torch.export Nightly Tutorial +============================= + +This tutorial has been moved to https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html + +It will redirect in 3 seconds. + +.. 
raw:: html + + diff --git a/intermediate_source/torch_export_tutorial.py b/intermediate_source/torch_export_tutorial.py index 20f6e100684..c992eefa9fc 100644 --- a/intermediate_source/torch_export_tutorial.py +++ b/intermediate_source/torch_export_tutorial.py @@ -2,8 +2,8 @@ """ torch.export Tutorial -================ -**Author:** William Wen, Zhengxu Chen +=================================================== +**Author:** William Wen, Zhengxu Chen, Angela Yi, Pian Pawakapan """ ###################################################################### @@ -11,11 +11,12 @@ # .. warning:: # # ``torch.export`` and its related features are in prototype status and are subject to backwards compatibility -# breaking changes. This tutorial provides a snapshot of ``torch.export`` usage as of PyTorch 2.1. +# breaking changes. This tutorial provides a snapshot of ``torch.export`` usage as of PyTorch 2.5. # # :func:`torch.export` is the PyTorch 2.X way to export PyTorch models into # standardized model representations, intended -# to be run on different (i.e. Python-less) environments. +# to be run on different (i.e. Python-less) environments. The official +# documentation can be found `here `__. # # In this tutorial, you will learn how to use :func:`torch.export` to extract # ``ExportedProgram``'s (i.e. single-graph representations) from PyTorch programs. @@ -33,24 +34,29 @@ # # ``torch.export`` extracts single-graph representations from PyTorch programs # by tracing the target function, given example inputs. +# ``torch.export.export()`` is the main entry point for ``torch.export``. # -# The signature of ``torch.export`` is: +# In this tutorial, ``torch.export`` and ``torch.export.export()`` are practically synonymous, +# though ``torch.export`` generally refers to the PyTorch 2.X export process, and ``torch.export.export()`` +# generally refers to the actual function call. # -# .. code:: python +# The signature of ``torch.export.export()`` is: +# +# .. 
code-block:: python # # export( # f: Callable, # args: Tuple[Any, ...], # kwargs: Optional[Dict[str, Any]] = None, # *, -# constraints: Optional[List[Constraint]] = None +# dynamic_shapes: Optional[Dict[str, Dict[int, Dim]]] = None # ) -> ExportedProgram # -# ``torch.export`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` +# ``torch.export.export()`` traces the tensor computation graph from calling ``f(*args, **kwargs)`` # and wraps it in an ``ExportedProgram``, which can be serialized or executed later with # different inputs. Note that while the output ``ExportedGraph`` is callable and can be # called in the same way as the original input callable, it is not a ``torch.nn.Module``. -# We will detail the ``constraints`` argument later in the tutorial. +# We will detail the ``dynamic_shapes`` argument later in the tutorial. import torch from torch.export import export @@ -66,7 +72,8 @@ def forward(self, x, y): mod = MyModule() exported_mod = export(mod, (torch.randn(8, 100), torch.randn(8, 100))) print(type(exported_mod)) -print(exported_mod(torch.randn(8, 100), torch.randn(8, 100))) +print(exported_mod.module()(torch.randn(8, 100), torch.randn(8, 100))) + ###################################################################### # Let's review some attributes of ``ExportedProgram`` that are of interest. @@ -94,7 +101,7 @@ def forward(self, x, y): # Other attributes of interest in ``ExportedProgram`` include: # # - ``graph_signature`` -- the inputs, outputs, parameters, buffers, etc. of the exported graph. 
-# - ``range_constraints`` and ``equality_constraints`` -- constraints, covered later +# - ``range_constraints`` -- constraints, covered later print(exported_mod.graph_signature) @@ -107,70 +114,135 @@ def forward(self, x, y): # ------------ # # Although ``torch.export`` shares components with ``torch.compile``, -# the key limitation of ``torch.export``, especially when compared to ``torch.compile``, is that it does not -# support graph breaks. This is because handling graph breaks involves interpreting -# the unsupported operation with default Python evaluation, which is incompatible -# with the export use case. Therefore, in order to make your model code compatible -# with ``torch.export``, you will need to modify your code to remove graph breaks. +# the key limitation of ``torch.export``, especially when compared to +# ``torch.compile``, is that it does not support graph breaks. This is because +# handling graph breaks involves interpreting the unsupported operation with +# default Python evaluation, which is incompatible with the export use case. +# Therefore, in order to make your model code compatible with ``torch.export``, +# you will need to modify your code to remove graph breaks. 
# # A graph break is necessary in cases such as: # # - data-dependent control flow -def bad1(x): - if x.sum() > 0: - return torch.sin(x) - return torch.cos(x) +class Bad1(torch.nn.Module): + def forward(self, x): + if x.sum() > 0: + return torch.sin(x) + return torch.cos(x) import traceback as tb try: - export(bad1, (torch.randn(3, 3),)) + export(Bad1(), (torch.randn(3, 3),)) except Exception: tb.print_exc() ###################################################################### # - accessing tensor data with ``.data`` -def bad2(x): - x.data[0, 0] = 3 - return x +class Bad2(torch.nn.Module): + def forward(self, x): + x.data[0, 0] = 3 + return x try: - export(bad2, (torch.randn(3, 3),)) + export(Bad2(), (torch.randn(3, 3),)) except Exception: tb.print_exc() ###################################################################### # - calling unsupported functions (such as many built-in functions) -def bad3(x): - x = x + 1 - return x + id(x) +class Bad3(torch.nn.Module): + def forward(self, x): + x = x + 1 + return x + id(x) try: - export(bad3, (torch.randn(3, 3),)) + export(Bad3(), (torch.randn(3, 3),)) except Exception: tb.print_exc() ###################################################################### # - unsupported Python language features (e.g. throwing exceptions, match statements) -def bad4(x): - try: - x = x + 1 - raise RuntimeError("bad") - except: - x = x + 2 - return x +class Bad4(torch.nn.Module): + def forward(self, x): + try: + x = x + 1 + raise RuntimeError("bad") + except: + x = x + 2 + return x try: - export(bad4, (torch.randn(3, 3),)) + export(Bad4(), (torch.randn(3, 3),)) except Exception: tb.print_exc() ###################################################################### -# The sections below demonstrate some ways you can modify your code -# in order to remove graph breaks. 
+# Non-Strict Export +# ----------------- +# +# To trace the program, ``torch.export`` uses TorchDynamo, a byte code analysis +# engine, to symbolically analyze the Python code and build a graph based on the +# results. This analysis allows ``torch.export`` to provide stronger guarantees +# about safety, but not all Python code is supported, causing these graph +# breaks. +# +# To address this issue, in PyTorch 2.3, we introduced a new mode of +# exporting called non-strict mode, where we trace through the program using the +# Python interpreter executing it exactly as it would in eager mode, allowing us +# to skip over unsupported Python features. This is done through adding a +# ``strict=False`` flag. +# +# Looking at some of the previous examples which resulted in graph breaks: +# +# - Accessing tensor data with ``.data`` now works correctly + +class Bad2(torch.nn.Module): + def forward(self, x): + x.data[0, 0] = 3 + return x + +bad2_nonstrict = export(Bad2(), (torch.randn(3, 3),), strict=False) +print(bad2_nonstrict.module()(torch.ones(3, 3))) + +###################################################################### +# - Calling unsupported functions (such as many built-in functions) traces +# through, but in this case, ``id(x)`` gets specialized as a constant integer in +# the graph. This is because ``id(x)`` is not a tensor operation, so the +# operation is not recorded in the graph. + +class Bad3(torch.nn.Module): + def forward(self, x): + x = x + 1 + return x + id(x) + +bad3_nonstrict = export(Bad3(), (torch.randn(3, 3),), strict=False) +print(bad3_nonstrict) +print(bad3_nonstrict.module()(torch.ones(3, 3))) + +###################################################################### +# - Unsupported Python language features (such as throwing exceptions, match +# statements) now also get traced through. 
+ +class Bad4(torch.nn.Module): + def forward(self, x): + try: + x = x + 1 + raise RuntimeError("bad") + except: + x = x + 2 + return x + +bad4_nonstrict = export(Bad4(), (torch.randn(3, 3),), strict=False) +print(bad4_nonstrict.module()(torch.ones(3, 3))) + + +###################################################################### +# However, there are still some features that require rewrites to the original +# module: ###################################################################### # Control Flow Ops @@ -179,22 +251,20 @@ def bad4(x): # ``torch.export`` actually does support data-dependent control flow. # But these need to be expressed using control flow ops. For example, # we can fix the control flow example above using the ``cond`` op, like so: -# -# .. -# [TODO] link to docs about ``cond`` when it is out from functorch.experimental.control_flow import cond -def bad1_fixed(x): - def true_fn(x): - return torch.sin(x) - def false_fn(x): - return torch.cos(x) - return cond(x.sum() > 0, true_fn, false_fn, [x]) +class Bad1Fixed(torch.nn.Module): + def forward(self, x): + def true_fn(x): + return torch.sin(x) + def false_fn(x): + return torch.cos(x) + return cond(x.sum() > 0, true_fn, false_fn, [x]) -exported_bad1_fixed = export(bad1_fixed, (torch.randn(3, 3),)) -print(exported_bad1_fixed(torch.ones(3, 3))) -print(exported_bad1_fixed(-torch.ones(3, 3))) +exported_bad1_fixed = export(Bad1Fixed(), (torch.randn(3, 3),)) +print(exported_bad1_fixed.module()(torch.ones(3, 3))) +print(exported_bad1_fixed.module()(-torch.ones(3, 3))) ###################################################################### # There are limitations to ``cond`` that one should be aware of: @@ -206,6 +276,8 @@ def false_fn(x): # - Branch functions cannot mutate input or global variables. # - Branch functions cannot access closure variables, except for ``self`` if the function is # defined in the scope of a method. +# +# For more details about ``cond``, check out the `cond documentation `__. 
###################################################################### # .. @@ -229,221 +301,518 @@ def false_fn(x): # print(exported_map_example(inp)) ###################################################################### -# Constraints -# ----------- +# Constraints/Dynamic Shapes +# -------------------------- # -# Ops can have different specializations/behaviors for different tensor shapes, so by default, -# ``torch.export`` requires inputs to ``ExportedProgram`` to have the same shape as the respective -# example inputs given to the initial ``torch.export`` call. -# If we try to run the ``ExportedProgram`` in the example below with a tensor -# with a different shape, we get an error: +# This section covers dynamic behavior and representation of exported programs. Dynamic behavior is +# subjective to the particular model being exported, so for the most part of this tutorial, we'll focus +# on this particular toy model (with the resulting tensor shapes annotated): -class MyModule2(torch.nn.Module): +class DynamicModel(torch.nn.Module): def __init__(self): super().__init__() - self.lin = torch.nn.Linear(100, 10) + self.l = torch.nn.Linear(5, 3) + + def forward( + self, + w: torch.Tensor, # [6, 5] + x: torch.Tensor, # [4] + y: torch.Tensor, # [8, 4] + z: torch.Tensor, # [32] + ): + x0 = x + y # [8, 4] + x1 = self.l(w) # [6, 3] + x2 = x0.flatten() # [32] + x3 = x2 + z # [32] + return x1, x3 - def forward(self, x, y): - return torch.nn.functional.relu(self.lin(x + y), inplace=True) +###################################################################### +# By default, ``torch.export`` produces a static program. One consequence of this is that at runtime, +# the program won't work on inputs with different shapes, even if they're valid in eager mode. 
+w = torch.randn(6, 5) +x = torch.randn(4) +y = torch.randn(8, 4) +z = torch.randn(32) +model = DynamicModel() +ep = export(model, (w, x, y, z)) +model(w, x, torch.randn(3, 4), torch.randn(12)) +ep.module()(w, x, torch.randn(3, 4), torch.randn(12)) -mod2 = MyModule2() -exported_mod2 = export(mod2, (torch.randn(8, 100), torch.randn(8, 100))) +###################################################################### +# Basic concepts: symbols and guards +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# To enable dynamism, ``export()`` provides a ``dynamic_shapes`` argument. The easiest way to work with +# dynamic shapes is using ``Dim.AUTO`` and looking at the program that's returned. Dynamic behavior is specified +# at an input dimension-level; for each input we can specify a tuple of values: -try: - exported_mod2(torch.randn(10, 100), torch.randn(10, 100)) -except Exception: - tb.print_exc() +from torch.export.dynamic_shapes import Dim + +dynamic_shapes = { + "w": (Dim.AUTO, Dim.AUTO), + "x": (Dim.AUTO,), + "y": (Dim.AUTO, Dim.AUTO), + "z": (Dim.AUTO,), +} +ep = export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes) ###################################################################### -# We can modify the ``torch.export`` call to -# relax some of these constraints. We use ``torch.export.dynamic_dim`` to -# express shape constraints manually. +# Before we look at the program that's produced, let's understand what specifying ``dynamic_shapes`` entails, +# and how that interacts with export. For every input dimension where a ``Dim`` object is specified, a symbol is +# `allocated `_, +# taking on a range of ``[2, inf]`` (why not ``[0, inf]`` or ``[1, inf]``? we'll explain later in the +# 0/1 specialization section). # -# .. -# [TODO] link to doc of dynamic_dim when it is available +# Export then runs model tracing, looking at each operation that's performed by the model. 
Each individual operation can emit +# what's called "guards"; basically boolean conditions that are required to be true for the program to be valid. +# When guards involve symbols allocated for input dimensions, the program contains restrictions on what input shapes are valid; +# i.e. the program's dynamic behavior. The symbolic shapes subsystem is the part responsible for taking in all the emitted guards +# and producing a final program representation that adheres to all of these guards. Before we see this "final representation" in +# an ``ExportedProgram``, let's look at the guards emitted by the toy model we're tracing. # -# Using ``dynamic_dim`` on a tensor's dimension marks it as dynamic (i.e. unconstrained), and -# we can provide additional upper and lower bound shape constraints. -# The first argument of ``dynamic_dim`` is the tensor variable we wish -# to specify a dimension constraint for. The second argument specifies -# the dimension of the first argument the constraint applies to. -# In the example below, our input -# ``inp1`` has an unconstrained first dimension, but the size of the second -# dimension must be in the interval (3, 18]. 
+# Here, each forward input tensor is annotated with the symbol allocated at the start of tracing: -from torch.export import dynamic_dim +class DynamicModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.l = torch.nn.Linear(5, 3) + + def forward( + self, + w: torch.Tensor, # [s0, s1] + x: torch.Tensor, # [s2] + y: torch.Tensor, # [s3, s4] + z: torch.Tensor, # [s5] + ): + x0 = x + y # guard: s2 == s4 + x1 = self.l(w) # guard: s1 == 5 + x2 = x0.flatten() # no guard added here + x3 = x2 + z # guard: s3 * s4 == s5 + return x1, x3 -inp1 = torch.randn(10, 10) +###################################################################### +# Let's understand each of the operations and the emitted guards: +# +# - ``x0 = x + y``: This is an element-wise add with broadcasting, since ``x`` is a 1-d tensor and ``y`` a 2-d tensor. ``x`` is broadcasted along the last dimension of ``y``, emitting the guard ``s2 == s4``. +# - ``x1 = self.l(w)``: Calling ``nn.Linear()`` performs a matrix multiplication with model parameters. In export, parameters, buffers, and constants are considered program state, which is considered static, and so this is a matmul between a dynamic input (``w: [s0, s1]``), and a statically-shaped tensor. This emits the guard ``s1 == 5``. +# - ``x2 = x0.flatten()``: This call actually doesn't emit any guards! (at least none relevant to input shapes) +# - ``x3 = x2 + z``: ``x2`` has shape ``[s3*s4]`` after flattening, and this element-wise add emits ``s3 * s4 == s5``. +# +# Writing all of these guards down and summarizing is almost like a mathematical proof, which is what the symbolic shapes +# subsystem tries to do! 
In summary, we can conclude that the program must have the following input shapes to be valid: +# +# - ``w: [s0, 5]`` +# - ``x: [s2]`` +# - ``y: [s3, s2]`` +# - ``z: [s2*s3]`` +# +# And when we do finally print out the exported program to see our result, those shapes are what we see annotated on the +# corresponding inputs: -def constraints_example1(x): - x = x[:, 2:] - return torch.relu(x) +print(ep) -constraints1 = [ - dynamic_dim(inp1, 0), - 3 < dynamic_dim(inp1, 1), - dynamic_dim(inp1, 1) <= 18, -] +###################################################################### +# Another feature to notice is the range_constraints field above, which contains a valid range for each symbol. This isn't +# so interesting currently, since this export call doesn't emit any guards related to symbol bounds and each base symbol has +# a generic bound, but this will come up later. +# +# So far, because we've been exporting this toy model, this experience has not been representative of how hard +# it typically is to debug dynamic shapes guards & issues. In most cases it isn't obvious what guards are being emitted, +# and which operations and parts of user code are responsible. For this toy model we pinpoint the exact lines, and the guards +# are rather intuitive. +# +# In more complicated cases, a helpful first step is always to enable verbose logging. This can be done either with the environment +# variable ``TORCH_LOGS="+dynamic"``, or interactively with ``torch._logging.set_logs(dynamic=10)``: -exported_constraints_example1 = export(constraints_example1, (inp1,), constraints=constraints1) +torch._logging.set_logs(dynamic=10) +ep = export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes) -print(exported_constraints_example1(torch.randn(5, 5))) +###################################################################### +# This spits out quite a handful, even with this simple toy model. 
The log lines here have been cut short at front and end +# to ignore unnecessary info, but looking through the logs we can see the lines relevant to what we described above; +# e.g. the allocation of symbols: -try: - exported_constraints_example1(torch.randn(8, 1)) -except Exception: - tb.print_exc() +""" +create_symbol s0 = 6 for L['w'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s1 = 5 for L['w'].size()[1] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +runtime_assert True == True [statically known] +create_symbol s2 = 4 for L['x'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s3 = 8 for L['y'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s4 = 4 for L['y'].size()[1] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +create_symbol s5 = 32 for L['z'].size()[0] [2, int_oo] (_dynamo/variables/builder.py:2841 in ) +""" -try: - exported_constraints_example1(torch.randn(8, 20)) -except Exception: - tb.print_exc() +###################################################################### +# The lines with `create_symbol` show when a new symbol has been allocated, and the logs also identify the tensor variable names +# and dimensions they've been allocated for. 
In other lines we can also see the guards emitted: + +""" +runtime_assert Eq(s2, s4) [guard added] x0 = x + y # output shape: [8, 4] # dynamic_shapes_tutorial.py:16 in forward (_subclasses/fake_impls.py:845 in infer_size), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s2, s4)" +runtime_assert Eq(s1, 5) [guard added] x1 = self.l(w) # [6, 3] # dynamic_shapes_tutorial.py:17 in forward (_meta_registrations.py:2127 in meta_mm), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s1, 5)" +runtime_assert Eq(s2*s3, s5) [guard added] x3 = x2 + z # [32] # dynamic_shapes_tutorial.py:19 in forward (_subclasses/fake_impls.py:845 in infer_size), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s2*s3, s5)" +""" ###################################################################### -# Note that if our example inputs to ``torch.export`` do not satisfy the constraints, -# then we get an error. +# Next to the ``[guard added]`` messages, we also see the responsible user lines of code - luckily here the model is simple enough. +# In many real-world cases it's not so straightforward: high-level torch operations can have complicated fake-kernel implementations +# or operator decompositions that complicate where and what guards are emitted. In such cases the best way to dig deeper and investigate +# is to follow the logs' suggestion, and re-run with environment variable ``TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="..."``, to further +# attribute the guard of interest. +# +# ``Dim.AUTO`` is just one of the available options for interacting with ``dynamic_shapes``; as of writing this 2 other options are available: +# ``Dim.DYNAMIC``, and ``Dim.STATIC``. ``Dim.STATIC`` simply marks a dimension static, while ``Dim.DYNAMIC`` is similar to ``Dim.AUTO`` in all +# ways except one: it raises an error when specializing to a constant; this is designed to maintain dynamism. 
See for example what happens when a +# static guard is emitted on a dynamically-marked dimension: -constraints1_bad = [ - dynamic_dim(inp1, 0), - 10 < dynamic_dim(inp1, 1), - dynamic_dim(inp1, 1) <= 18, -] -try: - export(constraints_example1, (inp1,), constraints=constraints1_bad) -except Exception: - tb.print_exc() +dynamic_shapes["w"] = (Dim.AUTO, Dim.DYNAMIC) +export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes) ###################################################################### -# We can also use ``dynamic_dim`` to enforce expected equalities between -# dimensions, for example, in matrix multiplication: +# Static guards also aren't always inherent to the model; they can also come from user specifications. In fact, a common pitfall leading to shape +# specializations is when the user specifies conflicting markers for equivalent dimensions; one dynamic and another static. The same error type is +# raised when this is the case for ``x.shape[0]`` and ``y.shape[1]``: + +dynamic_shapes["w"] = (Dim.AUTO, Dim.AUTO) +dynamic_shapes["x"] = (Dim.STATIC,) +dynamic_shapes["y"] = (Dim.AUTO, Dim.DYNAMIC) +export(model, (w, x, y, z), dynamic_shapes=dynamic_shapes) -inp2 = torch.randn(4, 8) -inp3 = torch.randn(8, 2) +###################################################################### +# Here you might ask why export "specializes", i.e. why we resolve this static/dynamic conflict by going with the static route. The answer is because +# of the symbolic shapes system described above, of symbols and guards. When ``x.shape[0]`` is marked static, we don't allocate a symbol, and compile +# treating this shape as a concrete integer 4. A symbol is allocated for ``y.shape[1]``, and so we finally emit the guard ``s3 == 4``, leading to +# specialization. +# +# One feature of export is that during tracing, statements like asserts, ``torch._check()``, and ``if/else`` conditions will also emit guards. 
+# See what happens when we augment the existing model with such statements: -def constraints_example2(x, y): - return x @ y +class DynamicModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.l = torch.nn.Linear(5, 3) + + def forward(self, w, x, y, z): + assert w.shape[0] <= 512 + torch._check(x.shape[0] >= 16) + if w.shape[0] == x.shape[0] + 2: + x0 = x + y + x1 = self.l(w) + x2 = x0.flatten() + x3 = x2 + z + return x1, x3 + else: + return w + +dynamic_shapes = { + "w": (Dim.AUTO, Dim.AUTO), + "x": (Dim.AUTO,), + "y": (Dim.AUTO, Dim.AUTO), + "z": (Dim.AUTO,), +} +ep = export(DynamicModel(), (w, x, y, z), dynamic_shapes=dynamic_shapes) +print(ep) -constraints2 = [ - dynamic_dim(inp2, 0), - dynamic_dim(inp2, 1) == dynamic_dim(inp3, 0), - dynamic_dim(inp3, 1), -] +###################################################################### +# Each of these statements emits an additional guard, and the exported program shows the changes; ``s0`` is eliminated in favor of ``s2 + 2``, +# and ``s2`` now contains lower and upper bounds, reflected in ``range_constraints``. +# +# For the if/else condition, you might ask why the True branch was taken, and why it wasn't the ``w.shape[0] != x.shape[0] + 2`` guard that +# got emitted from tracing. The answer is that export is guided by the sample inputs provided by tracing, and specializes on the branches taken. +# If different sample input shapes were provided that fail the ``if`` condition, export would trace and emit guards corresponding to the ``else`` branch. +# Additionally, you might ask why we traced only the ``if`` branch, and if it's possible to maintain control-flow in your program and keep both branches +# alive. For that, refer to rewriting your model code following the ``Control Flow Ops`` section above. 
-exported_constraints_example2 = export(constraints_example2, (inp2, inp3), constraints=constraints2) +###################################################################### +# 0/1 specialization +# ^^^^^^^^^^^^^^^^^^ +# +# Since we're talking about guards and specializations, it's a good time to talk about the 0/1 specialization issue we brought up earlier. +# The bottom line is that export will specialize on sample input dimensions with value 0 or 1, because these shapes have trace-time properties that +# don't generalize to other shapes. For example, size 1 tensors can broadcast while other sizes fail; and size 0 ... . This just means that you should +# specify 0/1 sample inputs when you'd like your program to hardcode them, and non-0/1 sample inputs when dynamic behavior is desirable. See what happens +# at runtime when we export this linear layer: + +ep = export( + torch.nn.Linear(4, 3), + (torch.randn(1, 4),), + dynamic_shapes={ + "input": (Dim.AUTO, Dim.STATIC), + }, +) +ep.module()(torch.randn(2, 4)) -print(exported_constraints_example2(torch.randn(2, 16), torch.randn(16, 4))) +###################################################################### +# Named Dims +# ^^^^^^^^^^ +# +# So far we've only been talking about 3 ways to specify dynamic shapes: ``Dim.AUTO``, ``Dim.DYNAMIC``, and ``Dim.STATIC``. The attraction of these is the +# low-friction user experience; all the guards emitted during model tracing are adhered to, and dynamic behavior like min/max ranges, relations, and static/dynamic +# dimensions are automatically figured out underneath export. The dynamic shapes subsystem essentially acts as a "discovery" process, summarizing these guards +# and presenting what export believes is the overall dynamic behavior of the program. 
The drawback of this design appears once the user has stronger expectations or +# beliefs about the dynamic behavior of these models - maybe there is a strong desire on dynamism and specializations on particular dimensions are to be avoided at +# all costs, or maybe we just want to catch changes in dynamic behavior with changes to the original model code, or possibly underlying decompositions or meta-kernels. +# These changes won't be detected and the ``export()`` call will most likely succeed, unless tests are in place that check the resulting ``ExportedProgram`` representation. +# +# For such cases, our stance is to recommend the "traditional" way of specifying dynamic shapes, which longer-term users of export might be familiar with: named ``Dims``: -try: - exported_constraints_example2(torch.randn(4, 8), torch.randn(4, 2)) -except Exception: - tb.print_exc() +dx = Dim("dx", min=4, max=256) +dh = Dim("dh", max=512) +dynamic_shapes = { + "x": (dx, None), + "y": (2 * dx, dh), +} ###################################################################### -# We can actually use ``torch.export`` to guide us as to which constraints -# are necessary. We can do this by relaxing all constraints (recall that if we -# do not provide constraints for a dimension, the default behavior is to constrain -# to the exact shape value of the example input) and letting ``torch.export`` -# error out. +# This style of dynamic shapes allows the user to specify what symbols are allocated for input dimensions, min/max bounds on those symbols, and places restrictions on the +# dynamic behavior of the ``ExportedProgram`` produced; ``ConstraintViolation`` errors will be raised if model tracing emits guards that conflict with the relations or static/dynamic +# specifications given. For example, in the above specification, the following is asserted: +# +# - ``x.shape[0]`` is to have range ``[4, 256]``, and related to ``y.shape[0]`` by ``y.shape[0] == 2 * x.shape[0]``. +# - ``x.shape[1]`` is static. 
+# - ``y.shape[1]`` has range ``[2, 512]``, and is unrelated to any other dimension. +# +# In this design, we allow relations between dimensions to be specified with univariate linear expressions: ``A * dim + B`` can be specified for any dimension. This allows users +# to specify more complex constraints like integer divisibility for dynamic dimensions: -inp4 = torch.randn(8, 16) -inp5 = torch.randn(16, 32) +dx = Dim("dx", min=4, max=512) +dynamic_shapes = { + "x": (4 * dx, None) # x.shape[0] has range [16, 2048], and is divisible by 4. +} -def constraints_example3(x, y): - if x.shape[0] <= 16: - return x @ y[:, :16] - return y +###################################################################### +# Constraint violations, suggested fixes +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# One common issue with this specification style (before ``Dim.AUTO`` was introduced), is that the specification would often be mismatched with what was produced by model tracing. +# That would lead to ``ConstraintViolation`` errors and export suggested fixes - see for example with this model & specification, where the model inherently requires equality between +# dimensions 0 of ``x`` and ``y``, and requires dimension 1 to be static. -constraints3 = ( - [dynamic_dim(inp4, i) for i in range(inp4.dim())] + - [dynamic_dim(inp5, i) for i in range(inp5.dim())] +class Foo(torch.nn.Module): + def forward(self, x, y): + w = x + y + return w + torch.ones(4) + +dx, dy, d1 = torch.export.dims("dx", "dy", "d1") +ep = export( + Foo(), + (torch.randn(6, 4), torch.randn(6, 4)), + dynamic_shapes={ + "x": (dx, d1), + "y": (dy, d1), + }, ) -try: - export(constraints_example3, (inp4, inp5), constraints=constraints3) -except Exception: - tb.print_exc() +###################################################################### +# The expectation with suggested fixes is that the user can interactively copy-paste the changes into their dynamic shapes specification, and successfully export afterwards. 
+# +# Lastly, there are a couple of nice-to-knows about the options for specification: +# +# - ``None`` is a good option for static behavior: +# - ``dynamic_shapes=None`` (default) exports with the entire model being static. +# - specifying ``None`` at an input-level exports with all tensor dimensions static, and is also required for non-tensor inputs. +# - specifying ``None`` at a dimension-level specializes that dimension, though this is deprecated in favor of ``Dim.STATIC``. +# - specifying per-dimension integer values also produces static behavior, and will additionally check that the provided sample input matches the specification. +# +# These options are combined in the inputs & dynamic shapes spec below: + +inputs = ( + torch.randn(4, 4), + torch.randn(3, 3), + 16, + False, +) +dynamic_shapes = { + "tensor_0": (Dim.AUTO, None), + "tensor_1": None, + "int_val": None, + "bool_val": None, +} ###################################################################### -# We can see that the error message suggests to us to use some additional code -# to specify the necessary constraints. Let us use that code (exact code may differ slightly): +# Data-dependent errors +# --------------------- +# +# While trying to export models, you may have encountered errors like "Could not guard on data-dependent expression", or "Could not extract specialized integer from data-dependent expression". +# These errors exist because ``torch.export()`` compiles programs using FakeTensors, which symbolically represent their real tensor counterparts. While these have equivalent symbolic properties +# (e.g. sizes, strides, dtypes), they diverge in that FakeTensors do not contain any data values. While this avoids unnecessary memory usage and expensive computation, it does mean that export may be +# unable to out-of-the-box compile parts of user code where compilation relies on data values. 
In short, if the compiler requires a concrete, data-dependent value in order to proceed, it will error out, +# complaining that the value is not available. +# +# Data-dependent values appear in many places, and common sources are calls like ``item()``, ``tolist()``, or ``torch.unbind()`` that extract scalar values from tensors. +# How are these values represented in the exported program? In the `Constraints/Dynamic Shapes `_ +# section, we talked about allocating symbols to represent dynamic input dimensions. +# The same happens here: we allocate symbols for every data-dependent value that appears in the program. The important distinction is that these are "unbacked" symbols, +# in contrast to the "backed" symbols allocated for input dimensions. The `"backed/unbacked" `_ +# nomenclature refers to the presence/absence of a "hint" for the symbol: a concrete value backing the symbol, that can inform the compiler on how to proceed. +# +# In the input shape symbol case (backed symbols), these hints are simply the sample input shapes provided, which explains why control-flow branching is determined by the sample input properties. +# For data-dependent values, the symbols are taken from FakeTensor "data" during tracing, and so the compiler doesn't know the actual values (hints) that these symbols would take on. 
+# +# Let's see how these show up in exported programs: -def specify_constraints(x, y): - return [ - # x: - dynamic_dim(x, 0) <= 16, +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + b = y.tolist() + return b + [a] - # y: - 16 < dynamic_dim(y, 1), - dynamic_dim(y, 0) == dynamic_dim(x, 1), - ] +inps = ( + torch.tensor(1), + torch.tensor([2, 3]), +) +ep = export(Foo(), inps) +print(ep) -constraints3_fixed = specify_constraints(inp4, inp5) -exported_constraints_example3 = export(constraints_example3, (inp4, inp5), constraints=constraints3_fixed) -print(exported_constraints_example3(torch.randn(4, 32), torch.randn(32, 64))) +###################################################################### +# The result is that 3 unbacked symbols (notice they're prefixed with "u", instead of the usual "s" for input shape/backed symbols) are allocated and returned: +# 1 for the ``item()`` call, and 1 for each of the elements of ``y`` with the ``tolist()`` call. +# Note from the range constraints field that these take on ranges of ``[-int_oo, int_oo]``, not the default ``[0, int_oo]`` range allocated to input shape symbols, +# since we have no information on what these values are - they don't represent sizes, so don't necessarily have positive values. ###################################################################### -# Note that in the example above, because we constrained the value of ``x.shape[0]`` in -# ``constraints_example3``, the exported program is sound even though there is a -# raw ``if`` statement. +# Guards, torch._check() +# ^^^^^^^^^^^^^^^^^^^^^^ # -# If you want to see why ``torch.export`` generated these constraints, you can -# re-run the script with the environment variable ``TORCH_LOGS=dynamic,dynamo``, -# or use ``torch._logging.set_logs``. +# But the case above is easy to export, because the concrete values of these symbols aren't used in any compiler decision-making; all that's relevant is that the return values are unbacked symbols. 
+# The data-dependent errors highlighted in this section are cases like the following, where `data-dependent guards `_ are encountered: -import logging -torch._logging.set_logs(dynamic=logging.INFO, dynamo=logging.INFO) -exported_constraints_example3 = export(constraints_example3, (inp4, inp5), constraints=constraints3_fixed) +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + if a // 2 >= 5: + return y + 2 + else: + return y * 5 -# reset to previous values -torch._logging.set_logs(dynamic=logging.WARNING, dynamo=logging.WARNING) +###################################################################### +# Here we actually need the "hint", or the concrete value of ``a`` for the compiler to decide whether to trace ``return y + 2`` or ``return y * 5`` as the output. +# Because we trace with FakeTensors, we don't know what ``a // 2 >= 5`` actually evaluates to, and export errors out with "Could not guard on data-dependent expression ``u0 // 2 >= 5 (unhinted)``". +# +# So how do we export this toy model? Unlike ``torch.compile()``, export requires full graph compilation, and we can't just graph break on this. Here are some basic options: +# +# 1. Manual specialization: we could intervene by selecting the branch to trace, either by removing the control-flow code to contain only the specialized branch, or using ``torch.compiler.is_compiling()`` to guard what's traced at compile-time. +# 2. ``torch.cond()``: we could rewrite the control-flow code to use ``torch.cond()`` so we don't specialize on a branch. +# +# While these options are valid, they have their pitfalls. Option 1 sometimes requires drastic, invasive rewrites of the model code to specialize, and ``torch.cond()`` is not a comprehensive system for handling data-dependent errors. +# As we will see, there are data-dependent errors that do not involve control-flow. +# +# The generally recommended approach is to start with ``torch._check()`` calls. 
While these give the impression of purely being assert statements, they are in fact a system of informing the compiler on properties of symbols. +# While a ``torch._check()`` call does act as an assertion at runtime, when traced at compile-time, the checked expression is sent to the symbolic shapes subsystem for reasoning, and any symbol properties that follow from the expression being true, +# are stored as symbol properties (provided it's smart enough to infer those properties). So even if unbacked symbols don't have hints, if we're able to communicate properties that are generally true for these symbols via +# ``torch._check()`` calls, we can potentially bypass data-dependent guards without rewriting the offending model code. +# +# For example in the model above, inserting ``torch._check(a >= 10)`` would tell the compiler that ``y + 2`` can always be returned, and ``torch._check(a == 4)`` tells it to return ``y * 5``. +# See what happens when we re-export this model. + +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + torch._check(a >= 10) + torch._check(a <= 60) + if a // 2 >= 5: + return y + 2 + else: + return y * 5 + +inps = ( + torch.tensor(32), + torch.randn(4), +) +ep = export(Foo(), inps) +print(ep) ###################################################################### -# We can view an ``ExportedProgram``'s constraints using the ``range_constraints`` and -# ``equality_constraints`` attributes. The logging above reveals what the symbols ``s0, s1, ...`` -# represent. +# Export succeeds, and note from the range constraints field that ``u0`` takes on a range of ``[10, 60]``. +# +# So what information do ``torch._check()`` calls actually communicate? This varies as the symbolic shapes subsystem gets smarter, but at a fundamental level, these are generally true: +# +# 1. Equality with non-data-dependent expressions: ``torch._check()`` calls that communicate equalities like ``u0 == s0 + 4`` or ``u0 == 5``. +# 2. 
Range refinement: calls that provide lower or upper bounds for symbols, like the above. +# 3. Some basic reasoning around more complicated expressions: inserting ``torch._check(a < 4)`` will typically tell the compiler that ``a >= 4`` is false. Checks on complex expressions like ``torch._check(a ** 2 - 3 * a <= 10)`` will typically get you past identical guards. +# +# As mentioned previously, ``torch._check()`` calls have applicability outside of data-dependent control flow. For example, here's a model where ``torch._check()`` insertion +# prevails while manual specialization & ``torch.cond()`` do not: + +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + return y[a] -print(exported_constraints_example3.range_constraints) -print(exported_constraints_example3.equality_constraints) +inps = ( + torch.tensor(32), + torch.randn(60), +) +export(Foo(), inps) ###################################################################### -# We can also constrain on individual values in the source code itself using -# ``constrain_as_value`` and ``constrain_as_size``. ``constrain_as_value`` specifies -# that a given integer value is expected to fall within the provided minimum/maximum bounds (inclusive). -# If a bound is not provided, then it is assumed to be unbounded. +# Here is a scenario where ``torch._check()`` insertion is required simply to prevent an operation from failing. The export call will fail with +# "Could not guard on data-dependent expression ``-u0 > 60``", implying that the compiler doesn't know if this is a valid indexing operation - +# if the value of ``x`` is out-of-bounds for ``y`` or not. Here, manual specialization is too prohibitive, and ``torch.cond()`` has no place. 
+# Instead, informing the compiler of ``u0``'s range is sufficient: -from torch.export import constrain_as_size, constrain_as_value +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + torch._check(a >= 0) + torch._check(a < y.shape[0]) + return y[a] + +inps = ( + torch.tensor(32), + torch.randn(60), +) +ep = export(Foo(), inps) +print(ep) -def constraints_example4(x, y): - b = y.item() - constrain_as_value(b, 3, 5) - if b >= 3: - return x.cos() - return x.sin() +###################################################################### +# Specialized values +# ^^^^^^^^^^^^^^^^^^ +# +# Another category of data-dependent error happens when the program attempts to extract a concrete data-dependent integer/float value +# while tracing. This looks something like "Could not extract specialized integer from data-dependent expression", and is analogous to +# the previous class of errors - if these occur when attempting to evaluate concrete integer/float values, data-dependent guard errors arise +# with evaluating concrete boolean values. +# +# This error typically occurs when there is an explicit or implicit ``int()`` cast on a data-dependent expression.
For example, this list comprehension +# has a ``range()`` call that implicitly does an ``int()`` cast on the size of the list: -exported_constraints_example4 = export(constraints_example4, (torch.randn(3, 3), torch.tensor([4]))) -print(exported_constraints_example4(torch.randn(3, 3), torch.tensor([5]))) -try: - exported_constraints_example4(torch.randn(3, 3), torch.randn([2])) -except Exception: - tb.print_exc() +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + b = torch.cat([y for _ in range(a)], dim=0) + return b + int(a) + +inps = ( + torch.tensor(32), + torch.randn(60), +) +export(Foo(), inps, strict=False) ###################################################################### -# ``constrain_as_size`` is similar to ``constrain_as_value``, except that it should be used on integer values that -# will be used to specify tensor shapes -- in particular, the value must not be 0 or 1 because -# many operations have special behavior for tensors with a shape value of 0 or 1. +# For these errors, some basic options you have are: +# +# 1. Avoid unnecessary ``int()`` cast calls, in this case the ``int(a)`` in the return statement. +# 2. Use ``torch._check()`` calls; unfortunately all you may be able to do in this case is specialize (with ``torch._check(a == 60)``). +# 3. Rewrite the offending code at a higher level. For example, the list comprehension is semantically a ``repeat()`` op, which doesn't involve an ``int()`` cast.
The following rewrite avoids data-dependent errors: -def constraints_example5(x, y): - b = y.item() - constrain_as_size(b) - z = torch.ones(b, 4) - return x.sum() + z.sum() +class Foo(torch.nn.Module): + def forward(self, x, y): + a = x.item() + b = y.unsqueeze(0).repeat(a, 1) + return b + a -exported_constraints_example5 = export(constraints_example5, (torch.randn(2, 2), torch.tensor([4]))) -print(exported_constraints_example5(torch.randn(2, 2), torch.tensor([5]))) -try: - exported_constraints_example5(torch.randn(2, 2), torch.randn([1])) -except Exception: - tb.print_exc() +inps = ( + torch.tensor(32), + torch.randn(60), +) +ep = export(Foo(), inps, strict=False) +print(ep) + +###################################################################### +# Data-dependent errors can be much more involved, and there are many more options in your toolkit to deal with them: ``torch._check_is_size()``, ``guard_size_oblivious()``, or real-tensor tracing, as starters. +# For more in-depth guides, please refer to the `Export Programming Model `_, +# or `Dealing with GuardOnDataDependentSymNode errors `_. ###################################################################### # Custom Ops @@ -451,20 +820,13 @@ def constraints_example5(x, y): # # ``torch.export`` can export PyTorch programs with custom operators. 
# -# # Currently, the steps to register a custom op for use by ``torch.export`` are: # -# - Define the custom op using ``torch.library`` (`reference `__) +# - Define the custom op using ``torch.library`` (`reference `__) # as with any other custom op -from torch.library import Library, impl - -m = Library("my_custom_library", "DEF") - -m.define("custom_op(Tensor input) -> Tensor") - -@impl(m, "custom_op", "CompositeExplicitAutograd") -def custom_op(x): +@torch.library.custom_op("my_custom_library::custom_op", mutates_args={}) +def custom_op(x: torch.Tensor) -> torch.Tensor: print("custom_op called!") return torch.relu(x) @@ -472,30 +834,123 @@ def custom_op(x): # - Define a ``"Meta"`` implementation of the custom op that returns an empty # tensor with the same shape as the expected output -@impl(m, "custom_op", "Meta") +@custom_op.register_fake def custom_op_meta(x): return torch.empty_like(x) ###################################################################### # - Call the custom op from the code you want to export using ``torch.ops`` -def custom_op_example(x): - x = torch.sin(x) - x = torch.ops.my_custom_library.custom_op(x) - x = torch.cos(x) - return x +class CustomOpExample(torch.nn.Module): + def forward(self, x): + x = torch.sin(x) + x = torch.ops.my_custom_library.custom_op(x) + x = torch.cos(x) + return x ###################################################################### # - Export the code as before -exported_custom_op_example = export(custom_op_example, (torch.randn(3, 3),)) +exported_custom_op_example = export(CustomOpExample(), (torch.randn(3, 3),)) exported_custom_op_example.graph_module.print_readable() -print(exported_custom_op_example(torch.randn(3, 3))) +print(exported_custom_op_example.module()(torch.randn(3, 3))) ###################################################################### # Note in the above outputs that the custom op is included in the exported graph.
# And when we call the exported graph as a function, the original custom op is called, # as evidenced by the ``print`` call. +# +# If you have a custom operator implemented in C++, please refer to +# `this document `__ +# to make it compatible with ``torch.export``. + +###################################################################### +# Decompositions +# -------------- +# +# By default, ``torch.export`` produces a graph containing +# only functional ATen operators. This functional ATen operator set (or "opset") contains around 2000 +# operators, all of which are functional, that is, they do not +# mutate or alias inputs. You can find a list of all ATen operators +# `here `__ +# and you can inspect if an operator is functional by checking +# ``op._schema.is_mutable``, for example: + +print(torch.ops.aten.add.Tensor._schema.is_mutable) +print(torch.ops.aten.add_.Tensor._schema.is_mutable) + +###################################################################### +# By default, the environment in which you want to run the exported graph +# should support all ~2000 of these operators. +# However, you can use the following API on the exported program +# if your specific environment is only able to support a subset of +# the ~2000 operators. +# +# .. code-block:: python +# +# def run_decompositions( +# self: ExportedProgram, +# decomposition_table: Optional[Dict[torch._ops.OperatorBase, Callable]] +# ) -> ExportedProgram +# +# ``run_decompositions`` takes in a decomposition table, which is a mapping of +# operators to a function specifying how to reduce, or decompose, that operator +# into an equivalent sequence of other ATen operators. +# +# The default decomposition table for ``run_decompositions`` is the +# `Core ATen decomposition table `__ +# which will decompose all ATen operators to the +# `Core ATen Operator Set `__ +# which consists of only ~180 operators.
+ +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 4) + + def forward(self, x): + return self.linear(x) + +ep = export(M(), (torch.randn(2, 3),)) +print(ep.graph) + +core_ir_ep = ep.run_decompositions() +print(core_ir_ep.graph) + +###################################################################### +# Notice that after running ``run_decompositions`` the +# ``torch.ops.aten.t.default`` operator, which is not part of the Core ATen +# Opset, has been replaced with ``torch.ops.aten.permute.default`` which is part +# of the Core ATen Opset. +# +# Most ATen operators already have decompositions, which are located +# `here `__. +# If you would like to use some of these existing decomposition functions, +# you can pass in a list of operators you would like to decompose to the +# `get_decompositions `__ +# function, which will return a decomposition table using existing +# decomposition implementations. + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 4) + + def forward(self, x): + return self.linear(x) + +ep = export(M(), (torch.randn(2, 3),)) +print(ep.graph) + +from torch._decomp import get_decompositions +decomp_table = get_decompositions([torch.ops.aten.t.default, torch.ops.aten.transpose.int]) +core_ir_ep = ep.run_decompositions(decomp_table) +print(core_ir_ep.graph) + +###################################################################### +# If there is no existing decomposition function for an ATen operator that you would +# like to decompose, feel free to send a pull request into PyTorch +# implementing the decomposition! 
###################################################################### # ExportDB @@ -517,8 +972,8 @@ def custom_op_example(x): def cond_predicate(x): """ The conditional statement (aka predicate) passed to ``cond()`` must be one of the following: - - torch.Tensor with a single element - - boolean expression + - ``torch.Tensor`` with a single element + - boolean expression NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. """ pred = x.dim() > 2 and x.shape[2] > 10 @@ -534,6 +989,54 @@ def cond_predicate(x): # ExportDB is not exhaustive, but is intended to cover all use cases found in typical PyTorch code. Feel free to reach # out if there is an important Python/PyTorch feature that should be added to ExportDB or supported by ``torch.export``. +###################################################################### +# Running the Exported Program +# ---------------------------- +# +# As ``torch.export`` is only a graph capturing mechanism, calling the artifact +# produced by ``torch.export`` eagerly will be equivalent to running the eager +# module. To optimize the execution of the Exported Program, we can pass this +# exported artifact to backends such as Inductor through ``torch.compile``, +# `AOTInductor `__, +# or `TensorRT `__. + +class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 3) + + def forward(self, x): + x = self.linear(x) + return x + +inp = torch.randn(2, 3, device="cuda") +m = M().to(device="cuda") +ep = torch.export.export(m, (inp,)) + +# Run it eagerly +res = ep.module()(inp) +print(res) + +# Run it with torch.compile +res = torch.compile(ep.module(), backend="inductor")(inp) +print(res) + +###################################################################### +# .. 
code-block:: python +# +# import torch._export +# import torch._inductor +# +# # Note: these APIs are subject to change +# # Compile the exported program to a .so using ``AOTInductor`` +# with torch.no_grad(): +# so_path = torch._inductor.aot_compile(ep.module(), [inp]) +# +# # Load and run the .so file in Python. +# # To load and run it in a C++ environment, see: +# # https://pytorch.org/docs/main/torch.compiler_aot_inductor.html +# res = torch._export.aot_load(so_path, device="cuda")(inp) + ###################################################################### # Conclusion # ---------- diff --git a/intermediate_source/torchrec_intro_tutorial.py b/intermediate_source/torchrec_intro_tutorial.py new file mode 100644 index 00000000000..5f9464decd1 --- /dev/null +++ b/intermediate_source/torchrec_intro_tutorial.py @@ -0,0 +1,1116 @@ +""" +Introduction to TorchRec +================================== + +**TorchRec** is a PyTorch library tailored for building scalable and efficient recommendation systems using embeddings. +This tutorial guides you through the installation process, introduces the concept of embeddings, and highlights their importance in +recommendation systems. It offers practical demonstrations on implementing embeddings with PyTorch +and TorchRec, focusing on handling large embedding tables through distributed training and advanced optimizations. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Fundamentals of embeddings and their role in recommendation systems + * How to set up TorchRec to manage and implement embeddings in PyTorch environments + * Explore advanced techniques for distributing large embedding tables across multiple GPUs + + .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.5 or later with CUDA 11.8 or later + * Python 3.9 or later + * `FBGEMM `__ + + +""" + +############################################### +# Install Dependencies +# ^^^^^^^^^^^^^^^^^^^^ +# +# Before running this tutorial in Google Colab or another environment, install the +# following dependencies: +# +# .. code-block:: sh +# +# !pip3 install --pre torch --index-url https://download.pytorch.org/whl/cu121 -U +# !pip3 install fbgemm_gpu --index-url https://download.pytorch.org/whl/cu121 +# !pip3 install torchmetrics==1.0.3 +# !pip3 install torchrec --index-url https://download.pytorch.org/whl/cu121 +# +# .. note:: +# If you are running this in Google Colab, make sure to switch to a GPU runtime type. +# For more information, +# see `Enabling CUDA `__ +# + + + +###################################################################### +# Embeddings +# ~~~~~~~~~~ +# +# When building recommendation systems, categorical features typically +# have massive cardinality: posts, users, ads, and so on. +# +# In order to represent these entities and model these relationships, +# **embeddings** are used. In machine learning, **embeddings are vectors +# of real numbers in a high-dimensional space used to represent meaning in +# complex data like words, images, or users**. +# +# Embeddings in RecSys +# ~~~~~~~~~~~~~~~~~~~~ +# +# Now you might wonder, how are these embeddings generated in the first +# place? Well, embeddings are represented as individual rows in an +# **Embedding Table**, also referred to as embedding weights. The reason +# for this is that embeddings or embedding table weights are trained just +# like all of the other weights of the model via gradient descent!
+# +# An embedding table is simply a large matrix for storing embeddings, with +# two dimensions (B, N), where: +# +# * B is the number of embeddings stored by the table +# * N is the number of dimensions per embedding (N-dimensional embedding). +# +# The inputs to embedding tables represent embedding lookups to retrieve +# the embedding for a specific index or row. In recommendation systems, such +# as those used in many large systems, unique IDs are not only used for +# specific users, but also across entities like posts and ads to serve as +# lookup indices to respective embedding tables! +# +# Embeddings are trained in RecSys through the following process: +# +# * **Input/lookup indices are fed into the model, as unique IDs**. IDs are +# hashed to the total size of the embedding table to prevent issues when +# the ID > number of rows +# +# * Embeddings are then retrieved and **pooled, such as taking the sum or +# mean of the embeddings**. This is required as there can be a variable number of +# embeddings per example while the model expects consistent shapes. +# +# * The **embeddings are used in conjunction with the rest of the model to +# produce a prediction**, such as `Click-Through Rate +# (CTR) `__ +# for an ad. +# +# * The loss is calculated with the prediction and the label +# for an example, and **all weights of the model are updated through +# gradient descent and backpropagation, including the embedding weights** +# that were associated with the example. +# +# These embeddings are crucial for representing categorical features, such +# as users, posts, and ads, in order to capture relationships and make +# good recommendations. The `Deep learning recommendation +# model `__ (DLRM) paper talks more +# about the technical details of using embedding tables in RecSys. +# +# This tutorial introduces the concept of embeddings, showcases +# TorchRec-specific modules and data types, and depicts how distributed training +# works with TorchRec.
+# + +import torch + + +###################################################################### +# Embeddings in PyTorch +# --------------------- +# +# In PyTorch, we have the following types of embeddings: +# +# * :class:`torch.nn.Embedding`: An embedding table where forward pass returns the +# embeddings themselves as is. +# +# * :class:`torch.nn.EmbeddingBag`: Embedding table where forward pass returns +# embeddings that are then pooled, for example, sum or mean, otherwise known +# as **Pooled Embeddings**. +# +# In this section, we will go over a very brief introduction to performing +# embedding lookups by passing in indices into the table. +# + +num_embeddings, embedding_dim = 10, 4 + +# Initialize our embedding table +weights = torch.rand(num_embeddings, embedding_dim) +print("Weights:", weights) + +# Pass in pre-generated weights just for example, typically weights are randomly initialized +embedding_collection = torch.nn.Embedding( + num_embeddings, embedding_dim, _weight=weights +) +embedding_bag_collection = torch.nn.EmbeddingBag( + num_embeddings, embedding_dim, _weight=weights +) + +# Print out the tables, we should see the same weights as above +print("Embedding Collection Table: ", embedding_collection.weight) +print("Embedding Bag Collection Table: ", embedding_bag_collection.weight) + +# Lookup rows (ids for embedding ids) from the embedding tables +# 2D tensor with shape (batch_size, ids for each batch) +ids = torch.tensor([[1, 3]]) +print("Input row IDS: ", ids) + +embeddings = embedding_collection(ids) + +# Print out the embedding lookups +# You should see the specific embeddings be the same as the rows (ids) of the embedding tables above +print("Embedding Collection Results: ") +print(embeddings) +print("Shape: ", embeddings.shape) + +# ``nn.EmbeddingBag`` default pooling is mean, so should be mean of batch dimension of values above +pooled_embeddings = embedding_bag_collection(ids) + +print("Embedding Bag Collection Results: ") 
+print(pooled_embeddings) +print("Shape: ", pooled_embeddings.shape) + +# ``nn.EmbeddingBag`` is the same as ``nn.Embedding`` but just with pooling (mean, sum, and so on) +# We can see that the mean of the embeddings of embedding_collection is the same as the output of the embedding_bag_collection +print("Mean: ", torch.mean(embedding_collection(ids), dim=1)) + + +###################################################################### +# Congratulations! Now you have a basic understanding of how to use +# embedding tables --- one of the foundations of modern recommendation +# systems! These tables represent entities and their relationships. For +# example, the relationship between a given user and the pages and posts +# they have liked. +# + + +###################################################################### +# TorchRec Features Overview +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# In the section above we've learned how to use embedding tables, one of the foundations of +# modern recommendation systems! These tables represent entities and +# relationships, such as users, pages, posts, etc. Given that these +# entities are always increasing, a **hash** function is typically applied +# to make sure the IDs are within the bounds of a certain embedding table. +# However, in order to represent a vast amount of entities and reduce hash +# collisions, these tables can become quite massive (think about the number of ads +# for example). In fact, these tables can become so massive that they +# won't be able to fit on 1 GPU, even with 80G of memory. +# +# In order to train models with massive embedding tables, sharding these +# tables across GPUs is required, which then introduces a whole new set of +# problems and opportunities in parallelism and optimization. Luckily, we have +# the TorchRec library that has encountered, consolidated, and addressed +# many of these concerns. TorchRec serves as a **library that provides +# primitives for large scale distributed embeddings**. 
+# +# Next, we will explore the major features of the TorchRec +# library. We will start with ``torch.nn.Embedding`` and will extend that to +# custom TorchRec modules, explore distributed training environment with +# generating a sharding plan for embeddings, look at inherent TorchRec +# optimizations, and extend the model to be ready for inference in C++. +# Below is a quick outline of what this section consists of: +# +# * TorchRec Modules and Data Types +# * Distributed Training, Sharding, and Optimizations +# * Inference +# +# Let's begin with importing TorchRec: + +import torchrec + + +###################################################################### +# TorchRec Modules and Data Types +# ---------------------------------- +# +# This section goes over TorchRec Modules and data types including such +# entities as ``EmbeddingCollection`` and ``EmbeddingBagCollection``, +# ``JaggedTensor``, ``KeyedJaggedTensor``, ``KeyedTensor`` and more. +# +# From ``EmbeddingBag`` to ``EmbeddingBagCollection`` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We have already explored :class:`torch.nn.Embedding` and :class:`torch.nn.EmbeddingBag`. +# TorchRec extends these modules by creating collections of embeddings, in +# other words modules that can have multiple embedding tables, with +# ``EmbeddingCollection`` and ``EmbeddingBagCollection`` +# We will use ``EmbeddingBagCollection`` to represent a group of +# embedding bags. +# +# In the example code below, we create an ``EmbeddingBagCollection`` (EBC) +# with two embedding bags, 1 representing **products** and 1 representing **users**. +# Each table, ``product_table`` and ``user_table``, is represented by a 64 dimension +# embedding of size 4096. 
+# + +ebc = torchrec.EmbeddingBagCollection( + device="cpu", + tables=[ + torchrec.EmbeddingBagConfig( + name="product_table", + embedding_dim=64, + num_embeddings=4096, + feature_names=["product"], + pooling=torchrec.PoolingType.SUM, + ), + torchrec.EmbeddingBagConfig( + name="user_table", + embedding_dim=64, + num_embeddings=4096, + feature_names=["user"], + pooling=torchrec.PoolingType.SUM, + ) + ] +) +print(ebc.embedding_bags) + + +###################################################################### +# Let’s inspect the forward method for ``EmbeddingBagCollection`` and the +# module’s inputs and outputs: +# + +import inspect + +# Let's look at the ``EmbeddingBagCollection`` forward method +# What is a ``KeyedJaggedTensor`` and ``KeyedTensor``? +print(inspect.getsource(ebc.forward)) + + +###################################################################### +# TorchRec Input/Output Data Types +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# TorchRec has distinct data types for input and output of its modules: +# ``JaggedTensor``, ``KeyedJaggedTensor``, and ``KeyedTensor``. Now you +# might ask, why create new data types to represent sparse features? To +# answer that question, we must understand how sparse features are +# represented in code. +# +# Sparse features are otherwise known as ``id_list_feature`` and +# ``id_score_list_feature``, and are the **IDs** that will be used as +# indices to an embedding table to retrieve the embedding for that ID. To +# give a very simple example, imagine a single sparse feature being Ads +# that a user interacted with. The input itself would be a set of Ad IDs +# that a user interacted with, and the embeddings retrieved would be a +# semantic representation of those Ads. The tricky part of representing +# these features in code is that in each input example, **the number of +# IDs is variable**. One day a user might have interacted with only one ad +# while the next day they interact with three. 
+# +# A simple representation is shown below, where we have a ``lengths`` +# tensor denoting how many indices are in an example for a batch and a +# ``values`` tensor containing the indices themselves. +# + +# Batch Size 2 +# 1 ID in example 1, 2 IDs in example 2 +id_list_feature_lengths = torch.tensor([1, 2]) + +# Values (IDs) tensor: ID 5 is in example 1, IDs 7 and 1 are in example 2 +id_list_feature_values = torch.tensor([5, 7, 1]) + + +###################################################################### +# Next, let's look at the offsets as well as what is contained in each batch +# + +# Lengths can be converted to offsets for easy indexing of values +id_list_feature_offsets = torch.cumsum(id_list_feature_lengths, dim=0) + +print("Offsets: ", id_list_feature_offsets) +print("First Batch: ", id_list_feature_values[: id_list_feature_offsets[0]]) +print( + "Second Batch: ", + id_list_feature_values[id_list_feature_offsets[0] : id_list_feature_offsets[1]], +) + +from torchrec import JaggedTensor + +# ``JaggedTensor`` is just a wrapper around lengths/offsets and values tensors! +jt = JaggedTensor(values=id_list_feature_values, lengths=id_list_feature_lengths) + +# Automatically compute offsets from lengths +print("Offsets: ", jt.offsets()) + +# Convert to list of values +print("List of Values: ", jt.to_dense()) + +# ``__str__`` representation +print(jt) + +from torchrec import KeyedJaggedTensor + +# ``JaggedTensor`` represents IDs for 1 feature, but we have multiple features in an ``EmbeddingBagCollection`` +# That's where ``KeyedJaggedTensor`` comes in! ``KeyedJaggedTensor`` is just multiple ``JaggedTensors`` for multiple features +# From before, we have our two features "product" and "user". Let's create ``JaggedTensors`` for both!
+ +product_jt = JaggedTensor( + values=torch.tensor([1, 2, 1, 5]), lengths=torch.tensor([3, 1]) +) +user_jt = JaggedTensor(values=torch.tensor([2, 3, 4, 1]), lengths=torch.tensor([2, 2])) + +# Q1: How many batches are there, and which values are in the first batch for ``product_jt`` and ``user_jt``? +kjt = KeyedJaggedTensor.from_jt_dict({"product": product_jt, "user": user_jt}) + +# Look at our feature keys for the ``KeyedJaggedTensor`` +print("Keys: ", kjt.keys()) + +# Look at the overall lengths for the ``KeyedJaggedTensor`` +print("Lengths: ", kjt.lengths()) + +# Look at all values for ``KeyedJaggedTensor`` +print("Values: ", kjt.values()) + +# Can convert ``KeyedJaggedTensor`` to dictionary representation +print("to_dict: ", kjt.to_dict()) + +# ``KeyedJaggedTensor`` string representation +print(kjt) + +# Q2: What are the offsets for the ``KeyedJaggedTensor``? + +# Now we can run a forward pass on our ``EmbeddingBagCollection`` from before +result = ebc(kjt) +result + +# Result is a ``KeyedTensor``, which contains a list of the feature names and the embedding results +print(result.keys()) + +# The results shape is [2, 128], as batch size of 2. Reread previous section if you need a refresher on how the batch size is determined +# 128 for dimension of embedding. If you look at where we initialized the ``EmbeddingBagCollection``, we have two tables "product" and "user" of dimension 64 each +# meaning embeddings for both features are of size 64. 64 + 64 = 128 +print(result.values().shape) + +# Nice to_dict method to determine the embeddings that belong to each feature +result_dict = result.to_dict() +for key, embedding in result_dict.items(): + print(key, embedding.shape) + + +###################################################################### +# Congrats! You now understand TorchRec modules and data types. +# Give yourself a pat on the back for making it this far. Next, we will +# learn about distributed training and sharding. 
+# + + +###################################################################### +# Distributed Training and Sharding +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Now that we have a grasp on TorchRec modules and data types, it's time +# to take it to the next level. +# +# Remember, the main purpose of TorchRec is to provide primitives for +# distributed embeddings. So far, we've only worked with embedding tables +# on a single device. This has been possible given how small the embedding tables +# have been, but in a production setting this isn't generally the case. +# Embedding tables often get massive, where one table can't fit on a single +# GPU, creating the requirement for multiple devices and a distributed +# environment. +# +# In this section, we will explore setting up a distributed environment, +# exactly how actual production training is done, and explore sharding +# embedding tables, all with TorchRec. +# +# **This section will also only use 1 GPU, though it will be treated in a +# distributed fashion. This is only a limitation for training, as training +# has a process per GPU. Inference does not run into this requirement** +# +# In the example code below, we set up our PyTorch distributed environment. +# +# .. warning:: +# If you are running this in Google Colab, you can only call this cell once, +# calling it again will cause an error as you can only initialize the process +# group once. 
+ +import os + +import torch.distributed as dist + +# Set up environment variables for distributed training +# RANK is which GPU we are on, default 0 +os.environ["RANK"] = "0" +# How many devices in our "world", colab notebook can only handle 1 process +os.environ["WORLD_SIZE"] = "1" +# Localhost as we are training locally +os.environ["MASTER_ADDR"] = "localhost" +# Port for distributed training +os.environ["MASTER_PORT"] = "29500" + +# nccl backend is for GPUs, gloo is for CPUs +dist.init_process_group(backend="gloo") + +print(f"Distributed environment initialized: {dist}") + + +###################################################################### +# Distributed Embeddings +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# We have already worked with the main TorchRec module: +# ``EmbeddingBagCollection``. We have examined how it works along with how +# data is represented in TorchRec. However, we have not yet explored one +# of the main parts of TorchRec, which is **distributed embeddings**. +# +# GPUs are the most popular choice for ML workloads by far today, as they +# are able to do orders of magnitude more floating point operations/s +# (`FLOPs `__) than CPU. However, +# GPUs come with the limitation of scarce fast memory (HBM which is +# analogous to RAM for CPU), typically, ~10s of GBs. +# +# A RecSys model can contain embedding tables that far exceed the memory +# limit for 1 GPU, hence the need for distribution of the embedding tables +# across multiple GPUs, otherwise known as **model parallel**. On the +# other hand, **data parallel** is where the entire model is replicated on +# each GPU, with each GPU taking in a distinct batch of data for +# training, syncing gradients on the backwards pass. +# +# Parts of the model that **require less compute but more memory +# (embeddings) are distributed with model parallel** while parts that +# **require more compute and less memory (dense layers, MLP, etc.) are +# distributed with data parallel**.
+# +# Sharding +# ~~~~~~~~ +# +# In order to distribute an embedding table, we split up the embedding +# table into parts and place those parts onto different devices, also +# known as “sharding”. +# +# There are many ways to shard embedding tables. The most common ways are: +# +# * Table-Wise: the table is placed entirely onto one device +# * Column-Wise: columns of embedding tables are sharded +# * Row-Wise: rows of embedding tables are sharded +# +# Sharded Modules +# ~~~~~~~~~~~~~~~ +# +# While all of this seems like a lot to deal with and implement, you're in +# luck. **TorchRec provides all the primitives for easy distributed +# training and inference**! In fact, TorchRec modules have two corresponding +# classes for working with any TorchRec module in a distributed +# environment: +# +# * **The module sharder**: This class exposes a ``shard`` API +# that handles sharding a TorchRec Module, producing a sharded module. +# * For ``EmbeddingBagCollection``, the sharder is `EmbeddingBagCollectionSharder `__ +# * **Sharded module**: This class is a sharded variant of a TorchRec module. +# It has the same input/output as the regular TorchRec module, but is much +# more optimized and works in a distributed environment. +# * For ``EmbeddingBagCollection``, the sharded variant is `ShardedEmbeddingBagCollection `__ +# +# Every TorchRec module has an unsharded and sharded variant. +# +# * The unsharded version is meant to be prototyped and experimented with. +# * The sharded version is meant to be used in a distributed environment for +# distributed training and inference. +# +# The sharded versions of TorchRec modules, for example +# ``EmbeddingBagCollection``, will handle everything that is needed for Model +# Parallelism, such as communication between GPUs for distributing +# embeddings to the correct GPUs.
+# +# Refresher of our ``EmbeddingBagCollection`` module +ebc + +from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder +from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology +from torchrec.distributed.types import ShardingEnv + +# Corresponding sharder for ``EmbeddingBagCollection`` module +sharder = EmbeddingBagCollectionSharder() + +# ``ProcessGroup`` from torch.distributed initialized 2 cells above +pg = dist.GroupMember.WORLD +assert pg is not None, "Process group is not initialized" + +print(f"Process Group: {pg}") + + +###################################################################### +# Planner +# ~~~~~~~ +# +# Before we can show how sharding works, we must know about the +# **planner**, which helps us determine the best sharding configuration. +# +# Given a number of embedding tables and a number of ranks, there are many +# different sharding configurations that are possible. For example, given +# 2 embedding tables and 2 GPUs, you can: +# +# * Place 1 table on each GPU +# * Place both tables on a single GPU and no tables on the other +# * Place certain rows and columns on each GPU +# +# Given all of these possibilities, we typically want a sharding +# configuration that is optimal for performance. +# +# That is where the planner comes in. The planner is able to determine +# given the number of embedding tables and the number of GPUs, what is the optimal +# configuration. Turns out, this is incredibly difficult to do manually, +# with tons of factors that engineers have to consider to ensure an +# optimal sharding plan. Luckily, TorchRec provides an auto planner when +# the planner is used. 
+# +# The TorchRec planner: +# +# * Assesses memory constraints of hardware +# * Estimates compute based on memory fetches as embedding lookups +# * Addresses data specific factors +# * Considers other hardware specifics like bandwidth to generate an optimal sharding plan +# +# In order to take into consideration all these variables, the TorchRec +# planner can take in `various amounts of data for embedding tables, +# constraints, hardware information, and +# topology `__ +# to aid in generating the optimal sharding plan for a model, which is +# routinely provided across stacks. +# +# To learn more about sharding, see our `sharding +# tutorial `__. +# + +# In our case, 1 GPU and compute on CUDA device +planner = EmbeddingShardingPlanner( + topology=Topology( + world_size=1, + compute_device="cuda", + ) +) + +# Run planner to get plan for sharding +plan = planner.collective_plan(ebc, [sharder], pg) + +print(f"Sharding Plan generated: {plan}") + + +###################################################################### +# Planner Result +# ~~~~~~~~~~~~~~ +# +# As you can see above, when running the planner there is quite a bit of output. +# We can see a lot of stats being calculated along with where our +# tables end up being placed. +# +# The result of running the planner is a static plan, which can be reused +# for sharding! This allows sharding to be static for production models +# instead of determining a new sharding plan every time. Below, we use the +# sharding plan to finally generate our ``ShardedEmbeddingBagCollection``.
+# + +# The static plan that was generated +plan + +env = ShardingEnv.from_process_group(pg) + +# Shard the ``EmbeddingBagCollection`` module using the ``EmbeddingBagCollectionSharder`` +sharded_ebc = sharder.shard(ebc, plan.plan[""], env, torch.device("cuda")) + +print(f"Sharded EBC Module: {sharded_ebc}") + + +###################################################################### +# GPU Training with ``LazyAwaitable`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Remember that TorchRec is a highly optimized library for distributed +# embeddings. A concept that TorchRec introduces to enable higher +# performance for training on GPU is a +# `LazyAwaitable `__. +# You will see ``LazyAwaitable`` types as outputs of various sharded +# TorchRec modules. All a ``LazyAwaitable`` type does is delay calculating some +# result as long as possible, and it does it by acting like an async type. +# + +from typing import List + +from torchrec.distributed.types import LazyAwaitable + + +# Demonstrate a ``LazyAwaitable`` type: +class ExampleAwaitable(LazyAwaitable[torch.Tensor]): + def __init__(self, size: List[int]) -> None: + super().__init__() + self._size = size + + def _wait_impl(self) -> torch.Tensor: + return torch.ones(self._size) + + +awaitable = ExampleAwaitable([3, 2]) +awaitable.wait() + +kjt = kjt.to("cuda") +output = sharded_ebc(kjt) +# The output of our sharded ``EmbeddingBagCollection`` module is an `Awaitable`? 
+print(output) + +kt = output.wait() +# Now we have our ``KeyedTensor`` after calling ``.wait()`` +# If you are confused as to why we have a ``KeyedTensor`` output, +# give yourself a refresher on the unsharded ``EmbeddingBagCollection`` module +print(type(kt)) + +print(kt.keys()) + +print(kt.values().shape) + +# Same output format as unsharded ``EmbeddingBagCollection`` +result_dict = kt.to_dict() +for key, embedding in result_dict.items(): + print(key, embedding.shape) + + +###################################################################### +# Anatomy of Sharded TorchRec modules +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We have now successfully sharded an ``EmbeddingBagCollection`` given a +# sharding plan that we generated! The sharded module has common APIs from +# TorchRec which abstract away distributed communication/compute amongst +# multiple GPUs. In fact, these APIs are highly optimized for performance +# in training and inference. **Below are the three common APIs for +# distributed training/inference** that are provided by TorchRec: +# +# * ``input_dist``: Handles distributing inputs from GPU to GPU. +# * ``lookups``: Does the actual embedding lookup in an optimized, +# batched manner using FBGEMM TBE (more on this later). +# * ``output_dist``: Handles distributing outputs from GPU to GPU. +# +# The distribution of inputs and outputs is done through `NCCL +# Collectives `__, +# namely +# `All-to-Alls `__, +# which is where all GPUs send and receive data to and from one another. +# TorchRec interfaces with PyTorch distributed for collectives and +# provides clean abstractions to the end users, removing the concern for +# the lower level details. +# +# The backwards pass does all of these collectives but in the reverse +# order for distribution of gradients. ``input_dist``, ``lookups``, and +# ``output_dist`` all depend on the sharding scheme.
Since we sharded in a +# table-wise fashion, these APIs are modules that are constructed by +# `TwPooledEmbeddingSharding `__. +# + +sharded_ebc + +# Distribute input KJTs to all other GPUs and receive KJTs +sharded_ebc._input_dists + +# Distribute output embeddings to all other GPUs and receive embeddings +sharded_ebc._output_dists + + +###################################################################### +# Optimizing Embedding Lookups +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In performing lookups for a collection of embedding tables, a trivial +# solution would be to iterate through all the ``nn.EmbeddingBags`` and do +# a lookup per table. This is exactly what the standard, unsharded +# ``EmbeddingBagCollection`` does. However, while this solution +# is simple, it is extremely slow. +# +# `FBGEMM `__ is a +# library that provides GPU operators (otherwise known as kernels) that +# are very optimized. One of these operators is known as **Table Batched +# Embedding** (TBE), which provides two major optimizations: +# +# - Table batching, which allows you to look up multiple embeddings with +# one kernel call. +# - Optimizer Fusion, which allows the module to update itself given the +# canonical PyTorch optimizers and arguments. +# +# The ``ShardedEmbeddingBagCollection`` uses the FBGEMM TBE as the lookup +# instead of traditional ``nn.EmbeddingBags`` for optimized embedding +# lookups. +# + +sharded_ebc._lookups + + +###################################################################### +# ``DistributedModelParallel`` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We have now explored sharding a single ``EmbeddingBagCollection``! We were +# able to take the ``EmbeddingBagCollectionSharder`` and use the unsharded +# ``EmbeddingBagCollection`` to generate a +# ``ShardedEmbeddingBagCollection`` module. This workflow is fine, but +# typically when implementing model parallel, +# `DistributedModelParallel `__ +# (DMP) is used as the standard interface.
When wrapping your model (in +# our case ``ebc``), with DMP, the following will occur: +# +# 1. Decide how to shard the model. DMP will collect the available +# sharders and come up with a plan of the optimal way to shard the +# embedding table(s) (for example, ``EmbeddingBagCollection``). +# 2. Actually shard the model. This includes allocating memory for each +# embedding table on the appropriate device(s). +# +# DMP takes in everything that we've just experimented with, like a static +# sharding plan, a list of sharders, etc. However, it also has some nice +# defaults to seamlessly shard a TorchRec model. In this toy example, +# since we have two embedding tables and one GPU, TorchRec will place both +# on the single GPU. +# + +ebc + +model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda")) + +out = model(kjt) +out.wait() + +model + + +###################################################################### +# Sharding Best Practices +# ~~~~~~~~~~~~~~~~~~~~~~~ +# +# Currently, our configuration is only sharding on 1 GPU (or rank), which +# is trivial: just place all the tables on 1 GPU's memory. However, in real +# production use cases, embedding tables are **typically sharded on +# hundreds of GPUs**, with different sharding methods such as table-wise, +# row-wise, and column-wise. It is incredibly important to determine a +# proper sharding configuration (to prevent out of memory issues) while +# keeping it balanced not only in terms of memory but also compute for +# optimal performance. +# + + +###################################################################### +# Adding in the Optimizer +# ~~~~~~~~~~~~~~~~~~~~~~~ +# +# Remember that TorchRec modules are hyperoptimized for large scale +# distributed training. An important optimization is in regards to the +# optimizer.
+# +# TorchRec modules provide a seamless API to fuse the +# backwards pass and optimizer step in training, providing a significant +# optimization in performance and decreasing the memory used, alongside +# granularity in assigning distinct optimizers to distinct model +# parameters. +# +# Optimizer Classes +# ^^^^^^^^^^^^^^^^^ +# +# TorchRec uses ``CombinedOptimizer``, which contains a collection of +# ``KeyedOptimizers``. A ``CombinedOptimizer`` effectively makes it easy +# to handle multiple optimizers for various sub groups in the model. A +# ``KeyedOptimizer`` extends the ``torch.optim.Optimizer`` and is +# initialized through a dictionary of parameters and exposes the parameters. +# Each ``TBE`` module in an ``EmbeddingBagCollection`` will have its own +# ``KeyedOptimizer`` which combines into one ``CombinedOptimizer``. +# +# Fused optimizer in TorchRec +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Using ``DistributedModelParallel``, the **optimizer is fused, which +# means that the optimizer update is done in the backward**. This is an +# optimization in TorchRec and FBGEMM, where the optimizer embedding +# gradients are not materialized but are applied directly to the parameters. +# This brings significant memory savings as embedding gradients are +# typically the size of the parameters themselves. +# +# You can, however, choose to make the optimizer ``dense`` which does not +# apply this optimization and lets you inspect the embedding gradients or +# apply computations to them as you wish. A dense optimizer in this case +# would be your `canonical PyTorch model training loop with +# optimizer. `__ +# +# Once the optimizer is created through ``DistributedModelParallel``, you +# still need to manage an optimizer for the other parameters not +# associated with TorchRec embedding modules. To find the other +# parameters, +# use ``in_backward_optimizer_filter(model.named_parameters())``.
+# Apply an optimizer to those parameters as you would a normal Torch +# optimizer and combine this and the ``model.fused_optimizer`` into one +# ``CombinedOptimizer`` that you can use in your training loop to +# ``zero_grad`` and ``step`` through. +# +# Adding an Optimizer to ``EmbeddingBagCollection`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# We will do this in two ways, which are equivalent, but give you options +# depending on your preferences: +# +# 1. Passing optimizer kwargs through ``fused_params`` in sharder. +# 2. Through ``apply_optimizer_in_backward``, which converts the optimizer +# parameters to ``fused_params`` to pass to the ``TBE`` in the ``EmbeddingBagCollection`` or ``EmbeddingCollection``. +# + +# Option 1: Passing optimizer kwargs through fused parameters +from torchrec.optim.optimizers import in_backward_optimizer_filter +from fbgemm_gpu.split_embedding_configs import EmbOptimType + + +# We initialize the sharder with +fused_params = { + "optimizer": EmbOptimType.EXACT_ROWWISE_ADAGRAD, + "learning_rate": 0.02, + "eps": 0.002, +} + +# Initialize sharder with ``fused_params`` +sharder_with_fused_params = EmbeddingBagCollectionSharder(fused_params=fused_params) + +# We'll use same plan and unsharded EBC as before but this time with our new sharder +sharded_ebc_fused_params = sharder_with_fused_params.shard(ebc, plan.plan[""], env, torch.device("cuda")) + +# Looking at the optimizer of each, we can see that the learning rate changed, which indicates our optimizer has been applied correctly. 
+# If seen, we can also look at the TBE logs of the cell to see that our new optimizer is indeed being applied +print(f"Original Sharded EBC fused optimizer: {sharded_ebc.fused_optimizer}") +print(f"Sharded EBC with fused parameters fused optimizer: {sharded_ebc_fused_params.fused_optimizer}") + +print(f"Type of optimizer: {type(sharded_ebc_fused_params.fused_optimizer)}") + +from torch.distributed.optim import _apply_optimizer_in_backward as apply_optimizer_in_backward +import copy +# Option 2: Applying optimizer through apply_optimizer_in_backward +# Note: we need to call apply_optimizer_in_backward on unsharded model first and then shard it + +# We can achieve the same result as we did in the previous +ebc_apply_opt = copy.deepcopy(ebc) +optimizer_kwargs = {"lr": 0.5} + +for name, param in ebc_apply_opt.named_parameters(): + print(f"{name=}") + apply_optimizer_in_backward(torch.optim.SGD, [param], optimizer_kwargs) + +sharded_ebc_apply_opt = sharder.shard(ebc_apply_opt, plan.plan[""], env, torch.device("cuda")) + +# Now when we print the optimizer, we will see our new learning rate, you can verify momentum through the TBE logs as well if outputted +print(sharded_ebc_apply_opt.fused_optimizer) +print(type(sharded_ebc_apply_opt.fused_optimizer)) + +# We can also check through the filter other parameters that aren't associated with the "fused" optimizer(s) +# Practically, just non TorchRec module parameters. 
Since our module is just a TorchRec EBC +# there are no other parameters that aren't associated with TorchRec +print("Non Fused Model Parameters:") +print(dict(in_backward_optimizer_filter(sharded_ebc_fused_params.named_parameters())).keys()) + +# Here we do a dummy backwards call and see that parameter updates for fused +# optimizers happen as a result of the backward pass + +ebc_output = sharded_ebc_fused_params(kjt).wait().values() +loss = torch.sum(torch.ones_like(ebc_output) - ebc_output) +print(f"First Iteration Loss: {loss}") + +loss.backward() + +ebc_output = sharded_ebc_fused_params(kjt).wait().values() +loss = torch.sum(torch.ones_like(ebc_output) - ebc_output) +# We don't call an optimizer.step(), so for the loss to have changed here, +# that means that the gradients were somehow updated, which is what the +# fused optimizer automatically handles for us +print(f"Second Iteration Loss: {loss}") + + +###################################################################### +# Inference +# ~~~~~~~~~ +# +# Now that we are able to train distributed embeddings, how can we take +# the trained model and optimize it for inference? Inference is typically +# very sensitive to **performance and size of the model**. Running just +# the trained model in a Python environment is incredibly inefficient. +# There are two key differences between inference and training +# environments: +# +# * **Quantization**: Inference models are typically +# quantized, where model parameters lose precision for lower latency in +# predictions and reduced model size. For example FP32 (4 bytes) in +# trained model to INT8 (1 byte) for each embedding weight. This is also +# necessary given the vast scale of embedding tables, as we want to use as +# few devices as possible for inference to minimize latency. 
+# +# * **C++ environment**: Inference latency is very important, so in order to ensure +# ample performance, the model is typically run in a C++ environment, +# along with the situations where we don't have a Python runtime, like on +# device. +# +# TorchRec provides primitives for converting a TorchRec model into being +# inference ready with: +# +# * APIs for quantizing the model, introducing +# optimizations automatically with FBGEMM TBE +# * Sharding embeddings for distributed inference +# * Compiling the model to `TorchScript `__ +# (compatible in C++) +# +# In this section, we will go over this entire workflow of: +# +# * Quantizing the model +# * Sharding the quantized model +# * Compiling the sharded quantized model into TorchScript +# + +ebc + +class InferenceModule(torch.nn.Module): + def __init__(self, ebc: torchrec.EmbeddingBagCollection): + super().__init__() + self.ebc_ = ebc + + def forward(self, kjt: KeyedJaggedTensor): + return self.ebc_(kjt) + +module = InferenceModule(ebc) +for name, param in module.named_parameters(): + # Here, the parameters should still be FP32, as we are using a standard EBC + # FP32 is default, regularly used for training + print(name, param.shape, param.dtype) + + +###################################################################### +# Quantization +# ~~~~~~~~~~~~ +# +# As you can see above, the normal EBC contains embedding table weights as +# FP32 precision (32 bits for each weight).
Here, we will use the TorchRec +# inference library to quantize the embedding weights of the model to INT8 +# + +from torch import quantization as quant +from torchrec.modules.embedding_configs import QuantConfig +from torchrec.quant.embedding_modules import ( + EmbeddingBagCollection as QuantEmbeddingBagCollection, +) + + +quant_dtype = torch.int8 + + +qconfig = QuantConfig( + # dtype of the result of the embedding lookup, post activation + # torch.float generally for compatibility with rest of the model + # as rest of the model here usually isn't quantized + activation=quant.PlaceholderObserver.with_args(dtype=torch.float), + # quantized type for embedding weights, aka parameters to actually quantize + weight=quant.PlaceholderObserver.with_args(dtype=quant_dtype), +) +qconfig_spec = { + # Map of module type to qconfig + torchrec.EmbeddingBagCollection: qconfig, +} +mapping = { + # Map of module type to quantized module type + torchrec.EmbeddingBagCollection: QuantEmbeddingBagCollection, +} + + +module = InferenceModule(ebc) + +# Quantize the module +qebc = quant.quantize_dynamic( + module, + qconfig_spec=qconfig_spec, + mapping=mapping, + inplace=False, +) + + +print(f"Quantized EBC: {qebc}") + +kjt = kjt.to("cpu") + +qebc(kjt) + +# Once quantized, goes from parameters -> buffers, as no longer trainable +for name, buffer in qebc.named_buffers(): + # The shapes of the tables should be the same but the dtype should be int8 now + # post quantization + print(name, buffer.shape, buffer.dtype) + + +###################################################################### +# Shard +# ~~~~~ +# +# Here we perform sharding of the TorchRec quantized model. This is to +# ensure we are using the performant module through FBGEMM TBE. Here we +# are using one device to be consistent with training (1 TBE). 
+# + +from torchrec import distributed as trec_dist +from torchrec.distributed.shard import _shard_modules + + +sharded_qebc = _shard_modules( + module=qebc, + device=torch.device("cpu"), + env=trec_dist.ShardingEnv.from_local( + 1, + 0, + ), +) + + +print(f"Sharded Quantized EBC: {sharded_qebc}") + +sharded_qebc(kjt) + + +###################################################################### +# Compilation +# ~~~~~~~~~~~ +# +# Now we have the optimized eager TorchRec inference model. The next step +# is to ensure that this model is loadable in C++, as currently it is only +# runnable in a Python runtime. +# +# The recommended method of compilation at Meta is two fold: `torch.fx +# tracing `__ (generate +# intermediate representation of model) and converting the result to +# TorchScript, where TorchScript is C++ compatible. +# + +from torchrec.fx import Tracer + + +tracer = Tracer(leaf_modules=["IntNBitTableBatchedEmbeddingBagsCodegen"]) + +graph = tracer.trace(sharded_qebc) +gm = torch.fx.GraphModule(sharded_qebc, graph) + +print("Graph Module Created!") + +print(gm.code) + +scripted_gm = torch.jit.script(gm) +print("Scripted Graph Module Created!") + +print(scripted_gm.code) + + +###################################################################### +# Conclusion +# ^^^^^^^^^^ +# +# In this tutorial, you have gone from training a distributed RecSys model all the way +# to making it inference ready. The `TorchRec repo +# `__ has a +# full example of how to load a TorchRec TorchScript model into C++ for +# inference. +# + + +###################################################################### +# See Also +# -------------- +# +# For more information, please see our +# `dlrm `__ +# example, which includes multinode training on the Criteo 1TB +# dataset using the methods described in `Deep Learning Recommendation Model +# for Personalization and Recommendation Systems `__. 
+# diff --git a/intermediate_source/torchrec_tutorial.rst b/intermediate_source/torchrec_tutorial.rst index 6a450b16591..883ca11087a 100644 --- a/intermediate_source/torchrec_tutorial.rst +++ b/intermediate_source/torchrec_tutorial.rst @@ -1,244 +1,10 @@ Introduction to TorchRec ======================== -.. tip:: - To get the most of this tutorial, we suggest using this - `Colab Version `__. - This will allow you to experiment with the information presented below. - -Follow along with the video below or on `youtube `__. +There is a newer tutorial on this topic. -.. raw:: html - -
    - -
    - -When building recommendation systems, we frequently want to represent -entities like products or pages with embeddings. For example, see Meta -AI’s `Deep learning recommendation -model `__, or DLRM. As the number of -entities grow, the size of the embedding tables can exceed a single -GPU’s memory. A common practice is to shard the embedding table across -devices, a type of model parallelism. To that end, TorchRec introduces -its primary API -called |DistributedModelParallel|_, -or DMP. Like PyTorch’s DistributedDataParallel, DMP wraps a model to -enable distributed training. - -Installation ------------- - -Requirements: python >= 3.7 - -We highly recommend CUDA when using TorchRec (If using CUDA: cuda >= 11.0). - - -.. code:: shell - - # install pytorch with cudatoolkit 11.3 - conda install pytorch cudatoolkit=11.3 -c pytorch-nightly -y - # install TorchRec - pip3 install torchrec-nightly - - -Overview --------- - -This tutorial will cover three pieces of TorchRec: the ``nn.module`` |EmbeddingBagCollection|_, the |DistributedModelParallel|_ API, and -the datastructure |KeyedJaggedTensor|_. - - -Distributed Setup -~~~~~~~~~~~~~~~~~ - -We setup our environment with torch.distributed. For more info on -distributed, see this -`tutorial `__. - -Here, we use one rank (the colab process) corresponding to our 1 colab -GPU. - -.. code:: python - - import os - import torch - import torchrec - import torch.distributed as dist - - os.environ["RANK"] = "0" - os.environ["WORLD_SIZE"] = "1" - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "29500" - - # Note - you will need a V100 or A100 to run tutorial as as! 
- # If using an older GPU (such as colab free K80), - # you will need to compile fbgemm with the appripriate CUDA architecture - # or run with "gloo" on CPUs - dist.init_process_group(backend="nccl") - - -From EmbeddingBag to EmbeddingBagCollection -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PyTorch represents embeddings through |torch.nn.Embedding|_ and |torch.nn.EmbeddingBag|_. -EmbeddingBag is a pooled version of Embedding. - -TorchRec extends these modules by creating collections of embeddings. We -will use |EmbeddingBagCollection|_ to represent a group of EmbeddingBags. - -Here, we create an EmbeddingBagCollection (EBC) with two embedding bags. -Each table, ``product_table`` and ``user_table``, is represented by a 64 -dimension embedding of size 4096. Note how we initially allocate the EBC -on device “meta”. This will tell EBC to not allocate memory yet. - -.. code:: python - - ebc = torchrec.EmbeddingBagCollection( - device="meta", - tables=[ - torchrec.EmbeddingBagConfig( - name="product_table", - embedding_dim=64, - num_embeddings=4096, - feature_names=["product"], - pooling=torchrec.PoolingType.SUM, - ), - torchrec.EmbeddingBagConfig( - name="user_table", - embedding_dim=64, - num_embeddings=4096, - feature_names=["user"], - pooling=torchrec.PoolingType.SUM, - ) - ] - ) - - -DistributedModelParallel -~~~~~~~~~~~~~~~~~~~~~~~~ - -Now, we’re ready to wrap our model with |DistributedModelParallel|_ (DMP). Instantiating DMP will: - -1. Decide how to shard the model. DMP will collect the available - ‘sharders’ and come up with a ‘plan’ of the optimal way to shard the - embedding table(s) (i.e., the EmbeddingBagCollection). -2. Actually shard the model. This includes allocating memory for each - embedding table on the appropriate device(s). - -In this toy example, since we have two EmbeddingTables and one GPU, -TorchRec will place both on the single GPU. - -.. 
code:: python - - model = torchrec.distributed.DistributedModelParallel(ebc, device=torch.device("cuda")) - print(model) - print(model.plan) - - -Query vanilla nn.EmbeddingBag with input and offsets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We query |nn.Embedding|_ and |nn.EmbeddingBag|_ -with ``input`` and ``offsets``. Input is a 1-D tensor containing the -lookup values. Offsets is a 1-D tensor where the sequence is a -cumulative sum of the number of values to pool per example. - -Let’s look at an example, recreating the product EmbeddingBag above: - -:: - - |------------| - | product ID | - |------------| - | [101, 202] | - | [] | - | [303] | - |------------| - -.. code:: python - - product_eb = torch.nn.EmbeddingBag(4096, 64) - product_eb(input=torch.tensor([101, 202, 303]), offsets=torch.tensor([0, 2, 2])) - - -Representing minibatches with KeyedJaggedTensor -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -We need an efficient representation of multiple examples of an arbitrary -number of entity IDs per feature per example. In order to enable this -“jagged” representation, we use the TorchRec datastructure -|KeyedJaggedTensor|_ (KJT). - -Let’s take a look at how to lookup a collection of two embedding -bags, “product” and “user”. Assume the minibatch is made up of three -examples for three users. The first of which has two product IDs, the -second with none, and the third with one product ID. - -:: - - |------------|------------| - | product ID | user ID | - |------------|------------| - | [101, 202] | [404] | - | [] | [505] | - | [303] | [606] | - |------------|------------| - -The query should be: - -.. code:: python - - mb = torchrec.KeyedJaggedTensor( - keys = ["product", "user"], - values = torch.tensor([101, 202, 303, 404, 505, 606]).cuda(), - lengths = torch.tensor([2, 0, 1, 1, 1, 1], dtype=torch.int64).cuda(), - ) - - print(mb.to(torch.device("cpu"))) - - -Note that the KJT batch size is -``batch_size = len(lengths)//len(keys)``. 
In the above example, -batch_size is 3. - - - -Putting it all together, querying our distributed model with a KJT minibatch -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, we can query our model using our minibatch of products and -users. - -The resulting lookup will contain a KeyedTensor, where each key (or -feature) contains a 2D tensor of size 3x64 (batch_size x embedding_dim). - -.. code:: python - - pooled_embeddings = model(mb) - print(pooled_embeddings) - - -More resources --------------- - -For more information, please see our -`dlrm `__ -example, which includes multinode training on the criteo terabyte -dataset, using Meta’s `DLRM `__. +Redirecting... +.. raw:: html -.. |DistributedModelParallel| replace:: ``DistributedModelParallel`` -.. _DistributedModelParallel: https://pytorch.org/torchrec/torchrec.distributed.html#torchrec.distributed.model_parallel.DistributedModelParallel -.. |EmbeddingBagCollection| replace:: ``EmbeddingBagCollection`` -.. _EmbeddingBagCollection: https://pytorch.org/torchrec/torchrec.modules.html#torchrec.modules.embedding_modules.EmbeddingBagCollection -.. |KeyedJaggedTensor| replace:: ``KeyedJaggedTensor`` -.. _KeyedJaggedTensor: https://pytorch.org/torchrec/torchrec.sparse.html#torchrec.sparse.jagged_tensor.JaggedTensor -.. |torch.nn.Embedding| replace:: ``torch.nn.Embedding`` -.. _torch.nn.Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html -.. |torch.nn.EmbeddingBag| replace:: ``torch.nn.EmbeddingBag`` -.. _torch.nn.EmbeddingBag: https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html -.. |nn.Embedding| replace:: ``nn.Embedding`` -.. _nn.Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html -.. |nn.EmbeddingBag| replace:: ``nn.EmbeddingBag`` -.. 
_nn.EmbeddingBag: https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html + diff --git a/intermediate_source/torchserve_with_ipex.rst b/intermediate_source/torchserve_with_ipex.rst index fbf705a7c47..23d91f50cb6 100644 --- a/intermediate_source/torchserve_with_ipex.rst +++ b/intermediate_source/torchserve_with_ipex.rst @@ -265,7 +265,7 @@ Additionally, notice that thread (TID:97097) was executing on a large number of Compare local vs. remote memory access over time. We observe that about half, 51.09%, of the memory accesses were remote accesses, indicating sub-optimal NUMA configuration. 2. torch.set_num_threads = ``number of physical cores / number of workers`` (no core pinning) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For an apple-to-apple comparison with launcher's core pinning, we'll set the number of threads to the number of cores divided by the number of workers (launcher does this internally). Add the following code snippet in the `base_handler `_: @@ -379,8 +379,8 @@ For interested readers, please check out the following documents: - `CPU specific optimizations `_ - `Maximize Performance of Intel® Software Optimization for PyTorch* on CPU `_ -- `Performance Tuning Guide `_ -- `Launch Script Usage Guide `_ +- `Performance Tuning Guide `_ +- `Launch Script Usage Guide `_ - `Top-down Microarchitecture Analysis Method `_ - `Configuring oneDNN for Benchmarking `_ - `Intel® VTune™ Profiler `_ diff --git a/intermediate_source/torchserve_with_ipex_2.rst b/intermediate_source/torchserve_with_ipex_2.rst index 6ace1e6a3e2..64f3db6b27c 100644 --- a/intermediate_source/torchserve_with_ipex_2.rst +++ b/intermediate_source/torchserve_with_ipex_2.rst @@ -366,7 +366,7 @@ Above is oneDNN verbose from channels first. We can verify that there are reorde Above is oneDNN verbose from channels last. 
We can verify that channels last memory format avoids unnecessary reorders. Performance Boost with Intel® Extension for PyTorch* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Below summarizes performance boost of TorchServe with Intel® Extension for PyTorch* for ResNet50 and BERT-base-uncased. .. figure:: /_static/img/torchserve-ipex-images-2/19.png diff --git a/_static/tv-training-code.py b/intermediate_source/torchvision_tutorial.py similarity index 91% rename from _static/tv-training-code.py rename to intermediate_source/torchvision_tutorial.py index bdd93760a7d..d1e4c5c5d5e 100644 --- a/_static/tv-training-code.py +++ b/intermediate_source/torchvision_tutorial.py @@ -6,17 +6,10 @@ ###################################################################### # -# .. tip:: -# -# To get the most of this tutorial, we suggest using this -# `Colab Version `__. -# This will allow you to experiment with the information presented below. -# -# # For this tutorial, we will be finetuning a pre-trained `Mask -# R-CNN `__ model on the `Penn-Fudan +# R-CNN `_ model on the `Penn-Fudan # Database for Pedestrian Detection and -# Segmentation `__. It contains +# Segmentation `_. It contains # 170 images with 345 instances of pedestrians, and we will use it to # illustrate how to use the new features in torchvision in order to train # an object detection and instance segmentation model on a custom dataset. @@ -35,7 +28,7 @@ # The reference scripts for training object detection, instance # segmentation and person keypoint detection allows for easily supporting # adding new custom datasets. The dataset should inherit from the standard -# ``torch.utils.data.Dataset`` class, and implement ``__len__`` and +# :class:`torch.utils.data.Dataset` class, and implement ``__len__`` and # ``__getitem__``. 
# # The only specificity that we require is that the dataset ``__getitem__`` @@ -65,7 +58,7 @@ # ``pycocotools`` which can be installed with ``pip install pycocotools``. # # .. note :: -# For Windows, please install ``pycocotools`` from `gautamchitnis `__ with command +# For Windows, please install ``pycocotools`` from `gautamchitnis `_ with command # # ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI`` # @@ -85,10 +78,16 @@ # Writing a custom dataset for PennFudan # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# Let’s write a dataset for the PennFudan dataset. After `downloading and -# extracting the zip -# file `__, we -# have the following folder structure: +# Let’s write a dataset for the PennFudan dataset. First, let's download the dataset and +# extract the `zip file `_: +# +# .. code:: shell +# +# wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P data +# cd data && unzip PennFudanPed.zip +# +# +# We have the following folder structure: # # :: # @@ -106,21 +105,33 @@ # FudanPed00004.png # # Here is one example of a pair of images and segmentation masks -# -# .. image:: ../../_static/img/tv_tutorial/tv_image01.png -# -# .. image:: ../../_static/img/tv_tutorial/tv_image02.png -# + +import matplotlib.pyplot as plt +from torchvision.io import read_image + + +image = read_image("data/PennFudanPed/PNGImages/FudanPed00046.png") +mask = read_image("data/PennFudanPed/PedMasks/FudanPed00046_mask.png") + +plt.figure(figsize=(16, 8)) +plt.subplot(121) +plt.title("Image") +plt.imshow(image.permute(1, 2, 0)) +plt.subplot(122) +plt.title("Mask") +plt.imshow(mask.permute(1, 2, 0)) + +###################################################################### # So each image has a corresponding # segmentation mask, where each color correspond to a different instance. # Let’s write a :class:`torch.utils.data.Dataset` class for this dataset. 
# In the code below, we are wrapping images, bounding boxes and masks into -# ``torchvision.TVTensor`` classes so that we will be able to apply torchvision +# :class:`torchvision.tv_tensors.TVTensor` classes so that we will be able to apply torchvision # built-in transformations (`new Transforms API `_) # for the given object detection and segmentation task. # Namely, image tensors will be wrapped by :class:`torchvision.tv_tensors.Image`, bounding boxes into # :class:`torchvision.tv_tensors.BoundingBoxes` and masks into :class:`torchvision.tv_tensors.Mask`. -# As ``torchvision.TVTensor`` are :class:`torch.Tensor` subclasses, wrapped objects are also tensors and inherit the plain +# As :class:`torchvision.tv_tensors.TVTensor` are :class:`torch.Tensor` subclasses, wrapped objects are also tensors and inherit the plain # :class:`torch.Tensor` API. For more information about torchvision ``tv_tensors`` see # `this documentation `_. @@ -196,8 +207,8 @@ def __len__(self): # ------------------- # # In this tutorial, we will be using `Mask -# R-CNN `__, which is based on top of -# `Faster R-CNN `__. Faster R-CNN is a +# R-CNN `_, which is based on top of +# `Faster R-CNN `_. Faster R-CNN is a # model that predicts both bounding boxes and class scores for potential # objects in the image. # @@ -345,6 +356,7 @@ def get_model_instance_segmentation(num_classes): os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py") os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py") +###################################################################### # Since v0.15.0 torchvision provides `new Transforms API `_ # to easily write data augmentation pipelines for Object Detection and Segmentation tasks. 
# @@ -362,7 +374,7 @@ def get_transform(train): transforms.append(T.ToPureTensor()) return T.Compose(transforms) - +###################################################################### # Testing ``forward()`` method (Optional) # --------------------------------------- # @@ -370,14 +382,12 @@ def get_transform(train): # expects during training and inference time on sample data. import utils - model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT") dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True)) data_loader = torch.utils.data.DataLoader( dataset, batch_size=2, shuffle=True, - num_workers=4, collate_fn=utils.collate_fn ) @@ -421,7 +431,6 @@ def get_transform(train): dataset, batch_size=2, shuffle=True, - num_workers=4, collate_fn=utils.collate_fn ) @@ -429,7 +438,6 @@ def get_transform(train): dataset_test, batch_size=1, shuffle=False, - num_workers=4, collate_fn=utils.collate_fn ) @@ -455,8 +463,8 @@ def get_transform(train): gamma=0.1 ) -# let's train it for 5 epochs -num_epochs = 5 +# let's train it just for 2 epochs +num_epochs = 2 for epoch in range(num_epochs): # train for one epoch, printing every 10 iterations @@ -477,14 +485,12 @@ def get_transform(train): # But what do the predictions look like? Let’s take one image in the # dataset and verify # -# .. image:: ../../_static/img/tv_tutorial/tv_image05.png -# import matplotlib.pyplot as plt from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks -image = read_image("../_static/img/tv_tutorial/tv_image05.png") +image = read_image("data/PennFudanPed/PNGImages/FudanPed00046.png") eval_transform = get_transform(train=False) model.eval() @@ -517,7 +523,7 @@ def get_transform(train): # # In this tutorial, you have learned how to create your own training # pipeline for object detection models on a custom dataset. 
For -# that, you wrote a ``torch.utils.data.Dataset`` class that returns the +# that, you wrote a :class:`torch.utils.data.Dataset` class that returns the # images and the ground truth boxes and segmentation masks. You also # leveraged a Mask R-CNN model pre-trained on COCO train2017 in order to # perform transfer learning on this new dataset. @@ -526,5 +532,3 @@ def get_transform(train): # training, check ``references/detection/train.py``, which is present in # the torchvision repository. # -# You can download a full source file for this tutorial -# `here `__. \ No newline at end of file diff --git a/intermediate_source/torchvision_tutorial.rst b/intermediate_source/torchvision_tutorial.rst deleted file mode 100644 index a3856c16a11..00000000000 --- a/intermediate_source/torchvision_tutorial.rst +++ /dev/null @@ -1,638 +0,0 @@ -TorchVision Object Detection Finetuning Tutorial -==================================================== - -.. tip:: - - To get the most of this tutorial, we suggest using this - `Colab Version `__. - This will allow you to experiment with the information presented below. - - -For this tutorial, we will be finetuning a pre-trained `Mask -R-CNN `__ model on the `Penn-Fudan -Database for Pedestrian Detection and -Segmentation `__. It contains -170 images with 345 instances of pedestrians, and we will use it to -illustrate how to use the new features in torchvision in order to train -an object detection and instance segmentation model on a custom dataset. - - -.. note :: - - This tutorial works only with torchvision version >=0.16 or nightly. - If you're using torchvision<=0.15, please follow - `this tutorial instead `_. - - -Defining the Dataset --------------------- - -The reference scripts for training object detection, instance -segmentation and person keypoint detection allows for easily supporting -adding new custom datasets. 
The dataset should inherit from the standard -``torch.utils.data.Dataset`` class, and implement ``__len__`` and -``__getitem__``. - -The only specificity that we require is that the dataset ``__getitem__`` -should return a tuple: - -- image: :class:`torchvision.tv_tensors.Image` of shape ``[3, H, W]``, a pure tensor, or a PIL Image of size ``(H, W)`` -- target: a dict containing the following fields - - - ``boxes``, :class:`torchvision.tv_tensors.BoundingBoxes` of shape ``[N, 4]``: - the coordinates of the ``N`` bounding boxes in ``[x0, y0, x1, y1]`` format, ranging from ``0`` - to ``W`` and ``0`` to ``H`` - - ``labels``, integer :class:`torch.Tensor` of shape ``[N]``: the label for each bounding box. - ``0`` represents always the background class. - - ``image_id``, int: an image identifier. It should be - unique between all the images in the dataset, and is used during - evaluation - - ``area``, float :class:`torch.Tensor` of shape ``[N]``: the area of the bounding box. This is used - during evaluation with the COCO metric, to separate the metric - scores between small, medium and large boxes. - - ``iscrowd``, uint8 :class:`torch.Tensor` of shape ``[N]``: instances with ``iscrowd=True`` will be - ignored during evaluation. - - (optionally) ``masks``, :class:`torchvision.tv_tensors.Mask` of shape ``[N, H, W]``: the segmentation - masks for each one of the objects - -If your dataset is compliant with above requirements then it will work for both -training and evaluation codes from the reference script. Evaluation code will use scripts from -``pycocotools`` which can be installed with ``pip install pycocotools``. - -.. note :: - For Windows, please install ``pycocotools`` from `gautamchitnis `__ with command - - ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI`` - -One note on the ``labels``. The model considers class ``0`` as background. 
If your dataset does not contain the background class, -you should not have ``0`` in your ``labels``. For example, assuming you have just two classes, *cat* and *dog*, you can -define ``1`` (not ``0``) to represent *cats* and ``2`` to represent *dogs*. So, for instance, if one of the images has both -classes, your ``labels`` tensor should look like ``[1, 2]``. - -Additionally, if you want to use aspect ratio grouping during training -(so that each batch only contains images with similar aspect ratios), -then it is recommended to also implement a ``get_height_and_width`` -method, which returns the height and the width of the image. If this -method is not provided, we query all elements of the dataset via -``__getitem__`` , which loads the image in memory and is slower than if -a custom method is provided. - -Writing a custom dataset for PennFudan -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Let’s write a dataset for the PennFudan dataset. After `downloading and -extracting the zip -file `__, we -have the following folder structure: - -:: - - PennFudanPed/ - PedMasks/ - FudanPed00001_mask.png - FudanPed00002_mask.png - FudanPed00003_mask.png - FudanPed00004_mask.png - ... - PNGImages/ - FudanPed00001.png - FudanPed00002.png - FudanPed00003.png - FudanPed00004.png - -Here is one example of a pair of images and segmentation masks - -.. image:: ../../_static/img/tv_tutorial/tv_image01.png - -.. image:: ../../_static/img/tv_tutorial/tv_image02.png - -So each image has a corresponding -segmentation mask, where each color correspond to a different instance. -Let’s write a :class:`torch.utils.data.Dataset` class for this dataset. -In the code below, we are wrapping images, bounding boxes and masks into -``torchvision.TVTensor`` classes so that we will be able to apply torchvision -built-in transformations (`new Transforms API `_) -for the given object detection and segmentation task. 
-Namely, image tensors will be wrapped by :class:`torchvision.tv_tensors.Image`, bounding boxes into -:class:`torchvision.tv_tensors.BoundingBoxes` and masks into :class:`torchvision.tv_tensors.Mask`. -As ``torchvision.TVTensor`` are :class:`torch.Tensor` subclasses, wrapped objects are also tensors and inherit the plain -:class:`torch.Tensor` API. For more information about torchvision ``tv_tensors`` see -`this documentation `_. - -.. code:: python - - import os - import torch - - from torchvision.io import read_image - from torchvision.ops.boxes import masks_to_boxes - from torchvision import tv_tensors - from torchvision.transforms.v2 import functional as F - - - class PennFudanDataset(torch.utils.data.Dataset): - def __init__(self, root, transforms): - self.root = root - self.transforms = transforms - # load all image files, sorting them to - # ensure that they are aligned - self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages")))) - self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks")))) - - def __getitem__(self, idx): - # load images and masks - img_path = os.path.join(self.root, "PNGImages", self.imgs[idx]) - mask_path = os.path.join(self.root, "PedMasks", self.masks[idx]) - img = read_image(img_path) - mask = read_image(mask_path) - # instances are encoded as different colors - obj_ids = torch.unique(mask) - # first id is the background, so remove it - obj_ids = obj_ids[1:] - num_objs = len(obj_ids) - - # split the color-encoded mask into a set - # of binary masks - masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8) - - # get bounding box coordinates for each mask - boxes = masks_to_boxes(masks) - - # there is only one class - labels = torch.ones((num_objs,), dtype=torch.int64) - - image_id = idx - area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) - # suppose all instances are not crowd - iscrowd = torch.zeros((num_objs,), dtype=torch.int64) - - # Wrap sample and targets into torchvision tv_tensors: - img 
= tv_tensors.Image(img) - - target = {} - target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img)) - target["masks"] = tv_tensors.Mask(masks) - target["labels"] = labels - target["image_id"] = image_id - target["area"] = area - target["iscrowd"] = iscrowd - - if self.transforms is not None: - img, target = self.transforms(img, target) - - return img, target - - def __len__(self): - return len(self.imgs) - - -That’s all for the dataset. Now let’s define a model that can perform -predictions on this dataset. - -Defining your model -------------------- - -In this tutorial, we will be using `Mask -R-CNN `__, which is based on top of -`Faster R-CNN `__. Faster R-CNN is a -model that predicts both bounding boxes and class scores for potential -objects in the image. - -.. image:: ../../_static/img/tv_tutorial/tv_image03.png - -Mask R-CNN adds an extra branch -into Faster R-CNN, which also predicts segmentation masks for each -instance. - -.. image:: ../../_static/img/tv_tutorial/tv_image04.png - -There are two common -situations where one might want -to modify one of the available models in TorchVision Model Zoo. The first -is when we want to start from a pre-trained model, and just finetune the -last layer. The other is when we want to replace the backbone of the -model with a different one (for faster predictions, for example). - -Let’s go see how we would do one or another in the following sections. - -1 - Finetuning from a pretrained model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Let’s suppose that you want to start from a model pre-trained on COCO -and want to finetune it for your particular classes. Here is a possible -way of doing it: - - -.. 
code:: python - - import torchvision - from torchvision.models.detection.faster_rcnn import FastRCNNPredictor - - # load a model pre-trained on COCO - model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT") - - # replace the classifier with a new one, that has - # num_classes which is user-defined - num_classes = 2 # 1 class (person) + background - # get number of input features for the classifier - in_features = model.roi_heads.box_predictor.cls_score.in_features - # replace the pre-trained head with a new one - model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) - - -2 - Modifying the model to add a different backbone -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - import torchvision - from torchvision.models.detection import FasterRCNN - from torchvision.models.detection.rpn import AnchorGenerator - - # load a pre-trained model for classification and return - # only the features - backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features - # ``FasterRCNN`` needs to know the number of - # output channels in a backbone. For mobilenet_v2, it's 1280 - # so we need to add it here - backbone.out_channels = 1280 - - # let's make the RPN generate 5 x 3 anchors per spatial - # location, with 5 different sizes and 3 different aspect - # ratios. We have a Tuple[Tuple[int]] because each feature - # map could potentially have different sizes and - # aspect ratios - anchor_generator = AnchorGenerator( - sizes=((32, 64, 128, 256, 512),), - aspect_ratios=((0.5, 1.0, 2.0),) - ) - - # let's define what are the feature maps that we will - # use to perform the region of interest cropping, as well as - # the size of the crop after rescaling. - # if your backbone returns a Tensor, featmap_names is expected to - # be [0]. More generally, the backbone should return an - # ``OrderedDict[Tensor]``, and in ``featmap_names`` you can choose which - # feature maps to use. 
- roi_pooler = torchvision.ops.MultiScaleRoIAlign( - featmap_names=['0'], - output_size=7, - sampling_ratio=2, - ) - - # put the pieces together inside a Faster-RCNN model - model = FasterRCNN( - backbone, - num_classes=2, - rpn_anchor_generator=anchor_generator, - box_roi_pool=roi_pooler, - ) - - -Object detection and instance segmentation model for PennFudan Dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In our case, we want to finetune from a pre-trained model, given that -our dataset is very small, so we will be following approach number 1. - -Here we want to also compute the instance segmentation masks, so we will -be using Mask R-CNN: - -.. code:: python - - import torchvision - from torchvision.models.detection.faster_rcnn import FastRCNNPredictor - from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor - - - def get_model_instance_segmentation(num_classes): - # load an instance segmentation model pre-trained on COCO - model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT") - - # get number of input features for the classifier - in_features = model.roi_heads.box_predictor.cls_score.in_features - # replace the pre-trained head with a new one - model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes) - - # now get the number of input features for the mask classifier - in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels - hidden_layer = 256 - # and replace the mask predictor with a new one - model.roi_heads.mask_predictor = MaskRCNNPredictor( - in_features_mask, - hidden_layer, - num_classes, - ) - - return model - - -That’s it, this will make ``model`` be ready to be trained and evaluated -on your custom dataset. - -Putting everything together ---------------------------- - -In ``references/detection/``, we have a number of helper functions to -simplify training and evaluating detection models. 
Here, we will use -``references/detection/engine.py`` and ``references/detection/utils.py``. -Just download everything under ``references/detection`` to your folder and use them here. -On Linux if you have ``wget``, you can download them using below commands: - -.. code:: python - - os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py") - os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py") - os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py") - os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py") - os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py") - - -Since v0.15.0 torchvision provides `new Transforms API `_ -to easily write data augmentation pipelines for Object Detection and Segmentation tasks. - -Let’s write some helper functions for data augmentation / -transformation: - -.. code:: python - - from torchvision.transforms import v2 as T - - - def get_transform(train): - transforms = [] - if train: - transforms.append(T.RandomHorizontalFlip(0.5)) - transforms.append(T.ToDtype(torch.float, scale=True)) - transforms.append(T.ToPureTensor()) - return T.Compose(transforms) - - -Testing ``forward()`` method (Optional) ---------------------------------------- - -Before iterating over the dataset, it's good to see what the model -expects during training and inference time on sample data. - -.. 
code:: python - - import utils - - - model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT") - dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True)) - data_loader = torch.utils.data.DataLoader( - dataset, - batch_size=2, - shuffle=True, - num_workers=4, - collate_fn=utils.collate_fn - ) - - # For Training - images, targets = next(iter(data_loader)) - images = list(image for image in images) - targets = [{k: v for k, v in t.items()} for t in targets] - output = model(images, targets) # Returns losses and detections - print(output) - - # For inference - model.eval() - x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)] - predictions = model(x) # Returns predictions - print(predictions[0]) - -:: - - {'loss_classifier': tensor(0.0820, grad_fn=), 'loss_box_reg': tensor(0.0278, grad_fn=), 'loss_objectness': tensor(0.0027, grad_fn=), 'loss_rpn_box_reg': tensor(0.0036, grad_fn=)} - {'boxes': tensor([], size=(0, 4), grad_fn=), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=)} - - -Let’s now write the main function which performs the training and the -validation: - -.. 
code:: python - - from engine import train_one_epoch, evaluate - - # train on the GPU or on the CPU, if a GPU is not available - device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - - # our dataset has two classes only - background and person - num_classes = 2 - # use our dataset and defined transformations - dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True)) - dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False)) - - # split the dataset in train and test set - indices = torch.randperm(len(dataset)).tolist() - dataset = torch.utils.data.Subset(dataset, indices[:-50]) - dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:]) - - # define training and validation data loaders - data_loader = torch.utils.data.DataLoader( - dataset, - batch_size=2, - shuffle=True, - num_workers=4, - collate_fn=utils.collate_fn - ) - - data_loader_test = torch.utils.data.DataLoader( - dataset_test, - batch_size=1, - shuffle=False, - num_workers=4, - collate_fn=utils.collate_fn - ) - - # get the model using our helper function - model = get_model_instance_segmentation(num_classes) - - # move model to the right device - model.to(device) - - # construct an optimizer - params = [p for p in model.parameters() if p.requires_grad] - optimizer = torch.optim.SGD( - params, - lr=0.005, - momentum=0.9, - weight_decay=0.0005 - ) - - # and a learning rate scheduler - lr_scheduler = torch.optim.lr_scheduler.StepLR( - optimizer, - step_size=3, - gamma=0.1 - ) - - # let's train it for 5 epochs - num_epochs = 5 - - for epoch in range(num_epochs): - # train for one epoch, printing every 10 iterations - train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) - # update the learning rate - lr_scheduler.step() - # evaluate on the test dataset - evaluate(model, data_loader_test, device=device) - - print("That's it!") - -:: - - Epoch: [0] [ 0/60] eta: 0:02:43 lr: 0.000090 loss: 2.8181 (2.8181) 
loss_classifier: 0.5218 (0.5218) loss_box_reg: 0.1272 (0.1272) loss_mask: 2.1324 (2.1324) loss_objectness: 0.0346 (0.0346) loss_rpn_box_reg: 0.0022 (0.0022) time: 2.7332 data: 0.4483 max mem: 1984 - Epoch: [0] [10/60] eta: 0:00:24 lr: 0.000936 loss: 1.3190 (1.6752) loss_classifier: 0.4611 (0.4213) loss_box_reg: 0.2928 (0.3031) loss_mask: 0.6962 (0.9183) loss_objectness: 0.0238 (0.0253) loss_rpn_box_reg: 0.0074 (0.0072) time: 0.4944 data: 0.0439 max mem: 2762 - Epoch: [0] [20/60] eta: 0:00:13 lr: 0.001783 loss: 0.9419 (1.2621) loss_classifier: 0.2171 (0.3037) loss_box_reg: 0.2906 (0.3064) loss_mask: 0.4174 (0.6243) loss_objectness: 0.0190 (0.0210) loss_rpn_box_reg: 0.0059 (0.0068) time: 0.2108 data: 0.0042 max mem: 2823 - Epoch: [0] [30/60] eta: 0:00:08 lr: 0.002629 loss: 0.6349 (1.0344) loss_classifier: 0.1184 (0.2339) loss_box_reg: 0.2706 (0.2873) loss_mask: 0.2276 (0.4897) loss_objectness: 0.0065 (0.0168) loss_rpn_box_reg: 0.0059 (0.0067) time: 0.1650 data: 0.0051 max mem: 2823 - Epoch: [0] [40/60] eta: 0:00:05 lr: 0.003476 loss: 0.4631 (0.8771) loss_classifier: 0.0650 (0.1884) loss_box_reg: 0.1924 (0.2604) loss_mask: 0.1734 (0.4084) loss_objectness: 0.0029 (0.0135) loss_rpn_box_reg: 0.0051 (0.0063) time: 0.1760 data: 0.0052 max mem: 2823 - Epoch: [0] [50/60] eta: 0:00:02 lr: 0.004323 loss: 0.3261 (0.7754) loss_classifier: 0.0368 (0.1606) loss_box_reg: 0.1424 (0.2366) loss_mask: 0.1479 (0.3599) loss_objectness: 0.0022 (0.0116) loss_rpn_box_reg: 0.0051 (0.0067) time: 0.1775 data: 0.0052 max mem: 2823 - Epoch: [0] [59/60] eta: 0:00:00 lr: 0.005000 loss: 0.3261 (0.7075) loss_classifier: 0.0415 (0.1433) loss_box_reg: 0.1114 (0.2157) loss_mask: 0.1573 (0.3316) loss_objectness: 0.0020 (0.0103) loss_rpn_box_reg: 0.0052 (0.0066) time: 0.2064 data: 0.0049 max mem: 2823 - Epoch: [0] Total time: 0:00:14 (0.2412 s / it) - creating index... - index created! 
- Test: [ 0/50] eta: 0:00:25 model_time: 0.1576 (0.1576) evaluator_time: 0.0029 (0.0029) time: 0.5063 data: 0.3452 max mem: 2823 - Test: [49/50] eta: 0:00:00 model_time: 0.0335 (0.0701) evaluator_time: 0.0025 (0.0038) time: 0.0594 data: 0.0025 max mem: 2823 - Test: Total time: 0:00:04 (0.0862 s / it) - Averaged stats: model_time: 0.0335 (0.0701) evaluator_time: 0.0025 (0.0038) - Accumulating evaluation results... - DONE (t=0.01s). - Accumulating evaluation results... - DONE (t=0.01s). - IoU metric: bbox - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.722 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.987 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.938 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.359 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.752 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.730 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.353 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.762 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.762 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.500 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.775 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.769 - IoU metric: segm - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.726 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.993 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.913 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.344 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.593 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.743 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.360 - Average 
Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.760 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.760 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.633 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.662 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.772 - - ... - - Epoch: [4] [ 0/60] eta: 0:00:32 lr: 0.000500 loss: 0.1593 (0.1593) loss_classifier: 0.0194 (0.0194) loss_box_reg: 0.0272 (0.0272) loss_mask: 0.1046 (0.1046) loss_objectness: 0.0044 (0.0044) loss_rpn_box_reg: 0.0037 (0.0037) time: 0.5369 data: 0.3801 max mem: 3064 - Epoch: [4] [10/60] eta: 0:00:10 lr: 0.000500 loss: 0.1609 (0.1870) loss_classifier: 0.0194 (0.0236) loss_box_reg: 0.0272 (0.0383) loss_mask: 0.1140 (0.1190) loss_objectness: 0.0005 (0.0023) loss_rpn_box_reg: 0.0029 (0.0037) time: 0.2016 data: 0.0378 max mem: 3064 - Epoch: [4] [20/60] eta: 0:00:08 lr: 0.000500 loss: 0.1652 (0.1826) loss_classifier: 0.0224 (0.0242) loss_box_reg: 0.0286 (0.0374) loss_mask: 0.1075 (0.1165) loss_objectness: 0.0003 (0.0016) loss_rpn_box_reg: 0.0016 (0.0029) time: 0.1866 data: 0.0044 max mem: 3064 - Epoch: [4] [30/60] eta: 0:00:06 lr: 0.000500 loss: 0.1676 (0.1884) loss_classifier: 0.0245 (0.0264) loss_box_reg: 0.0286 (0.0401) loss_mask: 0.1075 (0.1175) loss_objectness: 0.0003 (0.0013) loss_rpn_box_reg: 0.0018 (0.0030) time: 0.2106 data: 0.0055 max mem: 3064 - Epoch: [4] [40/60] eta: 0:00:03 lr: 0.000500 loss: 0.1726 (0.1884) loss_classifier: 0.0245 (0.0265) loss_box_reg: 0.0283 (0.0394) loss_mask: 0.1187 (0.1184) loss_objectness: 0.0003 (0.0011) loss_rpn_box_reg: 0.0020 (0.0029) time: 0.1897 data: 0.0056 max mem: 3064 - Epoch: [4] [50/60] eta: 0:00:01 lr: 0.000500 loss: 0.1910 (0.1938) loss_classifier: 0.0273 (0.0280) loss_box_reg: 0.0414 (0.0418) loss_mask: 0.1177 (0.1198) loss_objectness: 0.0003 (0.0010) loss_rpn_box_reg: 0.0022 (0.0031) time: 0.1623 data: 0.0056 max mem: 3064 - Epoch: [4] [59/60] 
eta: 0:00:00 lr: 0.000500 loss: 0.1732 (0.1888) loss_classifier: 0.0273 (0.0278) loss_box_reg: 0.0327 (0.0405) loss_mask: 0.0993 (0.1165) loss_objectness: 0.0003 (0.0010) loss_rpn_box_reg: 0.0023 (0.0030) time: 0.1732 data: 0.0056 max mem: 3064 - Epoch: [4] Total time: 0:00:11 (0.1920 s / it) - creating index... - index created! - Test: [ 0/50] eta: 0:00:21 model_time: 0.0589 (0.0589) evaluator_time: 0.0032 (0.0032) time: 0.4269 data: 0.3641 max mem: 3064 - Test: [49/50] eta: 0:00:00 model_time: 0.0515 (0.0521) evaluator_time: 0.0020 (0.0031) time: 0.0579 data: 0.0024 max mem: 3064 - Test: Total time: 0:00:03 (0.0679 s / it) - Averaged stats: model_time: 0.0515 (0.0521) evaluator_time: 0.0020 (0.0031) - Accumulating evaluation results... - DONE (t=0.01s). - Accumulating evaluation results... - DONE (t=0.01s). - IoU metric: bbox - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.846 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.997 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.978 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.412 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.689 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.864 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.417 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.876 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.876 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.567 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.750 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.896 - IoU metric: segm - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.777 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.997 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 
0.961 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.424 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.631 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.791 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.373 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.814 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.814 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.633 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.713 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.827 - - That's it! - - -So after one epoch of training, we obtain a COCO-style mAP > 50, and -a mask mAP of 65. - -But what do the predictions look like? Let’s take one image in the -dataset and verify - -.. image:: ../../_static/img/tv_tutorial/tv_image05.png - -.. code:: python - - import matplotlib.pyplot as plt - from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks - - image = read_image("../_static/img/tv_tutorial/tv_image05.png") - eval_transform = get_transform(train=False) - - model.eval() - with torch.no_grad(): - x = eval_transform(image) - # convert RGBA -> RGB and move to device - x = x[:3, ...].to(device) - predictions = model([x, ]) - pred = predictions[0] - - image = (255.0 * (image - image.min()) / (image.max() - image.min())).to(torch.uint8) - image = image[:3, ...] - pred_labels = [f"pedestrian: {score:.3f}" for label, score in zip(pred["labels"], pred["scores"])] - pred_boxes = pred["boxes"].long() - output_image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red") - - masks = (pred["masks"] > 0.7).squeeze(1) - output_image = draw_segmentation_masks(output_image, masks, alpha=0.5, colors="blue") - - plt.figure(figsize=(12, 12)) - plt.imshow(output_image.permute(1, 2, 0)) - - -.. 
image:: ../../_static/img/tv_tutorial/tv_image06.png - - -The results look good! - -Wrapping up ------------ - -In this tutorial, you have learned how to create your own training -pipeline for object detection models on a custom dataset. For -that, you wrote a ``torch.utils.data.Dataset`` class that returns the -images and the ground truth boxes and segmentation masks. You also -leveraged a Mask R-CNN model pre-trained on COCO train2017 in order to -perform transfer learning on this new dataset. - -For a more complete example, which includes multi-machine / multi-GPU -training, check ``references/detection/train.py``, which is present in -the torchvision repository. - -You can download a full source file for this tutorial -`here `__. \ No newline at end of file diff --git a/intermediate_source/transformer_building_blocks.py b/intermediate_source/transformer_building_blocks.py new file mode 100644 index 00000000000..932be472e89 --- /dev/null +++ b/intermediate_source/transformer_building_blocks.py @@ -0,0 +1,781 @@ +""" +Accelerating PyTorch Transformers by replacing ``nn.Transformer`` with Nested Tensors and ``torch.compile()`` +============================================================================================================= +**Author:** `Mikayla Gawarecki `_ + +.. note:: + This tutorial currently requires you to use the PyTorch nightly build. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Learn about the low-level building blocks PyTorch provides to build custom transformer layers ( + nested tensors, ``scaled_dot_product_attention``, ``torch.compile()``, and ``FlexAttention``) + * Discover how the above improve memory usage and performance using MultiHeadAttention as an example + * Explore advanced customizations using the aforementioned building blocks + + .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v.2.6.0 or later + + +Over the past few years, the PyTorch team has developed various lower level +features that, when composed, can create a variety of transformer variants. These +include: + +* Nested Tensors with the ``torch.jagged`` layout (AKA NJTs) +* ``scaled_dot_product_attention`` +* ``torch.compile()`` +* ``FlexAttention`` + +This tutorial will give a brief overview of the above technologies and +demonstrate how they can be composed to yield flexible and performant transformer \ +layers with improved user experience. + +One may observe that the ``torch.nn`` module currently provides various ``Transformer``-related layers. +In particular, it includes ``TransformerEncoderLayer``, ``TransformerEncoder``, ``TransformerDecoderLayer``, +``TransformerDecoder``, ``Transformer`` and ``MultiheadAttention``. This family +of layers was initially implemented following the `Attention is All +You Need `_ paper. The components discussed in +this tutorial provide improved user experience, flexibility and performance over +the existing ``nn`` layers. + + +Is this tutorial for me? +======================== + +If you are wondering about what building blocks the ``torch`` library provides +for writing your own transformer layers and best practices, you are in the +right place. Please keep reading! + +If you are looking for an out-of-the-box implementation of a popular transformer +architecture, note that there are many open-source libraries that provide them, +including: + +* `HuggingFace transformers `_ +* `xformers `_ +* `torchtune `_ + +If you are only interested in performant attention score modifications, please +check out the `FlexAttention blog `_ that +contains a `gym of masks `_. 
+ +""" + +################################################################################ +# Introducing the Building Blocks +# =============================== +# First, we will briefly introduce the four technologies mentioned in the introduction +# +# * `torch.nested `_ +# +# Nested tensors generalize the shape of regular dense tensors, allowing for +# representation of ragged-sized data with the same tensor UX. In the context of +# transformers, we can think of nested tensors as a tool for representing variable +# sequence lengths. They eliminate the need for the bug-prone practices of explicit +# padding and masking (think ``key_padding_mask`` in ``nn.MultiheadAttention``). +# +# * `scaled_dot_product_attention `_ +# +# ``scaled_dot_product_attention`` is a primitive for +# :math:`\text{softmax}(\frac{QK^T}{\sqrt{E}} + B)V` that dispatches into either fused +# implementations of the operator or a fallback implementation. It works out of +# the box in eager mode (i.e. the default mode of using PyTorch where operations +# are executed on the fly as they are encountered) and also integrates seamlessly +# with ``torch.compile()``. As of 2.6, it will also offer grouped query attention +# natively. +# +# * `torch.compile() `_ +# +# ``torch.compile()`` is a compiler introduced in version 2.0 that is able to +# capture a graph of PyTorch code and perform various optimizations on it, such as +# fusing together sequences of ops. Nested tensors with the ``torch.jagged`` layout +# and ``scaled_dot_product_attention`` work seamlessly with compile. In the +# context of transformers, the value add of using compile with nested tensor +# and SDPA is that compile can remove framework overhead one sees in eager mode +# and fuse sequences of ops in transformers together, such as projection and +# activation. +# +# * `FlexAttention `_ +# +# ``FlexAttention`` is a primitive that allows users to modify attention scores +# prior to the softmax operation. 
It generalizes the additive ``B`` term above +# for ``scaled_dot_product_attention``, allowing for arbitrary calculation. It +# requires compile to achieve good performance. +# +# The above building blocks are "All You Need" (as of October 2024) +# ================================================================== +# +# The main premise in this section is that most transformer variations are +# GPT-style, consisting of layers like Embedding, Positional Encoding, Attention +# Blocks and Feed Forward networks. If we were to try to classify the differences +# in this space, we might land on something like: +# +# 1. Layer type (activation functions such as ``SwiGLU`` and others, normalization functions +# such as ``RMSNorm`` and others, positional encodings, such as Sinusoidal, Rotary.) +# 2. Layer ordering, such as where to apply norms and positional encoding. +# 3. Modifications to attention score, such as ``ALiBi``, Relative Positional Bias and so on. +# +# +# In a pre-compiler environment, you might write a custom transformer and notice +# that it functions correctly but is slow. To address this, you might develop a +# custom fused kernel for the specific series of operations. In a compiler environment, +# you can simply perform the initial step and then compile and benefit from improved performance. + + +############################################################################### +# MultiheadAttention +# ------------------ +# Remember that MultiheadAttention takes in a query, key, and value, and consists +# of an input projection, a ``scaled_dot_product_attention`` operator and an +# output projection. The main takeaway we want to demonstrate here is the +# improvement yielded when we replaced padded/masked inputs with nested tensors. +# The improvements are threefold: +# +# * **User Experience** +# Remember that ``nn.MultiheadAttention`` requires ``query``, ``key``, and +# ``value`` to be dense ``torch.Tensors``. 
#   It also provides a
#   ``key_padding_mask`` that is used to mask out padding tokens in the ``key``
#   that arise due to different sequence lengths within a batch. Since there is
#   no ``query_padding_mask`` in ``nn.MHA``, users have to take care to mask/slice
#   the outputs appropriately to account for query sequence lengths. ``NestedTensor``
#   cleanly removes the need for this sort of error-prone padding masks.
#
# * **Memory**
#   Instead of materializing a dense ``[B, S, D]`` tensor with a ``[B, S]``
#   padding mask (where ``B`` is batch size, ``S`` is max sequence length in the
#   batch and ``D`` is embedding size), nested tensors allow you to cleanly
#   represent the batch of varying sequence lengths. As a result, the input and
#   intermediate activations will use less memory.
#
# * **Performance**
#   Since padding is not materialized and unnecessary computation on padding is
#   skipped, performance and memory usage improve.
#
# We'll demonstrate the above by building upon the ``MultiheadAttention`` layer in the
# `Nested Tensor tutorial `_
# and comparing it to the ``nn.MultiheadAttention`` layer.

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """
    Computes multi-head attention. Supports nested or padded tensors.

    Args:
        E_q (int): Size of embedding dim for query
        E_k (int): Size of embedding dim for key
        E_v (int): Size of embedding dim for value
        E_total (int): Total embedding dim of combined heads post input projection. Each head
            has dim E_total // nheads
        nheads (int): Number of heads
        dropout (float, optional): Dropout probability applied to the attention weights.
            Only active while the module is in training mode. Default: 0.0
        bias (bool, optional): Whether to add bias to the input and output projections. Default: True
        device (optional): Device passed through to the ``nn.Linear`` projections.
        dtype (optional): Dtype passed through to the ``nn.Linear`` projections.

    Raises:
        AssertionError: if ``E_total`` is not divisible by ``nheads``.
    """
    def __init__(
        self,
        E_q: int,
        E_k: int,
        E_v: int,
        E_total: int,
        nheads: int,
        dropout: float = 0.0,
        bias=True,
        device=None,
        dtype=None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.nheads = nheads
        self.dropout = dropout
        # When all three input dims agree, a single fused projection produces q, k and v.
        self._qkv_same_embed_dim = E_q == E_k and E_q == E_v
        if self._qkv_same_embed_dim:
            self.packed_proj = nn.Linear(E_q, E_total * 3, bias=bias, **factory_kwargs)
        else:
            self.q_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs)
            self.k_proj = nn.Linear(E_k, E_total, bias=bias, **factory_kwargs)
            self.v_proj = nn.Linear(E_v, E_total, bias=bias, **factory_kwargs)
        # The output is projected back to the query embedding size.
        E_out = E_q
        self.out_proj = nn.Linear(E_total, E_out, bias=bias, **factory_kwargs)
        assert E_total % nheads == 0, "Embedding dim is not divisible by nheads"
        self.E_head = E_total // nheads
        self.bias = bias

    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                attn_mask=None,
                is_causal=False) -> torch.Tensor:
        """
        Forward pass; runs the following process:
            1. Apply input projection
            2. Split heads and prepare for SDPA
            3. Run SDPA
            4. Apply output projection

        Args:
            query (torch.Tensor): query of shape (``N``, ``L_q``, ``E_q``)
            key (torch.Tensor): key of shape (``N``, ``L_kv``, ``E_k``)
            value (torch.Tensor): value of shape (``N``, ``L_kv``, ``E_v``)
            attn_mask (torch.Tensor, optional): attention mask to pass to SDPA. A mask of shape
                (``N``, ``L_q``, ``L_kv``) is applied to every head; any other shape must be
                broadcastable to (``N``, ``nheads``, ``L_q``, ``L_kv``). Default: None
            is_causal (bool, optional): Whether to apply causal mask. SDPA rejects the
                combination of an explicit ``attn_mask`` with ``is_causal=True``. Default: False

        Returns:
            attn_output (torch.Tensor): output of shape (``N``, ``L_q``, ``E_q``)
        """
        # Step 1. Apply input projection
        if self._qkv_same_embed_dim:
            if query is key and key is value:
                # Self-attention: one matmul, then split the result into q, k, v.
                result = self.packed_proj(query)
                query, key, value = torch.chunk(result, 3, dim=-1)
            else:
                # Same embedding dims but distinct inputs: slice the packed
                # weights and project each input separately.
                q_weight, k_weight, v_weight = torch.chunk(self.packed_proj.weight, 3, dim=0)
                if self.bias:
                    q_bias, k_bias, v_bias = torch.chunk(self.packed_proj.bias, 3, dim=0)
                else:
                    q_bias, k_bias, v_bias = None, None, None
                query, key, value = F.linear(query, q_weight, q_bias), F.linear(key, k_weight, k_bias), F.linear(value, v_weight, v_bias)

        else:
            query = self.q_proj(query)
            key = self.k_proj(key)
            value = self.v_proj(value)

        # Step 2. Split heads and prepare for SDPA
        # reshape query, key, value to separate by head
        # (N, L_q, E_total) -> (N, L_q, nheads, E_head) -> (N, nheads, L_q, E_head)
        query = query.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2)
        # (N, L_kv, E_total) -> (N, L_kv, nheads, E_head) -> (N, nheads, L_kv, E_head)
        key = key.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2)
        # (N, L_kv, E_total) -> (N, L_kv, nheads, E_head) -> (N, nheads, L_kv, E_head)
        value = value.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2)

        # Step 3. Run SDPA
        # A per-batch (N, L_q, L_kv) mask needs an explicit head axis; without it
        # the batch dim would be broadcast against the head dim.
        if attn_mask is not None and attn_mask.dim() == 3:
            attn_mask = attn_mask.unsqueeze(1)
        # The functional SDPA always applies ``dropout_p``, so dropout has to be
        # switched off here when the module is in eval mode.
        dropout_p = self.dropout if self.training else 0.0
        # (N, nheads, L_q, E_head)
        attn_output = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal)
        # (N, nheads, L_q, E_head) -> (N, L_q, nheads, E_head) -> (N, L_q, E_total)
        attn_output = attn_output.transpose(1, 2).flatten(-2)

        # Step 4. Apply output projection
        # (N, L_q, E_total) -> (N, L_q, E_out)
        attn_output = self.out_proj(attn_output)

        return attn_output


###############################################################################
# Utilities
# ~~~~~~~~~
# In this section, we include a utility to generate semi-realistic data using
# ``Zipf`` distribution for sentence lengths. This is used to generate the nested
# query, key, and value tensors. We also include a benchmark utility.
+ + +import numpy as np + +def zipf_sentence_lengths(alpha: float, batch_size: int) -> torch.Tensor: + # generate fake corpus by unigram Zipf distribution + # from wikitext-2 corpus, we get rank "." = 3, "!" = 386, "?" = 858 + sentence_lengths = np.empty(batch_size, dtype=int) + for ibatch in range(batch_size): + sentence_lengths[ibatch] = 1 + word = np.random.zipf(alpha) + while word != 3 and word != 386 and word != 858: + sentence_lengths[ibatch] += 1 + word = np.random.zipf(alpha) + return torch.tensor(sentence_lengths) + +# Generate a batch of semi-realistic data using Zipf distribution for sentence lengths +# in the form of nested tensors with the jagged layout. +def gen_batch(N, E_q, E_k, E_v, device, dtype=torch.float32, query_seq_len_1=False): + # generate semi-realistic data using Zipf distribution for sentence lengths + sentence_lengths = zipf_sentence_lengths(alpha=1.2, batch_size=N) + + # Note: the torch.jagged layout is a nested tensor layout that supports a single ragged + # dimension and works with torch.compile. The batch items each have shape (B, S*, D) + # where B = batch size, S* = ragged sequence length, and D = embedding dimension. 
+ if query_seq_len_1: + query = torch.nested.nested_tensor([ + torch.randn(1, E_q, dtype=dtype, device=device) + for l in sentence_lengths + ], layout=torch.jagged) + else: + query = torch.nested.nested_tensor([ + torch.randn(l.item(), E_q, dtype=dtype, device=device) + for l in sentence_lengths + ], layout=torch.jagged) + + key = torch.nested.nested_tensor([ + torch.randn(s.item(), E_k, dtype=dtype, device=device) + for s in sentence_lengths + ], layout=torch.jagged) + + value = torch.nested.nested_tensor([ + torch.randn(s.item(), E_v, dtype=dtype, device=device) + for s in sentence_lengths + ], layout=torch.jagged) + + return query, key, value, sentence_lengths + +import timeit +import math + +def benchmark(func, *args, **kwargs): + torch.cuda.synchronize() + torch.cuda.reset_peak_memory_stats() + begin = timeit.default_timer() + output = func(*args, **kwargs) + torch.cuda.synchronize() + end = timeit.default_timer() + return output, (end - begin), torch.cuda.max_memory_allocated() + +############################################################################## +# We will now demonstrate the performance improvements of using nested tensors +# in the ``MultiheadAttention`` layer + compile for self attention. We compare this against +# the traditional ``nn.MultiheadAttention`` + compile with padding and masking. 

# Problem size. All embedding dims are equal, so ``MultiHeadAttention`` takes its
# packed-projection path (the code below relies on ``mha_layer.packed_proj``).
N, E_q, E_k, E_v, E_total = 512, 512, 512, 512, 512
E_out = E_q
d_model = E_q
nheads = 8
dropout = 0.0
bias = True
device='cuda'
torch.manual_seed(6)
query, key, value, sentence_lengths = gen_batch(N, E_q, E_k, E_v, device)
# S is the longest sequence in the batch, i.e. the padded sequence length.
S = sentence_lengths.max().item()
print(f"Total sequence length in nested query {sentence_lengths.sum().item()}, max sequence length {S}")
# Dense, zero-padded copies of the nested inputs for the ``nn.MultiheadAttention`` baseline.
padded_query, padded_key, padded_value = (
    t.to_padded_tensor(0.0) for t in (query, key, value)
)

torch.manual_seed(6)
mha_layer = MultiHeadAttention(E_q, E_k, E_v, E_total, nheads, dropout=dropout, bias=bias, device='cuda')
torch.manual_seed(6)
vanilla_mha_layer = nn.MultiheadAttention(E_q, nheads, dropout=dropout, batch_first=True, bias=bias, device='cuda')

# ``nn.MultiheadAttention`` uses a non conventional initialization for layers, so do this for exact parity :(
# (the custom layer's parameters are replaced by detached copies of the vanilla layer's)
mha_layer.out_proj.weight = nn.Parameter(vanilla_mha_layer.out_proj.weight.clone().detach())
mha_layer.packed_proj.weight = nn.Parameter(vanilla_mha_layer.in_proj_weight.clone().detach())
mha_layer.out_proj.bias = nn.Parameter(vanilla_mha_layer.out_proj.bias.clone().detach())
mha_layer.packed_proj.bias = nn.Parameter(vanilla_mha_layer.in_proj_bias.clone().detach())

new_mha_layer = torch.compile(mha_layer)
# warmup compile
nested_result_warmup = new_mha_layer(query, query, query, is_causal=True)

# benchmark
nested_result, nested_time, nested_peak_memory = benchmark(new_mha_layer, query, query, query, is_causal=True)
# Pad the nested output with zeros so it can be compared elementwise with the dense result.
padded_nested_result = nested_result.to_padded_tensor(0.0)

# For the vanilla ``nn.MultiheadAttention``, we need to construct the ``key_padding_mask``
# Further, ``nn.MultiheadAttention`` forces one to materialize the ``attn_mask`` even if using ``is_causal``
# A position is treated as padding when its first feature is exactly 0.0 (the fill
# value used by ``to_padded_tensor`` above); this assumes no real token has an
# exactly-zero first feature.
src_key_padding_mask = torch.where(padded_query == 0.0, -math.inf, 0)[:, :, 0]
# Start fully masked (-inf), then write a causal mask into the top-left
# ``s x s`` corner for each sequence of true length ``s``.
attn_mask = torch.empty((N, S, S), device=device).fill_(float('-inf'))
for i, s in enumerate(sentence_lengths):
    attn_mask[i, :s, :s] = nn.Transformer.generate_square_subsequent_mask(s)
# Repeat each per-sequence mask across heads: (N, S, S) -> (N * nheads, S, S).
attn_mask = attn_mask.unsqueeze(1).expand(N, nheads, S, S).reshape(N*nheads, S, S)

vanilla_mha_layer = torch.compile(vanilla_mha_layer)
# warmup compile
warmup_vanilla_result = vanilla_mha_layer(padded_query,
                                          padded_query,
                                          padded_query,
                                          attn_mask=attn_mask,
                                          key_padding_mask=src_key_padding_mask,
                                          need_weights=False,
                                          is_causal=True)

# benchmark
(padded_result, _), padded_time, padded_peak_memory = benchmark(vanilla_mha_layer,
                                                                 padded_query,
                                                                 padded_query,
                                                                 padded_query,
                                                                 key_padding_mask=src_key_padding_mask,
                                                                 need_weights=False,
                                                                 attn_mask=attn_mask,
                                                                 is_causal=True)

# ``benchmark`` reports peak memory in bytes; dividing by 1e9 prints it in GB.
print(f"{padded_time=:.5f}, padded_peak_memory={padded_peak_memory/1e9:.2f} GB")
print(f"{nested_time=:.5f}, nested_peak_memory={nested_peak_memory/1e9:.2f} GB")
print("Max difference between vanilla and nested result", (padded_result - padded_nested_result).abs().max().item())
print(f"Nested speedup: {(padded_time/nested_time):.2f}")
print(f"Nested peak memory reduction {((padded_peak_memory - nested_peak_memory)/1e9):.2f} GB")

######################################################################################
# For reference, here are some sample outputs on A100:
#
# .. code::
#
#     padded_time=0.03454, padded_peak_memory=4.14 GB
#     nested_time=0.00612, nested_peak_memory=0.76 GB
#     Max difference between vanilla and nested result 0.0
#     Nested speedup: 5.65
#     Nested peak memory reduction 3.39 GB
#
# We can also see the same for backward pass

for i, entry_length in enumerate(sentence_lengths):
    # padding-specific step: remove output projection bias from padded entries for fair comparison
    # (zeroes, in place, every row past the sequence's true length)
    padded_result[i, entry_length:, :] = 0.0

# Time one backward pass from the summed output of each variant.
_, padded_bw_time, padded_bw_peak_mem = benchmark(lambda : padded_result.sum().backward())
_, nested_bw_time, nested_bw_peak_mem = benchmark(lambda : padded_nested_result.sum().backward())

print(f"{padded_bw_time=:.5f}, padded_bw_peak_mem={padded_bw_peak_mem/1e9:.2f} GB")
print(f"{nested_bw_time=:.5f}, nested_bw_peak_mem={nested_bw_peak_mem/1e9:.2f} GB")
print(f"Nested backward speedup: {(padded_bw_time/nested_bw_time):.2f}")
print(f"Nested backward peak memory reduction {((padded_bw_peak_mem - nested_bw_peak_mem)/1e9):.2f} GB")

# Largest absolute gradient difference between the two layers, per parameter.
print("Difference in out_proj.weight.grad", (mha_layer.out_proj.weight.grad - vanilla_mha_layer.out_proj.weight.grad).abs().max().item())
print("Difference in packed_proj.weight.grad", (mha_layer.packed_proj.weight.grad - vanilla_mha_layer.in_proj_weight.grad).abs().max().item())
print("Difference in out_proj.bias.grad", (mha_layer.out_proj.bias.grad - vanilla_mha_layer.out_proj.bias.grad).abs().max().item())
print("Difference in packed_proj.bias.grad", (mha_layer.packed_proj.bias.grad - vanilla_mha_layer.in_proj_bias.grad).abs().max().item())

##################################################################################
# Sample outputs on A100:
#
# .. code::
#
#     padded_bw_time=2.09337, padded_bw_peak_mem=5.10 GB
#     nested_bw_time=0.01452, nested_bw_peak_mem=3.24 GB
#     Nested backward speedup: 144.13
#     Nested backward peak memory reduction 1.86 GB
#     Difference in out_proj.weight.grad 0.000244140625
#     Difference in packed_proj.weight.grad 0.001556396484375
#     Difference in out_proj.bias.grad 0.0
#     Difference in packed_proj.bias.grad 0.001953125
#

##################################################################################
# GPT-style layer
# ---------------
# A basic GPT-style transformer layer consists of a causal self-attention layer
# followed by a feed-forward network (FFN) with skip connections. Implementing
# this is fairly straightforward using the ``MultiheadAttention`` layer above and
# gives equivalent results to an ``nn.TransformerEncoderLayer`` with
# ``is_causal=True``.
#
# We demonstrate examples of implementing the rest of the ``nn`` layers
# `here `_
# but omit that from this tutorial for brevity.


###############################################################################
# Going one step further
# ----------------------
# So far, we have demonstrated how to implement a performant ``MultiheadAttention``
# layer that follows the traditional ``nn.MultiheadAttention``. Going back to our
# classification of modifications to the transformer architecture, remember that we
# classified the modifications into layer type, layer ordering, and modifications
# to the attention score. We trust that changing layer type and layer ordering
# (such as swapping ``LayerNorm`` for ``RMSNorm``) is fairly straightforward.
#
# In this section, we will discuss various functionalities using the
# aforementioned building blocks, including the following:
#
# * Cross Attention
# * Fully masked rows no longer cause NaNs
# * Modifying attention score: ALiBi with FlexAttention and NJT
# * Packed Projection

###############################################################################
# Cross Attention
# ---------------
# Cross attention is a form of attention where the query and key/value tensors
# are from different sequences.
#
# One example of this is in ``nn.TransformerDecoderLayer`` where the query comes
# from the decoder and the key/value come from the encoder.
#
# The above MultiheadAttention layer nicely generalizes to this case with nested
# tensors for both query and key/value.

# The query and the key/value come from two separate ``gen_batch`` calls, so their
# sentence lengths are sampled independently.
query, _, _, q_len = gen_batch(N, E_q, E_k, E_v, device)
_, key, value, kv_len = gen_batch(N, E_q, E_k, E_v, device)

print(f"Total sequence length in nested query {q_len.sum().item()}, max sequence length {q_len.max().item()}")
print(f"Total sequence length in nested key/value {kv_len.sum().item()}, max sequence length {kv_len.max().item()}")
out = new_mha_layer(query, key, value, is_causal=False)

########################################################################################
# As above, we can compare this against the vanilla compiled ``nn.MultiheadAttention``.

# Re-seed and regenerate the inputs before building their padded counterparts.
torch.manual_seed(6)
query, _, _, q_len = gen_batch(N, E_q, E_k, E_v, device)
_, key, value, kv_len = gen_batch(N, E_q, E_k, E_v, device)
padded_query, padded_key, padded_value = (
    t.to_padded_tensor(0.0) for t in (query, key, value)
)

# Same padding heuristic as before, now applied to the key: a position counts as
# padding when its first feature is exactly 0.0.
key_padding_mask = torch.where(padded_key == 0.0, -math.inf, 0)[:, :, 0]

# warmup compile
warmup_nested_result = new_mha_layer(query, key, value, is_causal=False)
warmup_vanilla_result = vanilla_mha_layer(padded_query,
                                          padded_key,
                                          padded_value,
                                          key_padding_mask=key_padding_mask,
                                          need_weights=False,
                                          is_causal=False)

nested_result, nested_time, nested_peak_memory = benchmark(new_mha_layer, query, key, value, is_causal=False)
(padded_result, _), padded_time, padded_peak_memory = benchmark(vanilla_mha_layer,
                                                                 padded_query,
                                                                 padded_key,
                                                                 padded_value,
                                                                 key_padding_mask=key_padding_mask,
                                                                 need_weights=False,
                                                                 is_causal=False)
padded_nested_result = nested_result.to_padded_tensor(0.0)
for i, entry_length in enumerate(q_len):
    # padding-specific step: remove output projection bias from padded entries for fair comparison
    # (output rows follow the query, hence the query lengths are used here)
    padded_result[i, entry_length:, :] = 0.0

print("Max difference between vanilla and nested result", (padded_result - padded_nested_result).abs().max().item())
print(f"Nested speedup: {(padded_time/nested_time):.2f}")
print(f"Nested peak memory reduction {((padded_peak_memory - nested_peak_memory)/1e9):.2f} GB")

##################################################################################
# Sample outputs on A100:
#
# .. 
code:: +# +# Max difference between vanilla and nested result 0.0 +# Nested speedup: 4.01 +# Nested peak memory reduction 1.40 GB +# + +################################################################################ +# Fully masked rows no longer cause NaNs +# -------------------------------------- +# +# There has been a long-standing issue with ``nn.MultiheadAttention`` and +# ``scaled_dot_product_attention`` where if a row was fully masked out, the output +# of the attention layer would be NaN. See `issue `_. +# This is because the softmax over an empty set is undefined. +# +# Thanks to `this PR `_ +# this is no longer the case. Instead, the output corresponding to fully masked rows in ``scaled_dot_product_attention`` will be 0. +# For cases where ``nn.MHA`` does not employ the "fast-path", this will also apply. +# +# Using a custom MHA layer with NJTs is strongly recommended over the +# existing "fast-path" in ``nn.MultiheadAttention`` as NJT's ability to model raggedness +# appropriately makes it possible to properly express empty sequences. + + +################################################################################ +# FlexAttention + NJT +# --------------------------------------------------------------------- +# NJT also composes with the ``FlexAttention`` module. This is a generalization +# of the ``MultiheadAttention`` layer that allows for arbitrary modifications +# to the attention score. The example below takes the ``alibi_mod`` +# that implements `ALiBi `_ from +# `attention gym `_ and uses it +# with nested input tensors. 
+ +from torch.nn.attention.flex_attention import flex_attention + +def generate_alibi_bias(H: int): + """Returns an alibi bias score_mod given the number of heads H + Args: + H: number of heads + Returns: + alibi_bias: alibi bias score_mod + """ + def alibi_mod(score, b, h, q_idx, kv_idx): + scale = torch.exp2(-((h + 1) * 8.0 / H)) + bias = (q_idx - kv_idx) * scale + return score + bias + return alibi_mod + +query, key, value, _ = gen_batch(N, E_q, E_k, E_v, device) +n_heads, D = 8, E_q // 8 +alibi_score_mod = generate_alibi_bias(n_heads) +query = ( + query.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_() +) +key = key.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_() +value = ( + value.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_() +) +out_flex2 = flex_attention(query, key, value, score_mod=alibi_score_mod) + +############################################################################### +# In addition, one can also use the ``block_mask`` utility of ``FlexAttention`` +# with NJTs via the ``create_nested_block_mask`` function. This is useful for +# taking advantage of the sparsity of the mask to speed up the attention computation. +# In particular, the function creates a sparse block mask for a "stacked sequence" of all +# the variable length sequences in the NJT combined into one, while properly masking out +# inter-sequence attention. In the following example, we show how to create a +# causal block mask using this utility. 
+ +from torch.nn.attention.flex_attention import create_nested_block_mask + +def causal_mask(b, h, q_idx, kv_idx): + return q_idx >= kv_idx + +query, key, value, _ = gen_batch(N, E_q, E_k, E_v, device) +block_mask = create_nested_block_mask(causal_mask, 1, 1, query, _compile=True) +query = ( + query.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_() +) +key = key.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_() +value = ( + value.unflatten(-1, [n_heads, D]).transpose(1, 2).detach().requires_grad_() +) +out_flex = flex_attention(query, key, value, block_mask=block_mask) + +############################################################################### +# Packed Projection +# ----------------- +# +# Packed projection is a technique that makes use of the fact that when the input +# for projection (matrix multiplications) are the same (self-attention), we can pack the projection +# weights and biases into single tensors. It is especially useful when the individual +# projections are memory bound rather than compute bound. There are +# two examples that we will demonstrate here: +# +# * Input projection for MultiheadAttention +# * SwiGLU activation in feed-forward network of Transformer Layer +# +# Input projection for MultiheadAttention +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# When doing self-attention, the ``query``, ``key``, and ``value`` +# are the same tensor. Each of these tensors is projected with a +# ``Linear(E_q, E_total)`` layer. Instead, we can pack this into one layer, +# which is what we do in the MultiheadAttention layer above. 
+# +# Let us compare the performance of the packed projection against the usual method: + +class InputProjection(nn.Module): + def __init__(self, E_q, E_total, bias=False, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.q_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs) + self.k_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs) + self.v_proj = nn.Linear(E_q, E_total, bias=bias, **factory_kwargs) + + def forward(self, x): + return self.q_proj(x), self.k_proj(x), self.v_proj(x) + +class PackedInputProjection(nn.Module): + def __init__(self, E_q, E_total, bias=False, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.packed_proj = nn.Linear(E_q, E_total * 3, bias=bias, **factory_kwargs) + + def forward(self, query): + return torch.chunk(self.packed_proj(query), 3, dim=-1) + +B, D, dtype = 256, 8192, torch.bfloat16 + +torch.set_float32_matmul_precision('high') +in_proj = torch.compile(InputProjection(D, D, device='cuda', dtype=torch.bfloat16)) +packed_in_proj = torch.compile(PackedInputProjection(D, D, device='cuda', dtype=torch.bfloat16)) + +q, _, _, sequence_lengths = gen_batch(B, D, D, D, device='cuda', dtype=torch.bfloat16) + +# warmup +in_proj(q) +packed_in_proj(q) + +# benchmark +(q_out, k_out, v_out), time, _ = benchmark(in_proj, q) +(q_out, k_out, v_out), time_packed, _ = benchmark(packed_in_proj, q) +# On my A100 prints 1.05x speedup +print(f"InputProjection: {time:5f} s, PackedInputProjection: {time_packed:5f} s, speedup: {time/time_packed:.2f}x") + +################################################## +# SwiGLU feed forward network of Transformer Layer +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Swish-Gated Linear Unit (SwiGLU) is a non-linear activation function that is increasingly popular in the feed-forward +# network of the transformer layer (e.g. Llama). 
A feed-forward network with SwiGLU activation is defined as: + +class SwiGLUFFN(nn.Module): + def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier=None, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear(dim, hidden_dim, bias=False, **factory_kwargs) + self.w2 = nn.Linear(hidden_dim, dim, bias=False, **factory_kwargs) + self.w3 = nn.Linear(dim, hidden_dim, bias=False, **factory_kwargs) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + +######################################################################## +# An alternative way of implementing this that uses packed projection is + +class PackedSwiGLUFFN(nn.Module): + def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier=None, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w13 = nn.Linear(dim, 2 * hidden_dim, bias=False, **factory_kwargs) + self.w2 = nn.Linear(hidden_dim, dim, bias=False, **factory_kwargs) + + def forward(self, x): + x1, x3 = torch.chunk(self.w13(x), 2, dim=-1) + return self.w2(F.silu(x1) * x3) + +################################################################################ +# We can compare the performance of the two implementations as follows +# Depending on your hardware, you might see different results. On an A100 I see +# 1.12x speedup for D=128. 
+D = 128 + +swigluffn = torch.compile(SwiGLUFFN(D, D * 4, 256, device='cuda', dtype=torch.bfloat16)) +packed_swigluffn = torch.compile(PackedSwiGLUFFN(D, D * 4, 256, device='cuda', dtype=torch.bfloat16)) + +q, _, _, sentence_lengths = gen_batch(D, D, D, D, device="cuda", dtype=torch.bfloat16) + +# warmup +swigluffn(q) +packed_swigluffn(q) + +# benchmark +_, time, _ = benchmark(swigluffn, q) +_, time_packed, _ = benchmark(packed_swigluffn, q) +# On my A100 prints 1.08x speedup +print(f"SwiGLUFFN: {time} s, PackedSwiGLUFFN: {time_packed} s, speedup: {time/time_packed:.2f}x") + +################################################################################ +# Extended examples +# ----------------- +# +# We intend to update this tutorial to demonstrate more examples of how to use +# the various performant building blocks such as KV-Caching, Grouped Query Attention +# etc. Further, there are several good examples of using various performant building blocks to +# implement various transformer architectures. Some examples include +# +# * `gpt-fast `_ +# * `segment-anything-fast `_ +# * `lucidrains implementation of NaViT with nested tensors `_ +# * `torchtune's implementation of VisionTransformer `_ + +################################################################################ +# Conclusion +# ---------- +# +# In this tutorial, we have introduced the low level building blocks PyTorch +# provides for writing transformer layers and demonstrated examples how to compose +# them. It is our hope that this tutorial has educated the reader on the ease with +# which flexible and performant transformer layers can be implemented by users of PyTorch. 
diff --git a/lychee.toml b/lychee.toml new file mode 100644 index 00000000000..26fefcfbc5b --- /dev/null +++ b/lychee.toml @@ -0,0 +1 @@ +exclude_path = [".jenkins/build.sh"] diff --git a/prototype_source/README.txt b/prototype_source/README.txt index 4ab9ce8f6a9..9428fe3d124 100644 --- a/prototype_source/README.txt +++ b/prototype_source/README.txt @@ -7,7 +7,7 @@ Prototype Tutorials 2. graph_mode_static_quantization_tutorial.py Graph Mode Post Training Static Quantization in PyTorch https://pytorch.org/tutorials/prototype/graph_mode_static_quantization_tutorial.html - + 3. graph_mode_dynamic_bert_tutorial.rst Graph Mode Dynamic Quantization on BERT https://github.com/pytorch/tutorials/blob/main/prototype_source/graph_mode_dynamic_bert_tutorial.rst @@ -30,9 +30,16 @@ Prototype Tutorials 8. fx_graph_mode_ptq_dynamic.py FX Graph Mode Post Training Dynamic Quantization - https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html + https://pytorch.org/tutorials/prototype/fx_graph_mode_ptq_dynamic.html 9. 
fx_graph_mode_quant_guide.py FX Graph Mode Quantization User Guide - https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html + https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html + +10. flight_recorder_tutorial.rst + Flight Recorder User Guide + https://pytorch.org/tutorials/prototype/flight_recorder_tutorial.html +11. python_extension_autoload.rst + Autoloading Out-of-Tree Extension + https://pytorch.org/tutorials/prototype/python_extension_autoload.html diff --git a/prototype_source/flight_recorder_tutorial.rst b/prototype_source/flight_recorder_tutorial.rst new file mode 100644 index 00000000000..2e643b133cc --- /dev/null +++ b/prototype_source/flight_recorder_tutorial.rst @@ -0,0 +1,304 @@ +(prototype) Flight Recorder for Debugging Stuck Jobs +==================================================== +**Author**: `Chirag Pandya `_, `Junjie Wang `_ + +What you will learn +------------------- +* Learn about a new tool for debugging stuck jobs during distributed training. +* Learn how you can enable the tool and use the collected data for analyzing stuck jobs. + +Prerequisites +------------- + +- PyTorch version 2.5 or later. +- `tabulate `__. You can install it by running ``pip install tabulate``. + + +Overview +-------- +An AI distributed training job refers to the process of training a machine learning model using multiple devices, such +as GPUs or CPUs, connected in a network. This approach allows for faster and more efficient training of large models +that require significant computational resources. +An engineer’s goal is to complete an AI training job as quickly as possible and make continuous improvements so that +subsequent training can be done faster. A trained, usable model is the final desired outcome. +One of the biggest impediments to completing training is the concept of a *stuck job*. + +A distributed AI training job is considered *stuck* when it stops making meaningful progress for an extended period of +time. 
+ +A job can get stuck for various reasons: + +- **Data Starvation:** This occurs when the training job is not receiving data at the expected rate, possibly due to issues with the data pipeline or the data source. + +- **Resource Constraints:** If the system running the job does not have enough computational resources (such as CPU, GPU, or memory), the job might not be able to proceed. + +- **Network Issues:** In a distributed training setup, different parts of the model or data may be processed on different devices. If there are network issues, communication between these devices may be disrupted, causing the job to get stuck. + +- **Software Bugs or Errors:** Errors in the training code or the underlying libraries and frameworks can also cause a job to get stuck. + +- **Synchronization Issues:** In distributed training, different parts of the computation are often run in parallel and need to be synchronized at certain points. If this synchronization fails, the job can get stuck. For example, a deadlock can occur if one or more ranks fail to join a collective while the remaining ranks have joined. This results in an indefinite wait for the job to progress. + +Flight Recorder, as the name suggests, captures diagnostics information as collectives run. The captured diagnostic +information is used to help identify the root causes of issues when jobs become stuck. +Flight Recorder consists of two core parts: + +- The collection portion: when enabled, information about collectives is recorded in an in-memory circular buffer. Upon job timeout, or on demand, the in-memory buffer can be retrieved or dumped to file. + +- An analyzer script is available in the `tools/flight_recorder `__ directory (details below). + The analyzer script runs known heuristics using the collected data and attempts to automatically identify the underlying issue that caused the job to stall. 
+ +Enabling Flight Recorder +------------------------ +There are three required environment variables to get the initial version of Flight Recorder working. + +- ``TORCH_NCCL_TRACE_BUFFER_SIZE = (0, N)``: Setting ``N`` to a positive number enables collection. + ``N`` represents the number of entries that will be kept internally in a circular buffer. + We recommend setting this value to *2000*. The default value is ``2000``. +- ``TORCH_NCCL_DUMP_ON_TIMEOUT = (true, false)``: Setting this to ``true`` will write out diagnostic files to disk on job timeout. + If enabled, there will be one file per rank output in the job's running directory. The default value is ``false``. +- ``TORCH_NCCL_DEBUG_INFO_TEMP_FILE``: Sets the path, including the file prefix, where the Flight Recorder data will be dumped. One file is written per + rank. The default value is ``/tmp/nccl_trace_rank_``. + +**Optional settings:** + +- ``TORCH_NCCL_TRACE_CPP_STACK = (true, false)``: Setting this to ``true`` enables C++ stack traces to be captured in Flight Recorder. + C++ stack traces can be useful in providing the exact code path from a PyTorch Python call down to the primitive + C++ implementation. Also see ``TORCH_SYMBOLIZE_MODE`` in additional settings. +- ``TORCH_NCCL_ENABLE_TIMING = (true, false)``: Setting this to ``true`` will enable additional CUDA events at the start of each collective and + record the *duration* of each collective. This may incur some CPU overhead. In the collected data, the + *duration* field indicates how long each collective took to execute. + +Additional Settings +------------------- + +- ``TORCH_SYMBOLIZE_MODE = (dladdr, addr2line, fast)``: This setting determines the program used to retrieve C++ traces from a running program. + The default setting is ``addr2line``. + + ``fast`` is a new experimental mode that is shown to be much faster than the traditional ``addr2line``. + Use this setting in conjunction with ``TORCH_NCCL_TRACE_CPP_STACK`` to collect C++ traces in the Flight Recorder data. 
+- If you prefer not to have the flight recorder data dumped into the local disk but rather onto your own storage, you can define your own writer class. + This class should inherit from class ``::c10d::DebugInfoWriter`` `(code) `__ + and then register the new writer using ``::c10d::DebugInfoWriter::registerWriter`` `(code) `__ + before we initiate PyTorch distributed. + +Retrieving Flight Recorder Data via an API +------------------------------------------ + +You can also retrieve Flight Recorder data with an API call. +The API with the default arguments is shown below: + +.. code:: python + + torch._C._distributed_c10d._dump_nccl_trace(includeCollectives=True, includeStackTraces=True, onlyActive=False) + +To view the data, you can ``unpickle`` it as shown below: + +.. code:: python + + t = pickle.loads(torch._C._distributed_c10d._dump_nccl_trace()) + print(t) + +Flight Recorder File Formats +---------------------------- + +Flight Recorder files are dumped in ``pickle`` format. Files are written to local disks or mounted shared NFS +folders. + +The contents of a Flight Recorder ``unpickled`` file are shown below: + +.. 
code-block:: json + + { + "version": "2.5", + "pg_config": { + "0": { + "name": "0", + "desc": "default_pg", + "ranks": "[0, 1]" + } + }, + "pg_status": { + "0": { + "last_enqueued_collective": 2, + "last_started_collective": -1, + "last_completed_collective": 2 + } + }, + "entries": [ + { + "frames": [ + { + "name": "test_short_pickle", + "filename": "pytorch/test/distributed/test_c10d_nccl.py", + "line": 3647 + }, + { + "name": "spawn_main", + "filename": ".conda/envs/pytorch-3.10/lib/python3.10/multiprocessing/spawn.py", + "line": 116 + }, + { + "name": "", + "filename": "", + "line": 1 + } + ], + "record_id": 0, + "pg_id": 0, + "process_group": ("0", "default_pg"), + "collective_seq_id": 1, + "p2p_seq_id": 0, + "op_id": 1, + "profiling_name": "nccl:all_reduce", + "time_created_ns": 1724779239936775119, + "input_sizes": [[3, 4]], + "input_dtypes": ["Float"], + "output_sizes": [[3, 4]], + "output_dtypes": ["Float"], + "state": "completed", + "time_discovered_started_ns": null, + "time_discovered_completed_ns": 1724779239975811724, + "retired": true, + "timeout_ms": 600000, + "is_p2p": false + }, + ... + ] + } + +Analyzing Flight Recorder Dumps +------------------------------- + +We have convenient scripts available in the ``pytorch/tools/flight_recorder`` directory for analyzing captured +data. + +To run the convenience script, follow these steps: + +1. Copy all files from a rank into a single directory. + +2. To run the script, use this command: + +.. code:: shell + + python fr_trace.py [-o ] + +If you install the PyTorch nightly build or build from scratch with ``USE_DISTRIBUTED=1``, you can use the following +command directly: + +.. code:: shell + + torchfrtrace [-o ] + + +Currently, we support two modes for the analyzer script. The first mode allows the script to apply some heuristics to the parsed flight +recorder dumps to generate a report identifying potential culprits for the timeout. The second mode simply outputs the raw dumps. 
+By default, the script prints flight recorder dumps for all ranks and all ``ProcessGroups`` (PGs). This can be narrowed down to certain +ranks and PGs using the *--selected-ranks* argument for ranks and the *--pg-filters* argument for PGs. An example command is: + +Caveat: the ``tabulate`` module is required, so you might need to ``pip install`` it first. + +.. code:: shell + + python fr_trace.py -j [--selected-ranks i j k ...] [--pg-filters tp dp] + torchfrtrace -j [--selected-ranks i j k ...] [--pg-filters 0 2] + +An End-to-End Example +------------------------------------ +To demonstrate the use of Flight Recorder, we will use a small program where we induce mismatched collectives. +In this example, ``rank0`` is programmed to do an additional collective. +The Flight Recorder dump files are saved to the ``/tmp`` directory. +For demonstration purposes, we named this program ``crash.py``. + +.. note:: + Please note that this is a simplified example. In real-world scenarios, the process would involve more + complexities. + +.. 
code:: python + + import torch + import torch.distributed as dist + import os + from datetime import timedelta + + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + assert world_size <= 8, "world size must be less than or equal to 8" + os.environ["TORCH_NCCL_DEBUG_INFO_TEMP_FILE"] = "/tmp/trace_" + os.environ["TORCH_NCCL_DUMP_ON_TIMEOUT"] = "1" + os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "2000" + device = torch.device(f"cuda:{local_rank}") + print(f"{local_rank=} {world_size=} master addr: {os.environ['MASTER_ADDR']} master port: {os.environ['MASTER_PORT']} {device=}") + + # Initialize the process group with a small timeout so that jobs fail quickly + dist.init_process_group("nccl", world_size=world_size, rank=local_rank, timeout=timedelta(seconds=1)) + + a = torch.full((3, 4), float(local_rank), device=device) + # Write some collectives to populate Flight Recorder data + for i in range(2): + print(f"calling allreduce on {local_rank=}") + f = dist.all_reduce(a) + + # rank0 is doing an additional collective + if local_rank == 0: + print("rank0 is doing an allreduce on tensor b, but other ranks forgot") + b = torch.full((4,5), float(local_rank), device=device) + f = dist.all_reduce(b) + + for i in range(2): + print(f"calling allreduce on {local_rank=}") + f = dist.all_reduce(a) + + torch.cuda.synchronize(device=device) + print(f"{local_rank=} exiting") + + +To run this program, use ``torchrun``: + + +.. code:: bash + + torchrun --nnodes=1 --nproc_per_node=2 crash.py + +You should see two files in the ``/tmp`` directory: + +.. code:: bash + + $ls /tmp/trace* + # Expected output + /tmp/trace_0 /tmp/trace_1 + +Finally, to analyze these two files, we use the ``torchfrtrace`` command: + +.. code:: bash + + torchfrtrace --prefix "trace_" /tmp/ + +The output from the trace command is meant to be human-readable. It includes information about the +set of collectives that caused a failure. 
+The output for the command above is shown below. +We can clearly see that rank 1 did not join the "all_reduce" collective. + +.. code-block:: bash + $torchfrtrace --prefix "trace_" /tmp/ + Not all ranks joining collective 5 at entry 4 + group info: 0:default_pg + collective: nccl:all_reduce + missing ranks: {1} + input sizes: [[3, 4]] + output sizes: [[3, 4]] + expected ranks: 2 + collective state: scheduled + collective stack trace: + all_reduce at /home/cpio/local/pytorch/torch/distributed/distributed_c10d.py:2696 + wrapper at /home/cpio/local/pytorch/torch/distributed/c10d_logger.py:83 + at /home/cpio/test/crash.py:44 + + + +Conclusion +---------- +In this tutorial, we have learned about a new PyTorch diagnostic tool called Flight Recorder. +We have discussed how to enable Flight Recorder to collect diagnostic data from a machine. +Additionally, we explored how to analyze the data captured from the Flight Recorder using a +convenience script located in the `tools/flight_recorder `__ +directory of the PyTorch repository. 
diff --git a/prototype_source/fx_graph_mode_ptq_dynamic.py b/prototype_source/fx_graph_mode_ptq_dynamic.py index 98ece5f3d31..fc29e5fa97b 100644 --- a/prototype_source/fx_graph_mode_ptq_dynamic.py +++ b/prototype_source/fx_graph_mode_ptq_dynamic.py @@ -1,6 +1,6 @@ """ (prototype) FX Graph Mode Post Training Dynamic Quantization -=========================================================== +============================================================ **Author**: `Jerry Zhang `_ @@ -171,7 +171,8 @@ def tokenize(self, path): model.load_state_dict( torch.load( model_data_filepath + 'word_language_model_quantize.pth', - map_location=torch.device('cpu') + map_location=torch.device('cpu'), + weights_only=True ) ) diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst index c0b692275a0..da16d04dbce 100644 --- a/prototype_source/fx_graph_mode_ptq_static.rst +++ b/prototype_source/fx_graph_mode_ptq_static.rst @@ -157,7 +157,7 @@ Download the `torchvision resnet18 model `_ file -while those for ``QConfigMapping`` can be found in the `qconfig_mapping ` +while those for ``QConfigMapping`` can be found in the `qconfig_mapping ` .. code:: python @@ -320,7 +320,7 @@ We can now print the size and accuracy of the quantized model. # ModuleAttributeError: 'ConvReLU2d' object has no attribute '_modules' # save the whole model directly # torch.save(quantized_model, fx_graph_mode_model_file_path) - # loaded_quantized_model = torch.load(fx_graph_mode_model_file_path) + # loaded_quantized_model = torch.load(fx_graph_mode_model_file_path, weights_only=False) # save with state_dict # torch.save(quantized_model.state_dict(), fx_graph_mode_model_file_path) @@ -328,7 +328,7 @@ We can now print the size and accuracy of the quantized model. 
# model_to_quantize = copy.deepcopy(float_model) # prepared_model = prepare_fx(model_to_quantize, {"": qconfig}) # loaded_quantized_model = convert_fx(prepared_model) - # loaded_quantized_model.load_state_dict(torch.load(fx_graph_mode_model_file_path)) + # loaded_quantized_model.load_state_dict(torch.load(fx_graph_mode_model_file_path, weights_only=True)) # save with script torch.jit.save(torch.jit.script(quantized_model), fx_graph_mode_model_file_path) diff --git a/prototype_source/fx_graph_mode_quant_guide.rst b/prototype_source/fx_graph_mode_quant_guide.rst index 9072e488a4b..4ae8496ed52 100644 --- a/prototype_source/fx_graph_mode_quant_guide.rst +++ b/prototype_source/fx_graph_mode_quant_guide.rst @@ -142,7 +142,7 @@ Refactor your code to make it symbolically traceable If it is easy to refactor the code and make the code symbolically traceable, we can refactor the code and remove the use of non-traceable constructs in python. -More information about symbolic tracing support can be found in: (TODO: link) +More information about symbolic tracing support can be found `here `_. before: diff --git a/prototype_source/gpu_quantization_torchao_tutorial.py b/prototype_source/gpu_quantization_torchao_tutorial.py new file mode 100644 index 00000000000..f901f8abd31 --- /dev/null +++ b/prototype_source/gpu_quantization_torchao_tutorial.py @@ -0,0 +1,320 @@ +""" +(prototype) GPU Quantization with TorchAO +====================================================== + +**Author**: `HDCharles `_ + +In this tutorial, we will walk you through the quantization and optimization +of the popular `segment anything model `_. These +steps will mimic some of those taken to develop the +`segment-anything-fast `_ +repo. This step-by-step guide demonstrates how you can +apply these techniques to speed up your own models, especially those +that use transformers. 
To that end, we will focus on widely applicable +techniques, such as optimizing performance with ``torch.compile`` and +quantization and measure their impact. + +""" + + +###################################################################### +# Set up Your Environment +# -------------------------------- +# +# First, let's configure your environment. This guide was written for CUDA 12.1. +# We have run this tutorial on an A100-PG509-200 power limited to 330.00 W. If you +# are using a different hardware, you might see different performance numbers. +# +# +# .. code-block:: bash +# +# > conda create -n myenv python=3.10 +# > pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 +# > pip install git+https://github.com/facebookresearch/segment-anything.git +# > pip install git+https://github.com/pytorch-labs/ao.git +# +# Segment Anything Model checkpoint setup: +# +# 1. Go to the `segment-anything repo checkpoint `_ and download the ``vit_h`` checkpoint. Alternatively, you can use ``wget`` (for example, ``wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth --directory-prefix=``). +# 2. Pass in that directory by editing the code below to say: +# +# .. 
code-block:: bash +# +# {sam_checkpoint_base_path}= +# + +import torch +from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight +from torchao.utils import unwrap_tensor_subclass, TORCH_VERSION_AT_LEAST_2_5 +from segment_anything import sam_model_registry +from torch.utils.benchmark import Timer + +sam_checkpoint_base_path = "data" +model_type = 'vit_h' +model_name = 'sam_vit_h_4b8939.pth' +checkpoint_path = f"{sam_checkpoint_base_path}/{model_name}" +batchsize = 16 +only_one_block = True + + +@torch.no_grad() +def benchmark(f, *args, **kwargs): + for _ in range(3): + f(*args, **kwargs) + torch.cuda.synchronize() + + torch.cuda.reset_peak_memory_stats() + t0 = Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + res = t0.adaptive_autorange(.03, min_run_time=.2, max_run_time=20) + return {'time':res.median * 1e3, 'memory': torch.cuda.max_memory_allocated()/1e9} + +def get_sam_model(only_one_block=False, batchsize=1): + sam = sam_model_registry[model_type](checkpoint=checkpoint_path).cuda() + model = sam.image_encoder.eval() + image = torch.randn(batchsize, 3, 1024, 1024, device='cuda') + + # code to use just a single block of the model + if only_one_block: + model = model.blocks[0] + image = torch.randn(batchsize, 64, 64, 1280, device='cuda') + return model, image + + +###################################################################### +# In this tutorial, we focus on quantizing the ``image_encoder`` because the +# inputs to it are statically sized while the prompt encoder and mask +# decoder have variable sizes which makes them harder to quantize. +# +# We’ll focus on just a single block at first to make the analysis easier. +# +# Let's start by measuring the baseline runtime. 
+ +try: + model, image = get_sam_model(only_one_block, batchsize) + fp32_res = benchmark(model, image) + print(f"base fp32 runtime of the model is {fp32_res['time']:0.2f}ms and peak memory {fp32_res['memory']:0.2f}GB") + # base fp32 runtime of the model is 186.16ms and peak memory 6.33GB +except Exception as e: + print("unable to run fp32 model: ", e) + + + +###################################################################### +# We can achieve an instant performance boost by converting the model to bfloat16. +# The reason we opt for bfloat16 over fp16 is due to its dynamic range, which is comparable to +# that of fp32. Both bfloat16 and fp32 possess 8 exponent bits, whereas fp16 only has 5. This +# larger dynamic range helps protect us from overflow errors and other issues that can arise +# when scaling and rescaling tensors due to quantization. +# + +model, image = get_sam_model(only_one_block, batchsize) +model = model.to(torch.bfloat16) +image = image.to(torch.bfloat16) +bf16_res = benchmark(model, image) +print(f"bf16 runtime of the block is {bf16_res['time']:0.2f}ms and peak memory {bf16_res['memory']: 0.2f}GB") +# bf16 runtime of the block is 25.43ms and peak memory 3.17GB + + +###################################################################### +# Just this quick change improves runtime by a factor of ~7x in the tests we have +# conducted (186.16ms to 25.43ms). +# +# Next, let's use ``torch.compile`` with our model to see how much the performance +# improves. 
+# + +model_c = torch.compile(model, mode='max-autotune') +comp_res = benchmark(model_c, image) +print(f"bf16 compiled runtime of the block is {comp_res['time']:0.2f}ms and peak memory {comp_res['memory']: 0.2f}GB") +# bf16 compiled runtime of the block is 19.95ms and peak memory 2.24GB + + +###################################################################### +# The first time this is run, you should see a sequence of ``AUTOTUNE`` +# outputs which occurs when inductor compares the performance between +# various kernel parameters for a kernel. This only happens once (unless +# you delete your cache) so if you run the cell again you should just get +# the benchmark output. +# +# ``torch.compile`` yields about another 27% improvement. This brings the +# model to a reasonable baseline where we now have to work a bit harder +# for improvements. +# +# Next, let's apply quantization. Quantization for GPUs comes in three main forms +# in `torchao `_ which is just native +# pytorch+python code. This includes: +# +# * int8 dynamic quantization +# * int8 weight-only quantization +# * int4 weight-only quantization +# +# Different models, or sometimes different layers in a model can require different techniques. +# For models which are heavily compute bound, dynamic quantization tends +# to work the best since it swaps the normal expensive floating point +# matmul ops with integer versions. Weight-only quantization works better +# in memory bound situations where the benefit comes from loading less +# weight data, rather than doing less computation. The torchao APIs: +# +# ``int8_dynamic_activation_int8_weight()``, +# ``int8_weight_only()`` or +# ``int4_weight_only()`` +# +# can be used to easily apply the desired quantization technique and then +# once the model is compiled with ``torch.compile`` with ``max-autotune``, quantization is +# complete and we can see our speedup. +# +# .. note:: +# You might experience issues with these on older versions of PyTorch. 
If you run +# into an issue, you can use ``apply_dynamic_quant`` and +# ``apply_weight_only_int8_quant`` instead as drop-in replacements for the two +# above (no replacement for int4). +# +# The difference between the two APIs is that the ``int8_dynamic_activation`` API +# alters the weight tensor of the linear module so instead of doing a +# normal linear, it does a quantized operation. This is helpful when you +# have non-standard linear ops that do more than one thing. The ``apply`` +# APIs directly swap the linear modules for a quantized module which +# works on older versions but doesn’t work with non-standard linear +# modules. +# +# In this case Segment Anything is compute-bound so we’ll use dynamic quantization: +# + +del model_c, model, image +model, image = get_sam_model(only_one_block, batchsize) +model = model.to(torch.bfloat16) +image = image.to(torch.bfloat16) +quantize_(model, int8_dynamic_activation_int8_weight()) +if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) +model_c = torch.compile(model, mode='max-autotune') +quant_res = benchmark(model_c, image) +print(f"bf16 compiled runtime of the quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") +# bf16 compiled runtime of the quantized block is 19.04ms and peak memory 3.58GB + + +###################################################################### +# With quantization, we have improved performance a bit more but memory usage increased +# significantly. +# +# This is for two reasons: +# +# 1) Quantization adds overhead to the model +# since we need to quantize and dequantize the input and output. For small +# batch sizes this overhead can actually make the model go slower. 
+# 2) Even though we are doing a quantized matmul, such as ``int8 x int8``, +# the result of the multiplication gets stored in an int32 tensor +# which is twice the size of the result from the non-quantized model. +# If we can avoid creating this int32 tensor, our memory usage will improve a lot. +# +# We can fix #2 by fusing the integer matmul with the subsequent rescale +# operation since the final output will be bf16, if we immediately convert +# the int32 tensor to bf16 and instead store that we’ll get better +# performance in terms of both runtime and memory. +# +# The way to do this, is to enable the option +# ``force_fuse_int_mm_with_mul`` in the inductor config. +# + +del model_c, model, image +model, image = get_sam_model(only_one_block, batchsize) +model = model.to(torch.bfloat16) +image = image.to(torch.bfloat16) +torch._inductor.config.force_fuse_int_mm_with_mul = True +quantize_(model, int8_dynamic_activation_int8_weight()) +if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) +model_c = torch.compile(model, mode='max-autotune') +quant_res = benchmark(model_c, image) +print(f"bf16 compiled runtime of the fused quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") +# bf16 compiled runtime of the fused quantized block is 18.78ms and peak memory 2.37GB + + +###################################################################### +# The fusion improves performance by another small bit (about 6% over the +# baseline in total) and removes almost all the memory increase, the +# remaining amount (2.37GB quantized vs 2.24GB unquantized) is due to +# quantization overhead which cannot be helped. +# +# We’re still not done though, we can apply a few general purpose +# optimizations to get our final best-case performance. 
+# +# 1) We can sometimes improve performance by disabling epilogue fusion +# since the autotuning process can be confused by fusions and choose +# bad kernel parameters. +# 2) We can apply coordinate descent tuning in all directions to enlarge +# the search area for kernel parameters. +# + +del model_c, model, image +model, image = get_sam_model(only_one_block, batchsize) +model = model.to(torch.bfloat16) +image = image.to(torch.bfloat16) +torch._inductor.config.epilogue_fusion = False +torch._inductor.config.coordinate_descent_tuning = True +torch._inductor.config.coordinate_descent_check_all_directions = True +torch._inductor.config.force_fuse_int_mm_with_mul = True +quantize_(model, int8_dynamic_activation_int8_weight()) +if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) +model_c = torch.compile(model, mode='max-autotune') +quant_res = benchmark(model_c, image) +print(f"bf16 compiled runtime of the final quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") +# bf16 compiled runtime of the final quantized block is 18.16ms and peak memory 2.39GB + + +###################################################################### +# As you can see, we’ve squeezed another small improvement from the model, +# taking our total improvement to over 10x compared to our original. To +# get a final estimate of the impact of quantization, let's do an +# apples-to-apples comparison on the full model since the actual improvement will +# differ block by block depending on the shapes involved. 
+# + +try: + del model_c, model, image + model, image = get_sam_model(False, batchsize) + model = model.to(torch.bfloat16) + image = image.to(torch.bfloat16) + model_c = torch.compile(model, mode='max-autotune') + quant_res = benchmark(model_c, image) + print(f"bf16 compiled runtime of the compiled full model is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") + # bf16 compiled runtime of the compiled full model is 729.65ms and peak memory 23.96GB + + del model_c, model, image + model, image = get_sam_model(False, batchsize) + model = model.to(torch.bfloat16) + image = image.to(torch.bfloat16) + quantize_(model, int8_dynamic_activation_int8_weight()) + if not TORCH_VERSION_AT_LEAST_2_5: + # needed for subclass + compile to work on older versions of pytorch + unwrap_tensor_subclass(model) + model_c = torch.compile(model, mode='max-autotune') + quant_res = benchmark(model_c, image) + print(f"bf16 compiled runtime of the quantized full model is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB") + # bf16 compiled runtime of the quantized full model is 677.28ms and peak memory 24.93GB +except Exception as e: + print("unable to run full model: ", e) + + + +###################################################################### +# Conclusion +# ----------------- +# In this tutorial, we have learned about the quantization and optimization techniques +# on the example of the segment anything model. +# +# In the end, we achieved a full-model apples-to-apples quantization speedup +# of about 7.7% on batch size 16 (729.65ms to 677.28ms). We can push this a +# bit further by increasing the batch size and optimizing other parts of +# the model. For example, this can be done with some form of flash attention. +# +# For more information visit +# `torchao `_ and try it on your own +# models. 
+# diff --git a/prototype_source/graph_mode_dynamic_bert_tutorial.rst b/prototype_source/graph_mode_dynamic_bert_tutorial.rst index 5d76ddef79a..949002a55dc 100644 --- a/prototype_source/graph_mode_dynamic_bert_tutorial.rst +++ b/prototype_source/graph_mode_dynamic_bert_tutorial.rst @@ -1,5 +1,5 @@ (prototype) Graph Mode Dynamic Quantization on BERT -============================================== +=================================================== **Author**: `Supriya Rao `_ diff --git a/prototype_source/inductor_cpp_wrapper_tutorial.rst b/prototype_source/inductor_cpp_wrapper_tutorial.rst index 199a66b2b28..4bcc9009075 100644 --- a/prototype_source/inductor_cpp_wrapper_tutorial.rst +++ b/prototype_source/inductor_cpp_wrapper_tutorial.rst @@ -21,7 +21,7 @@ thereby reducing the Python overhead within the graph. Enabling the API ------------- +---------------- This feature is still in prototype stage. To activate this feature, add the following to your code: .. code:: python diff --git a/prototype_source/inductor_windows_cpu.rst b/prototype_source/inductor_windows_cpu.rst new file mode 100644 index 00000000000..96e1bf46909 --- /dev/null +++ b/prototype_source/inductor_windows_cpu.rst @@ -0,0 +1,130 @@ +How to use TorchInductor on Windows CPU +======================================= + +**Author**: `Zhaoqiong Zheng `_, `Xu, Han `_ + + + +TorchInductor is a compiler backend that transforms FX Graphs generated by TorchDynamo into highly optimized C++/Triton kernels. +This tutorial will guide you through the process of using TorchInductor on a Windows CPU. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to compile and execute a Python function with PyTorch, optimized for Windows CPU + * Basics of TorchInductor's optimization using C++/Triton kernels. + + .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.5 or later + * Microsoft Visual C++ (MSVC) + * Miniforge for Windows + +Install the Required Software +----------------------------- + +First, let's install the required software. C++ compiler is required for TorchInductor optimization. +We will use Microsoft Visual C++ (MSVC) for this example. + +1. Download and install `MSVC `_. + +2. During the installation, choose **Desktop Development with C++** in the **Desktop & Mobile** section in **Workloads** table. Then install the software + +.. note:: + + We recommend C++ compiler `Clang `_ and `Intel Compiler `_. + Please check `Alternative Compiler for better performance <#alternative-compiler-for-better-performance>`_. + +3. Download and install `Miniforge3-Windows-x86_64.exe `__. + +Set Up the Environment +---------------------- + +#. Open the command line environment via ``cmd.exe``. +#. Activate ``MSVC`` with the following command: + + .. code-block:: sh + + "C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Auxiliary/Build/vcvars64.bat" +#. Activate ``conda`` with the following command: + + .. code-block:: sh + + "C:/ProgramData/miniforge3/Scripts/activate.bat" +#. Create and activate a custom conda environment: + + .. code-block:: sh + + conda create -n inductor_cpu_windows python=3.10 -y + conda activate inductor_cpu_windows + +#. Install `PyTorch 2.5 `_ or later. + +Using TorchInductor on Windows CPU +---------------------------------- + +Here’s a simple example to demonstrate how to use TorchInductor: + +.. code-block:: python + + + import torch + def foo(x, y): + a = torch.sin(x) + b = torch.cos(y) + return a + b + opt_foo1 = torch.compile(foo) + print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10))) + +Here is the sample output that this code might return: + +.. 
code-block:: sh + + tensor([[-3.9074e-02, 1.3994e+00, 1.3894e+00, 3.2630e-01, 8.3060e-01, + 1.1833e+00, 1.4016e+00, 7.1905e-01, 9.0637e-01, -1.3648e+00], + [ 1.3728e+00, 7.2863e-01, 8.6888e-01, -6.5442e-01, 5.6790e-01, + 5.2025e-01, -1.2647e+00, 1.2684e+00, -1.2483e+00, -7.2845e-01], + [-6.7747e-01, 1.2028e+00, 1.1431e+00, 2.7196e-02, 5.5304e-01, + 6.1945e-01, 4.6654e-01, -3.7376e-01, 9.3644e-01, 1.3600e+00], + [-1.0157e-01, 7.7200e-02, 1.0146e+00, 8.8175e-02, -1.4057e+00, + 8.8119e-01, 6.2853e-01, 3.2773e-01, 8.5082e-01, 8.4615e-01], + [ 1.4140e+00, 1.2130e+00, -2.0762e-01, 3.3914e-01, 4.1122e-01, + 8.6895e-01, 5.8852e-01, 9.3310e-01, 1.4101e+00, 9.8318e-01], + [ 1.2355e+00, 7.9290e-02, 1.3707e+00, 1.3754e+00, 1.3768e+00, + 9.8970e-01, 1.1171e+00, -5.9944e-01, 1.2553e+00, 1.3394e+00], + [-1.3428e+00, 1.8400e-01, 1.1756e+00, -3.0654e-01, 9.7973e-01, + 1.4019e+00, 1.1886e+00, -1.9194e-01, 1.3632e+00, 1.1811e+00], + [-7.1615e-01, 4.6622e-01, 1.2089e+00, 9.2011e-01, 1.0659e+00, + 9.0892e-01, 1.1932e+00, 1.3888e+00, 1.3898e+00, 1.3218e+00], + [ 1.4139e+00, -1.4000e-01, 9.1192e-01, 3.0175e-01, -9.6432e-01, + -1.0498e+00, 1.4115e+00, -9.3212e-01, -9.0964e-01, 1.0127e+00], + [ 5.7244e-04, 1.2799e+00, 1.3595e+00, 1.0907e+00, 3.7191e-01, + 1.4062e+00, 1.3672e+00, 6.8502e-02, 8.5216e-01, 8.6046e-01]]) + +Using an Alternative Compiler for Better Performance +---------------------------------------------------- + +To enhance performance on Windows inductor, you can use the Intel Compiler or LLVM Compiler. However, they rely on the runtime libraries from Microsoft Visual C++ (MSVC). Therefore, your first step should be to install MSVC. + +Intel Compiler +^^^^^^^^^^^^^^ + +#. Download and install `Intel Compiler `_ with Windows version. +#. Set Windows Inductor Compiler with the CXX environment variable ``set CXX=icx-cl``. + +Intel also provides a comprehensive step-by-step guide, complete with performance data. 
Please check `Intel® oneAPI DPC++/C++ Compiler Boosts PyTorch* Inductor Performance on Windows* for CPU Devices `_. + +LLVM Compiler +^^^^^^^^^^^^^ + +#. Download and install `LLVM Compiler `_ and choose win64 version. +#. Set Windows Inductor Compiler with the CXX environment variable ``set CXX=clang-cl``. + +Conclusion +---------- + +In this tutorial, we have learned how to use Inductor on Windows CPU with PyTorch. In addition, we discussed +further performance improvements with Intel Compiler and LLVM Compiler. diff --git a/prototype_source/ios_coreml_workflow.rst b/prototype_source/ios_coreml_workflow.rst index bfaccd77a10..db9abcc5076 100644 --- a/prototype_source/ios_coreml_workflow.rst +++ b/prototype_source/ios_coreml_workflow.rst @@ -1,128 +1,10 @@ (Prototype) Convert Mobilenetv2 to Core ML ========================================== -**Author**: `Tao Xu `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------- +Redirecting in 3 seconds... -Core ML provides access to powerful and efficient NPUs(Neural Process Unit) on modern iPhone devices. This tutorial shows how to prepare a computer vision model (mobilenetv2) to use the PyTorch Core ML mobile backend. +.. raw:: html -Note that this feature is currently in the “prototype” phase and only supports a limited numbers of operators, but we expect to solidify the integration and expand our operator support over time. The APIs are subject to change in the future. - -Environment Setup (MacOS) -------------------------- - -Let's start off by creating a new conda environment. - -.. code:: shell - - conda create --name 1.10 python=3.8 --yes - conda activate 1.10 - -Next, since the Core ML delegate is a prototype feature, let's install the PyTorch nightly build and coremltools - -.. 
code:: shell - - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - - pip3 install coremltools==5.0b5 protobuf==3.20.1 - - -Model Preparation -------------------- - -To convert a pre-trained mobilenetv2 model to be Core ML compatible, we're going to use the ``to_backend()`` API, which is a prototype feature for delegating model executions to some specific backends. The following python code shows how to use it to convert the mobilenetv2 torchscript model. - -.. code:: python - - import torch - import torchvision - - from torch.backends._coreml.preprocess import ( - CompileSpec, - TensorSpec, - CoreMLComputeUnit, - ) - - def mobilenetv2_spec(): - return { - "forward": CompileSpec( - inputs=( - TensorSpec( - shape=[1, 3, 224, 224], - ), - ), - outputs=( - TensorSpec( - shape=[1, 1000], - ), - ), - backend=CoreMLComputeUnit.ALL, - allow_low_precision=True, - ), - } - - - def main(): - model = torchvision.models.mobilenet_v2(pretrained=True) - model.eval() - example = torch.rand(1, 3, 224, 224) - model = torch.jit.trace(model, example) - compile_spec = mobilenetv2_spec() - mlmodel = torch._C._jit_to_backend("coreml", model, compile_spec) - mlmodel._save_for_lite_interpreter("./mobilenetv2_coreml.ptl") - - - if __name__ == "__main__": - main() - - -First, we need to call ``.eval()`` to set the model to inference mode. Secondly, we defined a ``mobilenetv2_spec()`` function to tell Core ML what the model looks like. Note that the ``CoreMLComputeUnit`` corresponds to `Apple's processing unit `_ whose value can be ``CPU``, ``CPUAndGPU`` and ``ALL``. In our example, we set the ``backend`` type to ``ALL`` which means Core ML will try to run the model on Neural Engine. Finally, we called the ``to_backend`` API to convert the torchscript model to a Core ML compatible model and save it to the disk. - -Run the python script. If everything works well, you should see following outputs from coremltools - -.. 
code:: shell - - Converting Frontend ==> MIL Ops: 100%|███████████████████████████████████████████████████████████████████████████████▊| 384/385 [00:00<00:00, 1496.98 ops/s] - Running MIL Common passes: 0%| - 0/33 [00:00 NeuralNetwork Ops: 100%|██████████████████████████████████████████████████████████████████████████| 495/495 [00:00<00:00, 1977.15 ops/s] - [W backend_detail.cpp:376] Warning: Backend [coreml] is not available. Execution of this Module is still possible by saving and loading on a device where the backend is available. (function codegen_backend_module) - -We can safely ignore the warning above, as we don't plan to run our model on desktop. - -iOS app integration ---------------------- - -Now that the model is ready, we can integrate it to our app. We'll be using the pytorch nightly cocoapods which contains the code for executing the Core ML model. Simply add the following code to your Podfile - -.. code:: shell - - pod LibTorch-Lite-Nightly - -In this tutorial, we'll be reusing our `HelloWorld `_ project. Feel free to walk through the code there. - -To benchmark the latency, you can simply put the following code before and after the PyTorch ``forward`` function - -.. code:: objective-c - - caffe2::Timer t; - auto outputTensor = _impl.forward({tensor}).toTensor().cpu(); - std::cout << "forward took: " << t.MilliSeconds() << std::endl; - -Conclusion ----------- - -In this tutorial, we demonstrated how to convert a mobilenetv2 model to a Core ML compatible model. Please be aware of that Core ML feature is still under development, new operators/models will continue to be added. APIs are subject to change in the future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. 
- -Learn More ----------- - -- The `Mobilenetv2 `_ from Torchvision -- Information about `Core ML `_ + diff --git a/prototype_source/ios_gpu_workflow.rst b/prototype_source/ios_gpu_workflow.rst index 0e87ad815f9..8915e1c4fad 100644 --- a/prototype_source/ios_gpu_workflow.rst +++ b/prototype_source/ios_gpu_workflow.rst @@ -1,142 +1,10 @@ (Prototype) Use iOS GPU in PyTorch ================================== -**Author**: `Tao Xu `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------- +Redirecting in 3 seconds... -This tutorial introduces the steps to run your models on iOS GPU. We'll be using the mobilenetv2 model as an example. Since the mobile GPU features are currently in the prototype stage, you'll need to build a custom pytorch binary from source. For the time being, only a limited number of operators are supported, and certain client side APIs are subject to change in the future versions. +.. raw:: html -Model Preparation -------------------- - -Since GPUs consume weights in a different order, the first step we need to do is to convert our TorchScript model to a GPU compatible model. This step is also known as "prepacking". - -PyTorch with Metal -^^^^^^^^^^^^^^^^^^ -To do that, we'll install a pytorch nightly binary that includes the Metal backend. Go ahead run the command below - -.. code:: shell - - conda install pytorch -c pytorch-nightly - // or - pip3 install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - -Also, you can build a custom pytorch binary from source that includes the Metal backend. Just checkout the pytorch source code from github and run the command below - -.. code:: shell - - cd PYTORCH_ROOT - USE_PYTORCH_METAL_EXPORT=ON python setup.py install --cmake - -The command above will build a custom pytorch binary from master. The ``install`` argument simply tells ``setup.py`` to override the existing PyTorch on your desktop. 
Once the build finished, open another terminal to check the PyTorch version to see if the installation was successful. As the time of writing of this recipe, the version is ``1.8.0a0+41237a4``. You might be seeing different numbers depending on when you check out the code from master, but it should be greater than 1.7.0. - -.. code:: python - - import torch - torch.__version__ #1.8.0a0+41237a4 - -Metal Compatible Model -^^^^^^^^^^^^^^^^^^^^^^ - -The next step is going to be converting the mobilenetv2 torchscript model to a Metal compatible model. We'll be leveraging the ``optimize_for_mobile`` API from the ``torch.utils`` module. As shown below - -.. code:: python - - import torch - import torchvision - from torch.utils.mobile_optimizer import optimize_for_mobile - - model = torchvision.models.mobilenet_v2(pretrained=True) - scripted_model = torch.jit.script(model) - optimized_model = optimize_for_mobile(scripted_model, backend='metal') - print(torch.jit.export_opnames(optimized_model)) - optimized_model._save_for_lite_interpreter('./mobilenetv2_metal.pt') - -Note that the ``torch.jit.export_opnames(optimized_model)`` is going to dump all the optimized operators from the ``optimized_mobile``. If everything works well, you should be able to see the following ops being printed out from the console - - -.. code:: shell - - ['aten::adaptive_avg_pool2d', - 'aten::add.Tensor', - 'aten::addmm', - 'aten::reshape', - 'aten::size.int', - 'metal::copy_to_host', - 'metal_prepack::conv2d_run'] - -Those are all the ops we need to run the mobilenetv2 model on iOS GPU. Cool! Now that you have the ``mobilenetv2_metal.pt`` saved on your disk, let's move on to the iOS part. - - -Use PyTorch iOS library with Metal ---------------------- -The PyTorch iOS library with Metal support ``LibTorch-Lite-Nightly`` is available in Cocoapods. You can read the `Using the Nightly PyTorch iOS Libraries in CocoaPods `_ section from the iOS tutorial for more detail about its usage. 
- -We also have the `HelloWorld-Metal example `_ that shows how to conect all pieces together. - -Note that if you run the HelloWorld-Metal example, you may notice that the results are slighly different from the `results `_ we got from the CPU model as shown in the iOS tutorial. - -.. code:: shell - - - timber wolf, grey wolf, gray wolf, Canis lupus - - malamute, malemute, Alaskan malamute - - Eskimo dog, husky - -This is because by default Metal uses fp16 rather than fp32 to compute. The precision loss is expected. - - -Use LibTorch-Lite Built from Source ---------------------- - -You can also build a custom LibTorch-Lite from Source and use it to run GPU models on iOS Metal. In this section, we'll be using the `HelloWorld example `_ to demonstrate this process. - -First, make sure you have deleted the **build** folder from the "Model Preparation" step in PyTorch root directory. Then run the command below - -.. code:: shell - - IOS_ARCH=arm64 USE_PYTORCH_METAL=1 ./scripts/build_ios.sh - -Note ``IOS_ARCH`` tells the script to build a arm64 version of Libtorch-Lite. This is because in PyTorch, Metal is only available for the iOS devices that support the Apple A9 chip or above. Once the build finished, follow the `Build PyTorch iOS libraries from source `_ section from the iOS tutorial to setup the XCode settings properly. Don't forget to copy the ``./mobilenetv2_metal.pt`` to your XCode project and modify the model file path accordingly. - -Next we need to make some changes in ``TorchModule.mm`` - -.. code:: objective-c - - ... - // #import - // If it's built from source with Xcode, comment out the line above - // and use following headers - #include - #include - #include - ... - - - (NSArray*)predictImage:(void*)imageBuffer { - c10::InferenceMode mode; - at::Tensor tensor = torch::from_blob(imageBuffer, {1, 3, 224, 224}, at::kFloat).metal(); - auto outputTensor = _impl.forward({tensor}).toTensor().cpu(); - ... - } - ... 
- +As you can see, we simply just call ``.metal()`` to move our input tensor from CPU to GPU, and then call ``.cpu()`` to move the result back. Internally, ``.metal()`` will copy the input data from the CPU buffer to a GPU buffer with a GPU compatible memory format. When ``.cpu()`` is invoked, the GPU command buffer will be flushed and synced. After `forward` finished, the final result will then be copied back from the GPU buffer back to a CPU buffer. - -The last step we have to do is to add the ``Accelerate.framework`` and the ``MetalPerformanceShaders.framework`` to your xcode project (Open your project via XCode, go to your project target’s "General" tab, locate the "Frameworks, Libraries and Embedded Content" section and click the "+" button). - -If everything works fine, you should be able to see the inference results on your phone. - - -Conclusion ----------- - -In this tutorial, we demonstrated how to convert a mobilenetv2 model to a GPU compatible model. We walked through a HelloWorld example to show how to use the C++ APIs to run models on iOS GPU. Please be aware of that GPU feature is still under development, new operators will continue to be added. APIs are subject to change in the future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue `here `_ if you have any. - -Learn More ----------- - -- The `Mobilenetv2 `_ from Torchvision -- To learn more about how to use ``optimize_for_mobile``, please refer to the `Mobile Perf Recipe `_ + diff --git a/prototype_source/lite_interpreter.rst b/prototype_source/lite_interpreter.rst new file mode 100644 index 00000000000..73e950d72e2 --- /dev/null +++ b/prototype_source/lite_interpreter.rst @@ -0,0 +1,9 @@ +(Prototype) Introduce lite interpreter workflow in Android and iOS +================================================================== + +This tutorial has been moved to https://pytorch.org/tutorials/recipes/mobile_interpreter.html + + +.. 
raw:: html + + diff --git a/prototype_source/maskedtensor_sparsity.py b/prototype_source/maskedtensor_sparsity.py index 74024f8e229..1985135714e 100644 --- a/prototype_source/maskedtensor_sparsity.py +++ b/prototype_source/maskedtensor_sparsity.py @@ -186,19 +186,19 @@ mt_dense = mt_sparse_coo.to_dense() ###################################################################### -# :meth:`MaskedTensor.is_sparse()` -- this will check if the :class:`MaskedTensor`'s layout +# :meth:`MaskedTensor.is_sparse` -- this will check if the :class:`MaskedTensor`'s layout # matches any of the supported sparse layouts (currently COO and CSR). # -print("mt_dense.is_sparse: ", mt_dense.is_sparse()) -print("mt_sparse_coo.is_sparse: ", mt_sparse_coo.is_sparse()) -print("mt_sparse_csr.is_sparse: ", mt_sparse_csr.is_sparse()) +print("mt_dense.is_sparse: ", mt_dense.is_sparse) +print("mt_sparse_coo.is_sparse: ", mt_sparse_coo.is_sparse) +print("mt_sparse_csr.is_sparse: ", mt_sparse_csr.is_sparse) ###################################################################### # :meth:`MaskedTensor.is_sparse_coo()` # -print("mt_dense.is_sparse_coo: ", mt_dense.is_sparse_coo()) +print("mt_dense.is_sparse_coo(): ", mt_dense.is_sparse_coo()) print("mt_sparse_coo.is_sparse_coo: ", mt_sparse_coo.is_sparse_coo()) print("mt_sparse_csr.is_sparse_coo: ", mt_sparse_csr.is_sparse_coo()) @@ -206,7 +206,7 @@ # :meth:`MaskedTensor.is_sparse_csr()` # -print("mt_dense.is_sparse_csr: ", mt_dense.is_sparse_csr()) +print("mt_dense.is_sparse_csr(): ", mt_dense.is_sparse_csr()) print("mt_sparse_coo.is_sparse_csr: ", mt_sparse_coo.is_sparse_csr()) print("mt_sparse_csr.is_sparse_csr: ", mt_sparse_csr.is_sparse_csr()) diff --git a/prototype_source/max_autotune_on_CPU_tutorial.rst b/prototype_source/max_autotune_on_CPU_tutorial.rst new file mode 100644 index 00000000000..47374744938 --- /dev/null +++ b/prototype_source/max_autotune_on_CPU_tutorial.rst @@ -0,0 +1,215 @@ +Using Max-Autotune Compilation on CPU for Better 
Performance +================================================================================ + +**Author**: `Jiong Gong `__, `Leslie Fang `__, `Chunyuan Wu `__ + +In this tutorial, you will learn how to boost your PyTorch models' performance on CPU by +leveraging the max-autotune mode in the Inductor CPU backend. Explore the activation +process, understand the differences from traditional methods, and integrate max-autotune +into your code for enhanced computational efficiency. Dive into the use of advanced +GEMM templates for faster processing and superior runtime performance. + +Prerequisites: +---------------- +- `torch.compile and TorchInductor concepts in PyTorch `__ + +Introduction +------------ +The ``max-autotune`` mode for the Inductor CPU backend in ``torch.compile`` (`RFC link `_) +profiles multiple implementations of operations at compile time and selects the best-performing one, +trading longer compilation times for improved runtime performance. This enhancement is particularly beneficial for GEMM-related operations. +In the Inductor CPU backend, we’ve introduced a C++ template-based GEMM implementation as an alternative to the ATen-based approach that relies on oneDNN and MKL libraries. +This is similar to the max-autotune mode on CUDA, where implementations from ATen, Triton, and CUTLASS are considered. + +We have covered most popular data types, including FP32, BF16, FP16, and INT8, with epilogue fusions for x86 CPUs. + +While the development is still in progress, we have already seen promising speedups over pure ATen-based GEMMs as measured by the three benchmark suites and the inference of LLMs. + +Activating the ``max-autotune`` mode +------------------------------------- +To activate the ``max-autotune`` mode in PyTorch, set the ``mode`` argument to ``max-autotune`` when compiling your model using ``torch.compile``. 
+If you prefer to bypass the tuning process and always use the C++ template implementations, you can configure this via an environment variable: +``export TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=CPP``. + + +Example +------------ +The code below is an example of using the ``max-autotune`` mode on a simple neural network with a linear layer followed by a ReLU activation. + +In the C++ template-based GEMM implementation, we will pre-pack the weight for good cache usage. +In the case of inference, which is the primary scenario of CPU AI workloads, +model weights are constant and we pack them upfront during compilation +so that the data accesses are contiguous within the cache blocks. +Thus, we only support frozen models with ``torch.no_grad`` or the inference mode. +You need to set the environment variable ``export TORCHINDUCTOR_FREEZING=1`` +and ensure that both the compilation and inference steps are executed within the ``torch.no_grad`` context. + +.. code:: python + + import torch + from torch._inductor import config + config.trace.log_autotuning_results = True # enable the log of autotuning results + + class M(torch.nn.Module): + def __init__( + self, + in_features, + out_features, + bias, + **kwargs, + ): + super().__init__() + self.linear = torch.nn.Linear( + in_features, + out_features, + bias, + **kwargs, + ) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.linear(x) + x = self.relu(x) + return x + + amp_enabled = True + batch_size = 64 + in_features = 16 + out_features = 32 + bias = True + + x = torch.randn(batch_size, in_features) + model = M(in_features, out_features, bias) + + with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): + compiled = torch.compile(model, mode="max-autotune") # turn on "max-autotune" mode + y = compiled(x) + + +When running the above code snippet, you will see the autotuning result (the performance numbers are for demonstration purposes). 
+In this example, the C++ template outperforms the ATen kernel, so it will be selected. + +.. code:: shell + + AUTOTUNE linear_unary(64x16, 32x16, 32) + cpp_packed_gemm_0 0.2142 ms 100.0% + _linear_pointwise 0.2441 ms 87.7% + + +We can check the generated output code by setting ``export TORCH_LOGS="+output_code"``. +When the C++ template is selected, we won't have ``torch.ops.mkldnn._linear_pointwise.default`` (for bfloat16) or ``torch.ops.mkl._mkl_linear.default`` (for float32) +in the generated code anymore; instead, we'll find a kernel based on the CPP GEMM template ``cpp_fused__to_copy_relu_1`` +(only part of the code is demonstrated below for simplicity) with the bias and relu epilogues fused inside the C++ GEMM template kernel. + +The generated code differs by CPU architecture and is implementation-specific, which is subject to change. + +.. code:: python + + cpp_fused__to_copy_relu_1 = async_compile.cpp_pybinding(['const bfloat16*', 'const bfloat16*', 'const bfloat16*', 'bfloat16*'], ''' + + ... + + template + inline void kernel_micro_gemm_amx_kernel_32_2( + AMXState& amx_state, + const bfloat16* __restrict__ A, + const bfloat16* __restrict__ B, + float* __restrict__ C, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc, + uint8_t tilecfg_rows + ) { + ... + } + + ... + + template + inline void kernel_micro_gemm( + AMXState& amx_state, + const bfloat16* __restrict__ A, + const bfloat16* __restrict__ B, + float* __restrict__ C, + int64_t M, + int64_t N, + int64_t K, + int64_t lda, + int64_t ldb, + int64_t ldc + ) { + ... + } + + extern "C" + void kernel(const bfloat16* X, const bfloat16* W, const bfloat16* inp, bfloat16* Y) + { + constexpr int64_t num_threads = 40; + constexpr int64_t N = 32; + constexpr int64_t K = 16; + constexpr int64_t M = static_cast(64L); + ... + #pragma omp parallel num_threads(40) + { + const int tid = omp_get_thread_num(); + ... + for (int64_t mc_block_id = 0; mc_block_id < num_Mc_blocks_per_thread; mc_block_id++) { + ... 
+ for (int64_t nc = n_block_start; nc < n_block_end; nc += Nc_blocks) { + ... + for (int64_t kc = k_block_start; kc < k_block_end; kc += Kc_blocks) { + ... + for (int64_t nci = nc; nci < nc_block_end; nci++) { + if (kc == k_block_start) { + kernel_micro_gemm(false)>( + ... + ); + + } else { + kernel_micro_gemm(true)>( + ... + ); + + } + } + } + { + { + // Epilogue fusion here for bias and relu + #pragma GCC ivdep + for(int64_t x0=static_cast(0L); x0(m_end + ((-1L)*m_start)); x0+=static_cast(1L)) + { + for(int64_t x1=static_cast(0L); x1(16L*(c10::div_floor_integer(static_cast((n_end + ((-1L)*n_start))), static_cast(16L)))); x1+=static_cast(16L)) + { + auto tmp0 = at::vec::Vectorized::loadu(inp + static_cast(n_start + x1), static_cast(16)); + auto tmp2 = at::vec::Vectorized::loadu(local_acc_buf + static_cast(x1 + (Nc_blocks*Nr*x0)), static_cast(16)); + auto tmp1 = at::vec::convert(tmp0); + auto tmp3 = tmp1 + tmp2; + auto tmp4 = at::vec::convert(tmp3); + auto tmp5 = static_cast(0.0); + auto tmp6 = at::vec::Vectorized(tmp5); + auto tmp7 = at::vec::maximum(tmp3, tmp6); + auto tmp8 = at::vec::convert(tmp7); + tmp8.store(Y + static_cast(n_start + x1 + (32L*m_start) + (32L*x0)), static_cast(16)); + } + + ... + + } + } + + } + } + } + ... + } + } + ''') + +Conclusion +------------ +In this tutorial, we introduced max-autotune support on CPU with GEMM template. We explained the API to activate this feature, and demonstrated +the generated code of the GEMM template. + +This feature is in prototype stage. If you have any feature requests or run into any issues, please file a bug report at `GitHub issues `_. 
diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py index 0d2898cc4ac..ecf099c1e02 100644 --- a/prototype_source/nestedtensor.py +++ b/prototype_source/nestedtensor.py @@ -1,38 +1,47 @@ """ -NestedTensors +Getting Started with Nested Tensors =============================================================== -NestedTensors are similar to regular tensors, except for their shape: +Nested tensors generalize the shape of regular dense tensors, allowing for representation +of ragged-sized data. -* for a regular tensor, each dimension has a size +* for a regular tensor, each dimension is regular and has a size -* for a nestedtensor, not all dimensions have regular sizes; some of them are jagged +* for a nested tensor, not all dimensions have regular sizes; some of them are ragged -Nestedtensors are a natural solution for representing sequential data within various domains: +Nested tensors are a natural solution for representing sequential data within various domains: -* in NLP, sentences can have variable lengths, so a batch of sentences forms a nestedtensor +* in NLP, sentences can have variable lengths, so a batch of sentences forms a nested tensor -* in CV, images can have variable shapes, so a batch of images forms a nestedtensor +* in CV, images can have variable shapes, so a batch of images forms a nested tensor -In this tutorial, we will demonstrate basic usage of nestedtensors and motivate their usefulness -for operating on sequential data of varying lengths with a real-world example. +In this tutorial, we will demonstrate basic usage of nested tensors and motivate their usefulness +for operating on sequential data of varying lengths with a real-world example. In particular, +they are invaluable for building transformers that can efficiently operate on ragged sequential +inputs. 
Below, we present an implementation of multi-head attention using nested tensors that, +combined with usage of ``torch.compile``, outperforms operating naively on tensors with padding. -NestedTensor are currently a prototype feature and are subject to change. +Nested tensors are currently a prototype feature and are subject to change. """ +import numpy as np +import timeit import torch import torch.nn.functional as F +from torch import nn + +torch.manual_seed(1) +np.random.seed(1) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') ###################################################################### -# NestedTensor Initialization -# ---------------- +# Nested tensor initialization +# ---------------------------- # - -###################################################################### -# From the Python frontend, a nestedtensor can be created from a list of tensors. +# From the Python frontend, a nested tensor can be created from a list of tensors. # We denote nt[i] as the ith tensor component of a nestedtensor. nt = torch.nested.nested_tensor([torch.arange(12).reshape( 2, 6), torch.arange(18).reshape(3, 6)], dtype=torch.float, device=device) @@ -66,10 +75,8 @@ ###################################################################### # Nested Tensor Operations -# ---------------- +# ------------------------ # - -###################################################################### # As each operation must be explicitly implemented for nestedtensors, # operation coverage for nestedtensors is currently narrower than that of regular tensors. # For now, only basic operations such as index, dropout, softmax, transpose, reshape, linear, bmm are covered. 
@@ -123,7 +130,7 @@ ###################################################################### # Why Nested Tensor -# ---------------- +# ----------------- # ###################################################################### @@ -145,12 +152,15 @@ print(f"{nested_sentences=}") ###################################################################### -# This techinque of padding a batch of data to its max length is not optimal. +# This technique of padding a batch of data to its max length is not optimal. # The padded data is not needed for computation and wastes memory by allocating # larger tensors than necessary. # Further, not all operations have the same semnatics when applied to padded data. # For matrix multiplications in order to ignore the padded entries, one needs to pad # with 0 while for softmax one has to pad with -inf to ignore specific entries. +# The primary objective of nested tensor is to facilitate operations on ragged +# data using the standard PyTorch tensor UX, thereby eliminating the need +# for inefficient and complex padding and masking. padded_sentences_for_softmax = torch.tensor([[1.0, 2.0, float("-inf")], [3.0, 4.0, 5.0]]) print(F.softmax(padded_sentences_for_softmax, -1)) @@ -159,199 +169,83 @@ ###################################################################### # Let us take a look at a practical example: the multi-head attention component # utilized in `Transformers `__. -# The nestedtensor version is straightforward. -import math - -def mha_nested(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, nheads: int, - W_q: torch.Tensor, W_k: torch.Tensor, W_v: torch.Tensor, W_out: torch.Tensor, - b_q: torch.Tensor = None, b_k: torch.Tensor = None, b_v: torch.Tensor = None, b_out: torch.Tensor = None, - dropout_p: float = 0.0) -> torch.Tensor: - """Compute multi-head attention with nested tensors. 
- Args: - query (torch.Tensor): query of shape (N, L_t, E_q) - key (torch.Tensor): key of shape (N, L_s, E_k) - value (torch.Tensor): value of shape (N, L_s, E_v) - nheads (int): number of heads in multi-head attention - W_q (torch.Tensor): Weight for query input projection of shape (E_total, E_q) - W_k (torch.Tensor): Weight for key input projection of shape (E_total, E_k) - W_v (torch.Tensor): Weight for value input projection of shape (E_total, E_v) - W_out (torch.Tensor): Weight for output projection of shape (E_out, E_total) - b_q (torch.Tensor, optional): Bias for query input projection of shape E_total. Default: None. Defaults to None. - b_k (torch.Tensor, optional): Bias for key input projection of shape E_total. Default: None. Defaults to None. - b_v (torch.Tensor, optional): Bias for value input projection of shape E_total. Default: None. Defaults to None. - b_out (torch.Tensor, optional): Bias for output projection of shape E_out. Default: None. Defaults to None. - dropout_p (float, optional): Dropout probability. Defaults to 0.0. - - Where: - N is the batch size - L_t is the target sequence length (jagged) - L_s is the source sequence length (jagged) - E_q is the embedding size for query - E_k is the embedding size for key - E_v is the embedding size for value - E_total is the embedding size for all heads combined - E_out is the output embedding size - Returns: - torch.Tensor: Output of shape (N, L_t, E_out) +# We can implement this in such a way that it can operate on either padded +# or nested tensors. 
+class MultiHeadAttention(nn.Module): """ - - N = query.size(0) - E_total = W_q.size(0) - assert E_total % nheads == 0, "Embedding dim is not divisible by nheads" - E_head = E_total // nheads - - # apply input projection - # (N, L_t, E_q) -> (N, L_t, E_total) - query = F.linear(query, W_q, b_q) - # (N, L_s, E_k) -> (N, L_s, E_total) - key = F.linear(key, W_k, b_k) - # (N, L_s, E_v) -> (N, L_s, E_total) - value = F.linear(value, W_v, b_v) - - # reshape query, key, value to separate by head - # (N, L_t, E_total) -> (N, L_t, nheads, E_head) -> (N, nheads, L_t, E_head) - query = query.reshape(N, -1, nheads, E_head).transpose(1, 2) - # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) - key = key.reshape(N, -1, nheads, E_head).transpose(1, 2) - # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) - value = value.reshape(N, -1, nheads, E_head).transpose(1, 2) - - # query matmul key^T - # (N, nheads, L_t, E_head) x (N, nheads, L_s, E_head)^T -> (N, nheads, L_t, L_s) - keyT = key.transpose(-1, -2) - attn_weights = torch.matmul(query, keyT) - - # scale down - attn_weights = attn_weights * (1.0 / math.sqrt(E_head)) - - # softmax - attn_weights = F.softmax(attn_weights, dim=-1) - - # dropout - if dropout_p > 0.0: - attn_weights = F.dropout(attn_weights, p=dropout_p) - - # attention_weights matmul value - # (N, nheads, L_t, L_s) x (N, nheads, L_s, E_head) -> (N, nheads, L_t, E_head) - attn_output = torch.matmul(attn_weights, value) - - # merge heads - # (N, nheads, L_t, E_head) -> (N, L_t, nheads, E_head) -> (N, L_t, E_total) - attn_output = attn_output.transpose(1, 2).reshape(N, -1, E_total) - - # apply output projection - # (N, L_t, E_total) -> (N, L_t, E_out) - attn_output = F.linear(attn_output, W_out, b_out) - - return attn_output - -###################################################################### -# The 0-padded tensor version additionally requires masks -# for more complicated treatments at padded entries. 
-def mha_padded(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, nheads: int, - attn_mask_q: torch.Tensor, attn_mask_kv: torch.Tensor, - W_q: torch.Tensor, W_k: torch.Tensor, W_v: torch.Tensor, W_out: torch.Tensor, - b_q: torch.Tensor = None, b_k: torch.Tensor = None, b_v: torch.Tensor = None, b_out: torch.Tensor = None, - dropout_p: float = 0.0) -> torch.Tensor: - """Compute multi-head attention for padded out dense tensors. + Computes multi-head attention. Supports nested or padded tensors. Args: - query (torch.Tensor): query of shape (N, L_t, E_q) - key (torch.Tensor): key of shape (N, L_s, E_k) - value (torch.Tensor): value of shape (N, L_s, E_v) - nheads (int): number of heads in multi-head attention - attn_mask_q (torch.Tensor): boolean mask indicating locations that should not take part in attention for query, shape (N, L_t) - attn_mask_kv (torch.Tensor): boolean mask indicating locations that should not take part in attention for key and value, shape (N, L_s) - W_q (torch.Tensor): Weight for query input projection of shape (E_total, E_q) - W_k (torch.Tensor): Weight for key input projection of shape (E_total, E_k) - W_v (torch.Tensor): Weight for value input projection of shape (E_total, E_v) - W_out (torch.Tensor): Weight for output projection of shape (E_out, E_total) - b_q (torch.Tensor, optional): Bias for query input projection of shape E_total.. Defaults to None. - b_k (torch.Tensor, optional): Bias for key input projection of shape E_total.. Defaults to None. - b_v (torch.Tensor, optional): Bias for value input projection of shape E_total.. Defaults to None. - b_out (torch.Tensor, optional): Bias for output projection of shape E_out. Defaults to None. - dropout_p (float, optional): Dropout probability. Defaults to 0.0. 
- - Where: - N is the batch size - L_t is the target sequence length (padded) - L_s is the source sequence length (padded) - E_q is the embedding size for query - E_k is the embedding size for key - E_v is the embedding size for value - E_total is the embedding size for all heads combined - E_out is the output embedding size - Returns: - torch.Tensor: Output of shape (N, L_t, E_out) + E_q (int): Size of embedding dim for query + E_k (int): Size of embedding dim for key + E_v (int): Size of embedding dim for value + E_total (int): Total embedding dim of combined heads post input projection. Each head + has dim E_total // nheads + nheads (int): Number of heads + dropout_p (float, optional): Dropout probability. Default: 0.0 """ - N = query.size(0) - L_t = query.size(1) - L_s = key.size(1) - E_total = W_q.size(0) - assert E_total % nheads == 0, "Embedding dim is not divisible by nheads" - assert L_t == L_s, "This implementation assumes equal query and key sequence lengths" - E_head = E_total // nheads - - # apply input projection - # (N, L_t, E_q) -> (N, L_t, E_total) - query = F.linear(query, W_q, b_q) - # (N, L_s, E_k) -> (N, L_s, E_total) - key = F.linear(key, W_k, b_k) - # (N, L_s, E_v) -> (N, L_s, E_total) - value = F.linear(value, W_v, b_v) - - # reshape query, key, value to separate by head - # (N, L_t, E_total) -> (N, L_t, nheads, E_head) -> (N, nheads, L_t, E_head) -> (N * nheads, L_t, E_head) - query = query.reshape(N, -1, nheads, E_head).transpose(1, 2).reshape(N * nheads, -1, E_head) - # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) -> (N * nheads, L_s, E_head) - key = key.reshape(N, -1, nheads, E_head).transpose(1, 2).reshape(N * nheads, -1, E_head) - # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) -> (N * nheads, L_s, E_head) - value = value.reshape(N, -1, nheads, E_head).transpose(1, 2).reshape(N * nheads, -1, E_head) - - # query bmm key^T - # (N * nheads, L_t, E_head) x (N * nheads, L_s, E_head)^T 
-> (N * nheads, L_t, L_s) - keyT = key.transpose(-1, -2) - attn_weights = torch.bmm(query, keyT) - - # scale down - attn_weights = attn_weights * (1.0 / math.sqrt(E_head)) - - # Have to manipulate masks in order to apply them to the attention weights - key_padding_mask = attn_mask_q.view(N, 1, 1, L_t).expand(-1, nheads, -1, -1).reshape(N*nheads, 1, L_t).to(device=device) - attn_mask = torch.zeros(key_padding_mask.shape, device=device, dtype=torch.float32) - attn_mask = attn_mask.masked_fill_(key_padding_mask, float("-inf")) - - # Zero out the attention weights where the mask is True by adding -inf prior to softmax - attn_weights.add_(attn_mask) - - # softmax - attn_weights = F.softmax(attn_weights, dim=-1).nan_to_num_(0.0) - - # dropout - if dropout_p > 0.0: - attn_weights = F.dropout(attn_weights, p=dropout_p) - - # attention_weights bmm value - # (N * nheads, L_t, L_s) x (N * nheads, L_s, E_head) -> (N * nheads, L_t, E_head) - attn_output = attn_weights.bmm(value) - - # merge heads - # (N * nheads, L_t, E_head) -> (N, nheads, L_t, E_head) -> (N, L_t, nheads, E_head) -> (N, L_t, E_total) - attn_output = attn_output.reshape(N, nheads, -1, E_head).transpose(1, 2).reshape(N, -1, E_total) - - # apply output projection - # (N, L_t, E_total) -> (N, L_t, E_out) - attn_output = F.linear(attn_output, W_out, b_out) - - # padding-specific step: remove output projection bias from padded entries - attn_output[attn_mask_q, :] = 0.0 - - return attn_output + def __init__(self, E_q: int, E_k: int, E_v: int, E_total: int, + nheads: int, dropout_p: float = 0.0): + super().__init__() + self.nheads = nheads + self.dropout_p = dropout_p + self.query_proj = nn.Linear(E_q, E_total) + self.key_proj = nn.Linear(E_k, E_total) + self.value_proj = nn.Linear(E_v, E_total) + E_out = E_q + self.out_proj = nn.Linear(E_total, E_out) + assert E_total % nheads == 0, "Embedding dim is not divisible by nheads" + self.E_head = E_total // nheads + + def forward(self, query: torch.Tensor, key: 
torch.Tensor, value: torch.Tensor) -> torch.Tensor: + """ + Forward pass; runs the following process: + 1. Apply input projection + 2. Split heads and prepare for SDPA + 3. Run SDPA + 4. Apply output projection + + Args: + query (torch.Tensor): query of shape (N, L_t, E_q) + key (torch.Tensor): key of shape (N, L_s, E_k) + value (torch.Tensor): value of shape (N, L_s, E_v) + + Returns: + attn_output (torch.Tensor): output of shape (N, L_t, E_q) + """ + # Step 1. Apply input projection + # TODO: demonstrate packed projection + query = self.query_proj(query) + key = self.key_proj(key) + value = self.value_proj(value) + + # Step 2. Split heads and prepare for SDPA + # reshape query, key, value to separate by head + # (N, L_t, E_total) -> (N, L_t, nheads, E_head) -> (N, nheads, L_t, E_head) + query = query.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2) + # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) + key = key.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2) + # (N, L_s, E_total) -> (N, L_s, nheads, E_head) -> (N, nheads, L_s, E_head) + value = value.unflatten(-1, [self.nheads, self.E_head]).transpose(1, 2) + + # Step 3. Run SDPA + # (N, nheads, L_t, E_head) + attn_output = F.scaled_dot_product_attention( + query, key, value, dropout_p=dropout_p, is_causal=True) + # (N, nheads, L_t, E_head) -> (N, L_t, nheads, E_head) -> (N, L_t, E_total) + attn_output = attn_output.transpose(1, 2).flatten(-2) + + # Step 4. 
Apply output projection + # (N, L_t, E_total) -> (N, L_t, E_out) + attn_output = self.out_proj(attn_output) + + return attn_output ###################################################################### # set hyperparameters following `the Transformer paper `__ N = 512 -E_q, E_k, E_v, E_total, E_out = 512, 512, 512, 512, 512 +E_q, E_k, E_v, E_total = 512, 512, 512, 512 +E_out = E_q nheads = 8 ###################################################################### @@ -360,9 +254,7 @@ def mha_padded(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, nhea ###################################################################### # Let us generate some realistic fake data from Zipf's law. -import numpy as np - -def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray: +def zipf_sentence_lengths(alpha: float, batch_size: int) -> torch.Tensor: # generate fake corpus by unigram Zipf distribution # from wikitext-2 corpus, we get rank "." = 3, "!" = 386, "?" = 858 sentence_lengths = np.empty(batch_size, dtype=int) @@ -372,124 +264,108 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray: while word != 3 and word != 386 and word != 858: sentence_lengths[ibatch] += 1 word = np.random.zipf(alpha) - return sentence_lengths + return torch.tensor(sentence_lengths) -alpha = 1.2 +###################################################################### +# Create nested tensor batch inputs +def gen_batch(N, E_q, E_k, E_v, device): + # generate semi-realistic data using Zipf distribution for sentence lengths + sentence_lengths = zipf_sentence_lengths(alpha=1.2, batch_size=N) -sentence_lengths = zipf_sentence_lengths(alpha, N) -L_t = np.max(sentence_lengths) -L_s = L_t + # Note: the torch.jagged layout is a nested tensor layout that supports a single ragged + # dimension and works with torch.compile. The batch items each have shape (B, S*, D) + # where B = batch size, S* = ragged sequence length, and D = embedding dimension. 
+ query = torch.nested.nested_tensor([ + torch.randn(l.item(), E_q, device=device) + for l in sentence_lengths + ], layout=torch.jagged) -###################################################################### -# create inputs - -# create parameters -W_q, b_q = torch.randn((E_total, E_q), device=device), torch.randn(E_total, device=device) -W_k, b_k = torch.randn((E_total, E_k), device=device), torch.randn(E_total, device=device) -W_v, b_v = torch.randn((E_total, E_v), device=device), torch.randn(E_total, device=device) -W_out, b_out = torch.randn((E_out, E_total), device=device), torch.randn(E_out, device=device) - -# create nested input -queries = [] -keys = [] -values = [] -for i in range(N): - l = sentence_lengths[i] - s = l - queries.append(torch.randn((l, E_q), device=device)) - keys .append(torch.randn((s, E_k), device=device)) - values .append(torch.randn((s, E_v), device=device)) -query = torch.nested.nested_tensor(queries) -key = torch.nested.nested_tensor(keys) -value = torch.nested.nested_tensor(values) - -# pad input -padded_query = torch.nested.to_padded_tensor(query, 0.0, (N, L_t, E_q)) -padded_key = torch.nested.to_padded_tensor(key, 0.0, (N, L_s, E_k)) -padded_value = torch.nested.to_padded_tensor(value, 0.0, (N, L_s, E_v)) - -# create attention masks -attn_mask_q = torch.zeros((N, L_t), dtype=torch.bool) -attn_mask_kv = torch.zeros((N, L_s), dtype=torch.bool) - -# We need to mask out the padding entries in the attention weights. 
-for i, entry_length in enumerate(sentence_lengths): - attn_mask_q[i, entry_length:] = True - attn_mask_kv[i, entry_length:] = True + key = torch.nested.nested_tensor([ + torch.randn(s.item(), E_k, device=device) + for s in sentence_lengths + ], layout=torch.jagged) + + value = torch.nested.nested_tensor([ + torch.randn(s.item(), E_v, device=device) + for s in sentence_lengths + ], layout=torch.jagged) + + return query, key, value, sentence_lengths + +query, key, value, sentence_lengths = gen_batch(N, E_q, E_k, E_v, device) ###################################################################### -# check correctness and performance +# Generate padded forms of query, key, value for comparison +def jagged_to_padded(jt, padding_val): + # TODO: do jagged -> padded directly when this is supported + return torch.nested.to_padded_tensor( + torch.nested.nested_tensor(list(jt.unbind())), + padding_val) -import timeit +padded_query, padded_key, padded_value = ( + jagged_to_padded(t, 0.0) for t in (query, key, value) +) -t0 = timeit.default_timer() -out_nested = mha_nested( - query, key, value, nheads, - W_q, W_k, W_v, W_out, - b_q=b_q, b_k=b_k, b_v=b_v, b_out=b_out, - dropout_p=dropout_p) - -t1 = timeit.default_timer() -out_padded = mha_padded( - padded_query, padded_key, padded_value, nheads, - attn_mask_q, attn_mask_kv, - W_q, W_k, W_v, W_out, - b_q=b_q, b_k=b_k, b_v=b_v, b_out=b_out, - dropout_p=dropout_p) -t2 = timeit.default_timer() - -print("nested and padded calculations differ by", (torch.nested.to_padded_tensor(out_nested, 0.0, (N, L_t, E_out)) - out_padded).abs().max().item()) -print("nestedtensor multi-head attention takes", t1 - t0, "seconds") -print("padded tensor multi-head attention takes", t2 - t1, "seconds") +###################################################################### +# Construct the model +mha = MultiHeadAttention(E_q, E_k, E_v, E_total, nheads, dropout_p).to(device=device) ###################################################################### -# 
Although the nestedtensor version avoids wasted computation on padding, it is not faster -# then the equivalent padded tensor version. This is because the nestedtensor version -# has implemented a few of the kernels, like softmax, in a non optimal way. -# -# There are plans to implement performance critical operations using the new Pytorch 2.0 stack -# For now, some performant kernels are provided for specific use cases, e.g. -# self-attention evaluation by multi-head attention formula. +# Check correctness and performance +def benchmark(func, *args, **kwargs): + torch.cuda.synchronize() + begin = timeit.default_timer() + output = func(*args, **kwargs) + torch.cuda.synchronize() + end = timeit.default_timer() + return output, (end - begin) + +output_nested, time_nested = benchmark(mha, query, key, value) +output_padded, time_padded = benchmark(mha, padded_query, padded_key, padded_value) + +# padding-specific step: remove output projection bias from padded entries for fair comparison +for i, entry_length in enumerate(sentence_lengths): + output_padded[i, entry_length:] = 0.0 + +print("=== without torch.compile ===") +print("nested and padded calculations differ by", (jagged_to_padded(output_nested, 0.0) - output_padded).abs().max().item()) +print("nested tensor multi-head attention takes", time_nested, "seconds") +print("padded tensor multi-head attention takes", time_padded, "seconds") + +# warm up compile first... +compiled_mha = torch.compile(mha) +compiled_mha(query, key, value) +# ...now benchmark +compiled_output_nested, compiled_time_nested = benchmark( + compiled_mha, query, key, value) + +# warm up compile first... 
+compiled_mha(padded_query, padded_key, padded_value) +# ...now benchmark +compiled_output_padded, compiled_time_padded = benchmark( + compiled_mha, padded_query, padded_key, padded_value) + +# padding-specific step: remove output projection bias from padded entries for fair comparison +for i, entry_length in enumerate(sentence_lengths): + compiled_output_padded[i, entry_length:] = 0.0 -# embeddings are assumed to be the same -E = E_total -mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True, device=device) -mha_lib.eval() +print("=== with torch.compile ===") +print("nested and padded calculations differ by", (jagged_to_padded(compiled_output_nested, 0.0) - compiled_output_padded).abs().max().item()) +print("nested tensor multi-head attention takes", compiled_time_nested, "seconds") +print("padded tensor multi-head attention takes", compiled_time_padded, "seconds") ###################################################################### -# extract parameters for correctness check -mha_lib.in_proj_weight.requires_grad_(False) -mha_lib.in_proj_bias.requires_grad_(False) -mha_lib.out_proj.weight.requires_grad_(False) -mha_lib.out_proj.bias.requires_grad_(False) -W_q, b_q = mha_lib.in_proj_weight[: E, :], mha_lib.in_proj_bias[: E] -W_k, b_k = mha_lib.in_proj_weight[E : 2 * E, :], mha_lib.in_proj_bias[E : 2 * E] -W_v, b_v = mha_lib.in_proj_weight[2 * E :, :], mha_lib.in_proj_bias[2 * E :] -W_out, b_out = mha_lib.out_proj.weight, mha_lib.out_proj.bias +# Note that without ``torch.compile``, the overhead of the python subclass nested tensor +# can make it slower than the equivalent computation on padded tensors. However, once +# ``torch.compile`` is enabled, operating on nested tensors gives a multiple x speedup. +# Avoiding wasted computation on padding becomes only more valuable as the percentage +# of padding in the batch increases. 
+print(f"Nested speedup: {compiled_time_padded / compiled_time_nested:.3f}") ###################################################################### -# If we set need_weights to False this will enable the fast path in the library. -# Under the hood this will call _scaled_dot_product_attention. If your tensors -# are on CUDA, than a fused, efficient attention kernel will be used. For -# more detailed performance characteristics look at the benchmark in -# pytorch/benchmarks/transformer/sdp.py - -with torch.inference_mode(): - t0 = timeit.default_timer() - out_lib, out_lib_weights = mha_lib(query, query, query, need_weights=False) - - t1 = timeit.default_timer() - padded_out = mha_padded( - padded_query, padded_query, padded_query, nheads, - attn_mask_q, attn_mask_q, - W_q, W_k, W_v, W_out, - b_q=b_q, b_k=b_k, b_v=b_v, b_out=b_out, - dropout_p=dropout_p) - t2 = timeit.default_timer() - -nested_time = t1 - t0 -padded_time = t2 - t1 -print("Nested and padded calculations differ by", (torch.nested.to_padded_tensor(out_lib, 0.0) - padded_out).abs().max().item()) -print("Nested library multi-head attention takes", nested_time, "seconds") -print("Padded tensor multi-head attention takes", padded_time, "seconds") -print(f"Nested Speedup: {padded_time / nested_time:.3f}") \ No newline at end of file +# Conclusion +# ---------- +# In this tutorial, we have learned how to perform basic operations with nested tensors and +# how to implement multi-head attention for transformers in a way that avoids computation on padding. +# For more information, check out the docs for the +# `torch.nested <https://pytorch.org/docs/stable/nested.html>`__ namespace. 
diff --git a/prototype_source/nnapi_mobilenetv2.rst b/prototype_source/nnapi_mobilenetv2.rst index 3036fdefa46..ef7edc92d12 100644 --- a/prototype_source/nnapi_mobilenetv2.rst +++ b/prototype_source/nnapi_mobilenetv2.rst @@ -1,218 +1,10 @@ (Beta) Convert MobileNetV2 to NNAPI ======================================== -Introduction ------------- +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -This tutorial shows how to prepare a computer vision model to use -`Android's Neural Networks API (NNAPI) `_. -NNAPI provides access to powerful and efficient computational cores -on many modern Android devices. +Redirecting in 3 seconds... -PyTorch's NNAPI is currently in the "prototype" phase and only supports -a limited range of operators, but we expect to solidify the integration -and expand our operator support over time. +.. raw:: html - -Environment ------------ - -Install PyTorch and torchvision. - -``pip install torch==1.10.0 torchvision==0.11.1`` - - -Model Preparation ------------------ - -First, we must prepare our model to execute with NNAPI. -This step runs on your training server or laptop. -The key conversion function to call is -``torch.backends._nnapi.prepare.convert_model_to_nnapi``, -but some extra steps are required to ensure that -the model is properly structured. -Most notably, quantizing the model is required -in order to run the model on certain accelerators. - -You can copy/paste this entire Python script and run it, -or make your own modifications. -By default, it will save the models to ``~/mobilenetv2-nnapi/``. -Please create that directory first. - -.. code:: python - - #!/usr/bin/env python - import sys - import os - import torch - import torch.utils.bundled_inputs - import torch.utils.mobile_optimizer - import torch.backends._nnapi.prepare - import torchvision.models.quantization.mobilenet - from pathlib import Path - - - # This script supports 3 modes of quantization: - # - "none": Fully floating-point model. 
- # - "core": Quantize the core of the model, but wrap it a - # quantizer/dequantizer pair, so the interface uses floating point. - # - "full": Quantize the model, and use quantized tensors - # for input and output. - # - # "none" maintains maximum accuracy - # "core" sacrifices some accuracy for performance, - # but maintains the same interface. - # "full" maximized performance (with the same accuracy as "core"), - # but requires the application to use quantized tensors. - # - # There is a fourth option, not supported by this script, - # where we include the quant/dequant steps as NNAPI operators. - def make_mobilenetv2_nnapi(output_dir_path, quantize_mode): - quantize_core, quantize_iface = { - "none": (False, False), - "core": (True, False), - "full": (True, True), - }[quantize_mode] - - model = torchvision.models.quantization.mobilenet.mobilenet_v2(pretrained=True, quantize=quantize_core) - model.eval() - - # Fuse BatchNorm operators in the floating point model. - # (Quantized models already have this done.) - # Remove dropout for this inference-only use case. - if not quantize_core: - model.fuse_model() - assert type(model.classifier[0]) == torch.nn.Dropout - model.classifier[0] = torch.nn.Identity() - - input_float = torch.zeros(1, 3, 224, 224) - input_tensor = input_float - - # If we're doing a quantized model, we need to trace only the quantized core. - # So capture the quantizer and dequantizer, use them to prepare the input, - # and replace them with identity modules so we can trace without them. - if quantize_core: - quantizer = model.quant - dequantizer = model.dequant - model.quant = torch.nn.Identity() - model.dequant = torch.nn.Identity() - input_tensor = quantizer(input_float) - - # Many NNAPI backends prefer NHWC tensors, so convert our input to channels_last, - # and set the "nnapi_nhwc" attribute for the converter. - input_tensor = input_tensor.contiguous(memory_format=torch.channels_last) - input_tensor.nnapi_nhwc = True - - # Trace the model. 
NNAPI conversion only works with TorchScript models, - # and traced models are more likely to convert successfully than scripted. - with torch.no_grad(): - traced = torch.jit.trace(model, input_tensor) - nnapi_model = torch.backends._nnapi.prepare.convert_model_to_nnapi(traced, input_tensor) - - # If we're not using a quantized interface, wrap a quant/dequant around the core. - if quantize_core and not quantize_iface: - nnapi_model = torch.nn.Sequential(quantizer, nnapi_model, dequantizer) - model.quant = quantizer - model.dequant = dequantizer - # Switch back to float input for benchmarking. - input_tensor = input_float.contiguous(memory_format=torch.channels_last) - - # Optimize the CPU model to make CPU-vs-NNAPI benchmarks fair. - model = torch.utils.mobile_optimizer.optimize_for_mobile(torch.jit.script(model)) - - # Bundle sample inputs with the models for easier benchmarking. - # This step is optional. - class BundleWrapper(torch.nn.Module): - def __init__(self, mod): - super().__init__() - self.mod = mod - def forward(self, arg): - return self.mod(arg) - nnapi_model = torch.jit.script(BundleWrapper(nnapi_model)) - torch.utils.bundled_inputs.augment_model_with_bundled_inputs( - model, [(torch.utils.bundled_inputs.bundle_large_tensor(input_tensor),)]) - torch.utils.bundled_inputs.augment_model_with_bundled_inputs( - nnapi_model, [(torch.utils.bundled_inputs.bundle_large_tensor(input_tensor),)]) - - # Save both models. - model._save_for_lite_interpreter(str(output_dir_path / ("mobilenetv2-quant_{}-cpu.pt".format(quantize_mode)))) - nnapi_model._save_for_lite_interpreter(str(output_dir_path / ("mobilenetv2-quant_{}-nnapi.pt".format(quantize_mode)))) - - - if __name__ == "__main__": - for quantize_mode in ["none", "core", "full"]: - make_mobilenetv2_nnapi(Path(os.environ["HOME"]) / "mobilenetv2-nnapi", quantize_mode) - - -Running Benchmarks ------------------- - -Now that the models are ready, we can benchmark them on our Android devices. 
-See `our performance recipe `_ for details. -The best-performing models are likely to be the "fully-quantized" models: -``mobilenetv2-quant_full-cpu.pt`` and ``mobilenetv2-quant_full-nnapi.pt``. - -Because these models have bundled inputs, we can run the benchmark as follows: - -.. code:: shell - - ./speed_benchmark_torch --pthreadpool_size=1 --model=mobilenetv2-quant_full-nnapi.pt --use_bundled_input=0 --warmup=5 --iter=200 - -Adjusting increasing the thread pool size can can reduce latency, -at the cost of increased CPU usage. -Omitting that argument will use one thread per big core. -The CPU models can get improved performance (at the cost of memory usage) -by passing ``--use_caching_allocator=true``. - - -Running model on host --------------------- - -We can now run models on your linux machine using the reference implementation -of NNAPI. You need to build the NNAPI library from Android source code: - -* Make sure you have at least 200GB of disk space -* Follow `these instructions `_ to install ``repo`` - -.. code:: shell - - mkdir ~/android-nnapi && cd ~/android-nnapi - repo init -u https://android.googlesource.com/platform/manifest -b master - repo sync --network-only -j 16 - repo sync -l - . build/envsetup.sh - lunch aosp_x86_64-eng - mm -j16 out/host/linux-x86/lib64/libneuralnetworks.so - - -With the host build of ``libneuralnetworks.so`` you can run Pytorch NNAPI models on -your linux machine: - -.. code:: python - - #!/usr/bin/env python - import ctypes - import torch - from pathlib import Path - - ctypes.cdll.LoadLibrary(Path.home() / "android-nnapi/out/host/linux-x86/lib64/libneuralnetworks.so") - model = torch.jit.load(Path.home() / "mobilenetv2-nnapi/mobilenetv2-quant_full-nnapi.pt") - print(model(*model.get_all_bundled_inputs()[0])) - - -Integration ------------ - -The converted models are ordinary TorchScript models. -You can use them in your app just like any other PyTorch model. 
-See `https://pytorch.org/mobile/android/ `_ -for an introduction to using PyTorch on Android. - - -Learn More ----------- - -- Learn more about optimization in our - `Mobile Performance Recipe `_ -- `MobileNetV2 `_ from torchvision -- Information about `NNAPI `_ + diff --git a/prototype_source/numeric_suite_tutorial.py b/prototype_source/numeric_suite_tutorial.py index b5ec2a7e133..a630d27e6a6 100644 --- a/prototype_source/numeric_suite_tutorial.py +++ b/prototype_source/numeric_suite_tutorial.py @@ -271,7 +271,7 @@ def forward(self, x, y): ############################################################################### # Numeric Suite for Dynamic Quantization -# ------------------------------------- +# -------------------------------------- # # Numeric Suite APIs are designed in such as way that they work for both dynamic quantized model and static quantized model. We will use a model with both LSTM and Linear modules to demonstrate the usage of Numeric Suite on dynamic quantized model. This model is the same one used in the tutorial of dynamic quantization on LSTM word language model [1]. # diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst index 92a1f5f32ca..927f5f694b8 100644 --- a/prototype_source/prototype_index.rst +++ b/prototype_source/prototype_index.rst @@ -80,8 +80,8 @@ Prototype features are not available as part of binary distributions like PyPI o :card_description: Learn how to use Post Training Quantization in PyTorch 2 Export. :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png :link: ../prototype/pt2e_quant_ptq.html - :tags: Quantization - + :tags: Quantization + .. customcarditem:: :header: PyTorch 2 Export Quantization-Aware Training :card_description: Learn how to use Quantization-Aware-Training in PyTorch 2 Export. @@ -89,6 +89,12 @@ Prototype features are not available as part of binary distributions like PyPI o :link: ../prototype/pt2e_quant_qat.html :tags: Quantization +.. 
customcarditem:: + :header: PyTorch 2 Export Quantization with X86 Backend through Inductor + :card_description: Learn how to use PT2 Export Quantization with X86 Backend through Inductor. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/pt2e_quant_x86_inductor.html + :tags: Quantization .. Sparsity @@ -97,7 +103,7 @@ Prototype features are not available as part of binary distributions like PyPI o :card_description: Prune BERT to be 2:4 sparse and accelerate for inference. :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png :link: prototype/semi_structured_sparse.html - :tags: Model-Optimziation + :tags: Model-Optimization .. Mobile @@ -197,11 +203,11 @@ Prototype features are not available as part of binary distributions like PyPI o .. customcarditem:: :header: MaskedTensor: Simplifying Adagrad Sparse Semantics - :card_description: See a showcase on how masked tensors can enable sparse semantics and provide for a cleaner dev experience + :card_description: See a showcase on how masked tensors can enable sparse semantics and provide for a cleaner dev experience :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png :link: ../prototype/maskedtensor_adagrad.html :tags: MaskedTensor - + .. Model-Optimization .. customcarditem:: @@ -211,6 +217,36 @@ Prototype features are not available as part of binary distributions like PyPI o :link: ../prototype/inductor_cpp_wrapper_tutorial.html :tags: Model-Optimization +.. customcarditem:: + :header: Inductor Windows CPU Tutorial + :card_description: Speed up your models with Inductor On Windows CPU + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/inductor_windows_cpu.html + :tags: Model-Optimization + +.. 
customcarditem:: + :header: Use max-autotune compilation on CPU to gain additional performance boost + :card_description: Tutorial for max-autotune mode on CPU to gain additional performance boost + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/max_autotune_on_CPU_tutorial.html + :tags: Model-Optimization + +.. Distributed +.. customcarditem:: + :header: Flight Recorder Tutorial + :card_description: Debug stuck jobs easily with Flight Recorder + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/flight_recorder_tutorial.html + :tags: Distributed, Debugging, FlightRecorder + +.. Integration +.. customcarditem:: + :header: Out-of-tree extension autoloading in Python + :card_description: Learn how to improve the seamless integration of out-of-tree extension with PyTorch based on the autoloading mechanism. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/python_extension_autoload.html + :tags: Extending-PyTorch, Frontend-APIs + .. End of tutorial card section .. 
raw:: html @@ -232,8 +268,10 @@ Prototype features are not available as part of binary distributions like PyPI o prototype/fx_graph_mode_quant_guide.html prototype/fx_graph_mode_ptq_dynamic.html prototype/fx_graph_mode_ptq_static.html + prototype/flight_recorder_tutorial.html prototype/graph_mode_dynamic_bert_tutorial.html prototype/inductor_cpp_wrapper_tutorial.html + prototype/inductor_windows_cpu.html prototype/pt2e_quantizer.html prototype/pt2e_quant_ptq.html prototype/pt2e_quant_qat.html @@ -250,3 +288,5 @@ Prototype features are not available as part of binary distributions like PyPI o prototype/maskedtensor_sparsity.html prototype/maskedtensor_advanced_semantics.html prototype/maskedtensor_adagrad.html + prototype/python_extension_autoload.html + prototype/max_autotune_on_CPU_tutorial.html diff --git a/prototype_source/pt2e_quant_ptq.rst b/prototype_source/pt2e_quant_ptq.rst index 7f46c86e42e..4873bce7d55 100644 --- a/prototype_source/pt2e_quant_ptq.rst +++ b/prototype_source/pt2e_quant_ptq.rst @@ -51,7 +51,6 @@ The PyTorch 2 export quantization API looks like this: .. code:: python import torch - from torch._export import capture_pre_autograd_graph class M(torch.nn.Module): def __init__(self): super().__init__() @@ -65,9 +64,9 @@ The PyTorch 2 export quantization API looks like this: m = M().eval() # Step 1. 
program capture - # NOTE: this API will be updated to torch.export API in the future, but the captured - # result shoud mostly stay the same - m = capture_pre_autograd_graph(m, *example_inputs) + # This is available for pytorch 2.5+, for more details on lower pytorch versions + # please check `Export the model with torch.export` section + m = torch.export.export_for_training(m, example_inputs).module() # we get a model with aten ops @@ -77,7 +76,7 @@ The PyTorch 2 export quantization API looks like this: convert_pt2e, ) - from torch.ao.quantization.quantizer import ( + from torch.ao.quantization.quantizer.xnnpack_quantizer import ( XNNPACKQuantizer, get_symmetric_quantization_config, ) @@ -274,16 +273,13 @@ and rename it to ``data/resnet18_pretrained_float.pth``. def load_model(model_file): model = resnet18(pretrained=False) - state_dict = torch.load(model_file) + state_dict = torch.load(model_file, weights_only=True) model.load_state_dict(state_dict) model.to("cpu") return model def print_size_of_model(model): - if isinstance(model, torch.jit.RecursiveScriptModule): - torch.jit.save(model, "temp.p") - else: - torch.jit.save(torch.jit.script(model), "temp.p") + torch.save(model.state_dict(), "temp.p") print("Size (MB):", os.path.getsize("temp.p")/1e6) os.remove("temp.p") @@ -351,18 +347,28 @@ Here is how you can use ``torch.export`` to export the model: .. 
code-block:: python - from torch._export import capture_pre_autograd_graph - example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) + # for pytorch 2.5+ + exported_model = torch.export.export_for_training(model_to_quantize, example_inputs).module() + + # for pytorch 2.4 and before + # from torch._export import capture_pre_autograd_graph + # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) + # or capture with dynamic dimensions + # for pytorch 2.5+ + dynamic_shapes = tuple( + {0: torch.export.Dim("dim")} if i == 0 else None + for i in range(len(example_inputs)) + ) + exported_model = torch.export.export_for_training(model_to_quantize, example_inputs, dynamic_shapes=dynamic_shapes).module() + + # for pytorch 2.4 and before + # dynamic_shape API may vary as well # from torch._export import dynamic_dim # exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs, constraints=[dynamic_dim(example_inputs[0], 0)]) -``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. - - Import the Backend Specific Quantizer and Configure how to Quantize the Model ----------------------------------------------------------------------------- @@ -454,7 +460,7 @@ we offer in the long term might change based on feedback from PyTorch users. out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8) return out_i8 -* Reference Quantized Model Representation (available in the nightly build) +* Reference Quantized Model Representation We will have a special representation for selected ops, for example, quantized linear. Other ops are represented as ``dq -> float32_op -> q`` and ``q/dq`` are decomposed into more primitive operators. You can get this representation by using ``convert_pt2e(..., use_reference_representation=True)``. 
@@ -485,8 +491,6 @@ Now we can compare the size and model accuracy with baseline model. .. code-block:: python # Baseline model size and accuracy - scripted_float_model_file = "resnet18_scripted.pth" - print("Size of baseline model") print_size_of_model(float_model) @@ -495,6 +499,8 @@ Now we can compare the size and model accuracy with baseline model. # Quantized model size and accuracy print("Size of model after quantization") + # export again to remove unused weights + quantized_model = torch.export.export_for_training(quantized_model, example_inputs).module() print_size_of_model(quantized_model) top1, top5 = evaluate(quantized_model, criterion, data_loader_test) diff --git a/prototype_source/pt2e_quant_ptq_x86_inductor.rst b/prototype_source/pt2e_quant_ptq_x86_inductor.rst index 915148be81e..39214a51749 100644 --- a/prototype_source/pt2e_quant_ptq_x86_inductor.rst +++ b/prototype_source/pt2e_quant_ptq_x86_inductor.rst @@ -1,187 +1,10 @@ -PyTorch 2 Export Post Training Quantization with X86 Backend through Inductor -======================================================================================== +Quantization in PyTorch 2.0 Export Tutorial +=========================================== -**Author**: `Leslie Fang `_, `Weiwen Xia `_, `Jiong Gong `_, `Jerry Zhang `_ +This tutorial has been moved. -Prerequisites -^^^^^^^^^^^^^^^ +Redirecting in 3 seconds... -- `PyTorch 2 Export Post Training Quantization `_ -- `TorchInductor and torch.compile concepts in PyTorch `_ +.. raw:: html -Introduction -^^^^^^^^^^^^^^ - -This tutorial introduces the steps for utilizing the PyTorch 2 Export Quantization flow to generate a quantized model customized -for the x86 inductor backend and explains how to lower the quantized model into the inductor. - -The new quantization 2 flow uses the PT2 Export to capture the model into a graph and perform quantization transformations on top of the ATen graph. 
This approach is expected to have significantly higher model coverage, better programmability, and a simplified UX. -TorchInductor is the new compiler backend that compiles the FX Graphs generated by TorchDynamo into optimized C++/Triton kernels. - -This flow of quantization 2 with Inductor mainly includes three steps: - -- Step 1: Capture the FX Graph from the eager Model based on the `torch export mechanism `_. -- Step 2: Apply the Quantization flow based on the captured FX Graph, including defining the backend-specific quantizer, generating the prepared model with observers, - performing the prepared model's calibration, and converting the prepared model into the quantized model. -- Step 3: Lower the quantized model into inductor with the API ``torch.compile``. - -The high-level architecture of this flow could look like this: - -:: - - float_model(Python) Example Input - \ / - \ / - —-------------------------------------------------------- - | export | - —-------------------------------------------------------- - | - FX Graph in ATen - | X86InductorQuantizer - | / - —-------------------------------------------------------- - | prepare_pt2e | - | | | - | Calibrate/Train | - | | | - | convert_pt2e | - —-------------------------------------------------------- - | - Quantized Model - | - —-------------------------------------------------------- - | Lower into Inductor | - —-------------------------------------------------------- - | - Inductor - -Combining Quantization in PyTorch 2 Export and TorchInductor, we have flexibility and productivity with the new Quantization frontend -and outstanding out-of-box performance with the compiler backend. Especially on Intel fourth generation (SPR) Xeon processors which can -further boost the models' performance by leveraging the -`advanced-matrix-extensions `_ feature. - -Now, we will walk you through a step-by-step tutorial for how to use it with `torchvision resnet18 model `_. - -1. 
Capture FX Graph ---------------------- - -We will start by performing the necessary imports, capturing the FX Graph from the eager module. - -:: - - import torch - import torchvision.models as models - import copy - from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e - import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq - from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer - from torch._export import capture_pre_autograd_graph - - # Create the Eager Model - model_name = "resnet18" - model = models.__dict__[model_name](pretrained=True) - - # Set the model to eval mode - model = model.eval() - - # Create the data, using the dummy data here as an example - traced_bs = 50 - x = torch.randn(traced_bs, 3, 224, 224).contiguous(memory_format=torch.channels_last) - example_inputs = (x,) - - # Capture the FX Graph to be quantized - with torch.no_grad(): - # if you are using the PyTorch nightlies or building from source with the pytorch master, - # use the API of `capture_pre_autograd_graph` - # Note 1: `capture_pre_autograd_graph` is also a short-term API, it will be updated to use the official `torch.export` API when that is ready. - exported_model = capture_pre_autograd_graph( - model, - example_inputs - ) - # Note 2: if you are using the PyTorch 2.1 release binary or building from source with the PyTorch 2.1 release branch, - # please use the API of `torch._dynamo.export` to capture the FX Graph. - # exported_model, guards = torch._dynamo.export( - # model, - # *copy.deepcopy(example_inputs), - # aten_graph=True, - # ) - - -Next, we will have the FX Module to be quantized. - -2. Apply Quantization ----------------------------- - -After we capture the FX Module to be quantized, we will import the Backend Quantizer for X86 CPU and configure how to -quantize the model. - -:: - - quantizer = X86InductorQuantizer() - quantizer.set_global(xiq.get_default_x86_inductor_quantization_config()) - -.. 
note:: - - The default quantization configuration in ``X86InductorQuantizer`` uses 8-bits for both activations and weights. - When Vector Neural Network Instruction is not available, the oneDNN backend silently chooses kernels that assume - `multiplications are 7-bit x 8-bit `_. In other words, potential - numeric saturation and accuracy issue may happen when running on CPU without Vector Neural Network Instruction. - -After we import the backend-specific Quantizer, we will prepare the model for post-training quantization. -``prepare_pt2e`` folds BatchNorm operators into preceding Conv2d operators, and inserts observers in appropriate places in the model. - -:: - - prepared_model = prepare_pt2e(exported_model, quantizer) - -Now, we will calibrate the ``prepared_model`` after the observers are inserted in the model. - -:: - - # We use the dummy data as an example here - prepared_model(*example_inputs) - - # Alternatively: user can define the dataset to calibrate - # def calibrate(model, data_loader): - # model.eval() - # with torch.no_grad(): - # for image, target in data_loader: - # model(image) - # calibrate(prepared_model, data_loader_test) # run calibration on sample data - -Finally, we will convert the calibrated Model to a quantized Model. ``convert_pt2e`` takes a calibrated model and produces a quantized model. - -:: - - converted_model = convert_pt2e(prepared_model) - -After these steps, we finished running the quantization flow and we will get the quantized model. - - -3. Lower into Inductor ------------------------- - -After we get the quantized model, we will further lower it to the inductor backend. - -:: - - optimized_model = torch.compile(converted_model) - - # Running some benchmark - optimized_model(*example_inputs) - - -Put all these codes together, we will have the toy example code. -Please note that since the Inductor ``freeze`` feature does not turn on by default yet, run your example code with ``TORCHINDUCTOR_FREEZING=1``. 
- -For example: - -:: - - TORCHINDUCTOR_FREEZING=1 python example_x86inductorquantizer_pytorch_2_1.py - -4. Conclusion ---------------- - -With this tutorial, we introduce how to use Inductor with X86 CPU in PyTorch 2 Quantization. Users can learn about -how to use ``X86InductorQuantizer`` to quantize a model and lower it into the inductor with X86 CPU devices. + diff --git a/prototype_source/pt2e_quant_qat.rst b/prototype_source/pt2e_quant_qat.rst index d40b640128c..8f11b0730c5 100644 --- a/prototype_source/pt2e_quant_qat.rst +++ b/prototype_source/pt2e_quant_qat.rst @@ -18,7 +18,7 @@ to the post training quantization (PTQ) flow for the most part: prepare_qat_pt2e, convert_pt2e, ) - from torch.ao.quantization.quantizer import ( + from torch.ao.quantization.quantizer.xnnpack_quantizer import ( XNNPACKQuantizer, get_symmetric_quantization_config, ) @@ -36,9 +36,9 @@ to the post training quantization (PTQ) flow for the most part: m = M() # Step 1. program capture - # NOTE: this API will be updated to torch.export API in the future, but the captured - # result shoud mostly stay the same - m = capture_pre_autograd_graph(m, *example_inputs) + # This is available for pytorch 2.5+, for more details on lower pytorch versions + # please check `Export the model with torch.export` section + m = torch.export.export_for_training(m, example_inputs).module() # we get a model with aten ops # Step 2. quantization-aware training @@ -64,7 +64,7 @@ respectively. Define Helper Functions and Prepare the Dataset -------------------------------------------- +----------------------------------------------- To run the code in this tutorial using the entire ImageNet dataset, first download ImageNet by following the instructions in @@ -172,7 +172,7 @@ prepare the data. 
These steps are very similar to the ones defined in the def load_model(model_file): model = resnet18(pretrained=False) - state_dict = torch.load(model_file) + state_dict = torch.load(model_file, weights_only=True) model.load_state_dict(state_dict) return model @@ -272,24 +272,35 @@ Here is how you can use ``torch.export`` to export the model: from torch._export import capture_pre_autograd_graph example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model = capture_pre_autograd_graph(float_model, example_inputs) + # for pytorch 2.5+ + exported_model = torch.export.export_for_training(float_model, example_inputs).module() + # for pytorch 2.4 and before + # from torch._export import capture_pre_autograd_graph + # exported_model = capture_pre_autograd_graph(float_model, example_inputs) .. code:: python # or, to capture with dynamic dimensions: - from torch._export import dynamic_dim - example_inputs = (torch.rand(2, 3, 224, 224),) - exported_model = capture_pre_autograd_graph( - float_model, - example_inputs, - constraints=[dynamic_dim(example_inputs[0], 0)], + # for pytorch 2.5+ + dynamic_shapes = tuple( + {0: torch.export.Dim("dim")} if i == 0 else None + for i in range(len(example_inputs)) ) -.. note:: - - ``capture_pre_autograd_graph`` is a short term API, it will be updated to use the offical ``torch.export`` API when that is ready. 
- + exported_model = torch.export.export_for_training(float_model, example_inputs, dynamic_shapes=dynamic_shapes).module() + + # for pytorch 2.4 and before + # dynamic_shape API may vary as well + # from torch._export import dynamic_dim + + # example_inputs = (torch.rand(2, 3, 224, 224),) + # exported_model = capture_pre_autograd_graph( + # float_model, + # example_inputs, + # constraints=[dynamic_dim(example_inputs[0], 0)], + # ) + Import the Backend Specific Quantizer and Configure how to Quantize the Model ----------------------------------------------------------------------------- diff --git a/prototype_source/pt2e_quant_x86_inductor.rst b/prototype_source/pt2e_quant_x86_inductor.rst new file mode 100644 index 00000000000..f9836d6e371 --- /dev/null +++ b/prototype_source/pt2e_quant_x86_inductor.rst @@ -0,0 +1,313 @@ +PyTorch 2 Export Quantization with X86 Backend through Inductor +================================================================== + +**Author**: `Leslie Fang `_, `Weiwen Xia `_, `Jiong Gong `_, `Jerry Zhang `_ + +Prerequisites +--------------- + +- `PyTorch 2 Export Post Training Quantization `_ +- `PyTorch 2 Export Quantization-Aware Training `_ +- `TorchInductor and torch.compile concepts in PyTorch `_ +- `Inductor C++ Wrapper concepts `_ + +Introduction +-------------- + +This tutorial introduces the steps for utilizing the PyTorch 2 Export Quantization flow to generate a quantized model customized +for the x86 inductor backend and explains how to lower the quantized model into the inductor. + +The pytorch 2 export quantization flow uses the torch.export to capture the model into a graph and perform quantization transformations on top of the ATen graph. +This approach is expected to have significantly higher model coverage, better programmability, and a simplified UX. +TorchInductor is the new compiler backend that compiles the FX Graphs generated by TorchDynamo into optimized C++/Triton kernels. 
+ +This flow of quantization 2 with Inductor supports both static and dynamic quantization. Static quantization works best for CNN models, like ResNet-50. And dynamic quantization is more suitable for NLP models, like RNN and BERT. +For the difference between the two quantization types, please refer to the `following page `__. + +The quantization flow mainly includes three steps: + +- Step 1: Capture the FX Graph from the eager Model based on the `torch export mechanism `_. +- Step 2: Apply the Quantization flow based on the captured FX Graph, including defining the backend-specific quantizer, generating the prepared model with observers, + performing the prepared model's calibration or quantization-aware training, and converting the prepared model into the quantized model. +- Step 3: Lower the quantized model into inductor with the API ``torch.compile``. + +The high-level architecture of this flow could look like this: + +:: + + float_model(Python) Example Input + \ / + \ / + —-------------------------------------------------------- + | export | + —-------------------------------------------------------- + | + FX Graph in ATen + | X86InductorQuantizer + | / + —-------------------------------------------------------- + | prepare_pt2e | + | | | + | Calibrate/Train | + | | | + | convert_pt2e | + —-------------------------------------------------------- + | + Quantized Model + | + —-------------------------------------------------------- + | Lower into Inductor | + —-------------------------------------------------------- + | + Inductor + +Combining Quantization in PyTorch 2 Export and TorchInductor, we have flexibility and productivity with the new Quantization frontend +and outstanding out-of-box performance with the compiler backend. Especially on Intel fourth generation (SPR) Xeon processors which can +further boost the models' performance by leveraging the +`advanced-matrix-extensions `_ feature. 
+ +Post Training Quantization +---------------------------- + +Now, we will walk you through a step-by-step tutorial for how to use it with `torchvision resnet18 model `_ +for post training quantization. + +1. Capture FX Graph +^^^^^^^^^^^^^^^^^^^^^ + +We will start by performing the necessary imports, capturing the FX Graph from the eager module. + +:: + + import torch + import torchvision.models as models + import copy + from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e + import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq + from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer + from torch._export import capture_pre_autograd_graph + + # Create the Eager Model + model_name = "resnet18" + model = models.__dict__[model_name](pretrained=True) + + # Set the model to eval mode + model = model.eval() + + # Create the data, using the dummy data here as an example + traced_bs = 50 + x = torch.randn(traced_bs, 3, 224, 224).contiguous(memory_format=torch.channels_last) + example_inputs = (x,) + + # Capture the FX Graph to be quantized + with torch.no_grad(): + # if you are using the PyTorch nightlies or building from source with the pytorch master, + # use the API of `capture_pre_autograd_graph` + # Note 1: `capture_pre_autograd_graph` is also a short-term API, it will be updated to use the official `torch.export` API when that is ready. + exported_model = capture_pre_autograd_graph( + model, + example_inputs + ) + # Note 2: if you are using the PyTorch 2.1 release binary or building from source with the PyTorch 2.1 release branch, + # please use the API of `torch._dynamo.export` to capture the FX Graph. + # exported_model, guards = torch._dynamo.export( + # model, + # *copy.deepcopy(example_inputs), + # aten_graph=True, + # ) + + +Next, we will have the FX Module to be quantized. + +2. 
Apply Quantization +^^^^^^^^^^^^^^^^^^^^^^^ + +After we capture the FX Module to be quantized, we will import the Backend Quantizer for X86 CPU and configure how to +quantize the model. + +:: + + quantizer = X86InductorQuantizer() + quantizer.set_global(xiq.get_default_x86_inductor_quantization_config()) + +.. note:: + + The default quantization configuration in ``X86InductorQuantizer`` uses 8-bits for both activations and weights. + When Vector Neural Network Instruction is not available, the oneDNN backend silently chooses kernels that assume + `multiplications are 7-bit x 8-bit `_. In other words, potential + numeric saturation and accuracy issue may happen when running on CPU without Vector Neural Network Instruction. + +The quantization config is for static quantization by default. To apply dynamic quantization, add an argument ``is_dynamic=True`` when getting the config. + +.. code-block:: python + + quantizer = X86InductorQuantizer() + quantizer.set_global(xiq.get_default_x86_inductor_quantization_config(is_dynamic=True)) + + +After we import the backend-specific Quantizer, we will prepare the model for post-training quantization. +``prepare_pt2e`` folds BatchNorm operators into preceding Conv2d operators, and inserts observers in appropriate places in the model. + +:: + + prepared_model = prepare_pt2e(exported_model, quantizer) + +Now, we will calibrate the ``prepared_model`` after the observers are inserted in the model. This step is needed for static quantization only. + +:: + + # We use the dummy data as an example here + prepared_model(*example_inputs) + + # Alternatively: user can define the dataset to calibrate + # def calibrate(model, data_loader): + # model.eval() + # with torch.no_grad(): + # for image, target in data_loader: + # model(image) + # calibrate(prepared_model, data_loader_test) # run calibration on sample data + +Finally, we will convert the calibrated Model to a quantized Model. 
``convert_pt2e`` takes a calibrated model and produces a quantized model. + +:: + + converted_model = convert_pt2e(prepared_model) + +After these steps, we finished running the quantization flow and we will get the quantized model. + + +3. Lower into Inductor +^^^^^^^^^^^^^^^^^^^^^^^^ + +After we get the quantized model, we will further lower it to the inductor backend. The default Inductor wrapper +generates Python code to invoke both generated kernels and external kernels. Additionally, Inductor supports +C++ wrapper that generates pure C++ code. This allows seamless integration of the generated and external kernels, +effectively reducing Python overhead. In the future, leveraging the C++ wrapper, we can extend the capability +to achieve pure C++ deployment. For more comprehensive details about C++ Wrapper in general, please refer to the +dedicated tutorial on `Inductor C++ Wrapper Tutorial `_. + +:: + + # Optional: using the C++ wrapper instead of default Python wrapper + import torch._inductor.config as config + config.cpp_wrapper = True + +:: + + with torch.no_grad(): + optimized_model = torch.compile(converted_model) + + # Running some benchmark + optimized_model(*example_inputs) + +In a more advanced scenario, int8-mixed-bf16 quantization comes into play. In this instance, +a Convolution or GEMM operator produces BFloat16 output data type instead of Float32 in the absence +of a subsequent quantization node. Subsequently, the BFloat16 tensor seamlessly propagates through +subsequent pointwise operators, effectively minimizing memory usage and potentially enhancing performance. +The utilization of this feature mirrors that of regular BFloat16 Autocast, as simple as wrapping the +script within the BFloat16 Autocast context. + +:: + + with torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True), torch.no_grad(): + # Turn on Autocast to use int8-mixed-bf16 quantization. 
After lowering into Inductor CPP Backend, + # For operators such as QConvolution and QLinear: + # * The input data type is consistently defined as int8, attributable to the presence of a pair + # of quantization and dequantization nodes inserted at the input. + # * The computation precision remains at int8. + # * The output data type may vary, being either int8 or BFloat16, contingent on the presence + # of a pair of quantization and dequantization nodes at the output. + # For non-quantizable pointwise operators, the data type will be inherited from the previous node, + # potentially resulting in a data type of BFloat16 in this scenario. + # For quantizable pointwise operators such as QMaxpool2D, it continues to operate with the int8 + # data type for both input and output. + optimized_model = torch.compile(converted_model) + + # Running some benchmark + optimized_model(*example_inputs) + +Putting all of this code together, we have the toy example code. +Please note that since the Inductor ``freeze`` feature is not turned on by default yet, run your example code with ``TORCHINDUCTOR_FREEZING=1``. + +For example: + +:: + + TORCHINDUCTOR_FREEZING=1 python example_x86inductorquantizer_pytorch_2_1.py + +With the PyTorch 2.1 release, all CNN models from the TorchBench test suite have been measured and proven effective compared with the Inductor FP32 inference path. Please refer +to `this document `_ +for detailed benchmark numbers. + +Quantization Aware Training +----------------------------- + +The PyTorch 2 Export Quantization-Aware Training (QAT) is now supported on X86 CPU using X86InductorQuantizer, +followed by the subsequent lowering of the quantized model into Inductor. +For a more in-depth understanding of PT2 Export Quantization-Aware Training, +we recommend referring to the dedicated `PyTorch 2 Export Quantization-Aware Training `_. + +The PyTorch 2 Export QAT flow is largely similar to the PTQ flow: + +..
code:: python + + import torch + from torch._export import capture_pre_autograd_graph + from torch.ao.quantization.quantize_pt2e import ( + prepare_qat_pt2e, + convert_pt2e, + ) + import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq + from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(1024, 1000) + + def forward(self, x): + return self.linear(x) + + example_inputs = (torch.randn(1, 1024),) + m = M() + + # Step 1. program capture + # NOTE: this API will be updated to torch.export API in the future, but the captured + # result should mostly stay the same + exported_model = capture_pre_autograd_graph(m, example_inputs) + # we get a model with aten ops + + # Step 2. quantization-aware training + # Use Backend Quantizer for X86 CPU + # To apply dynamic quantization, add an argument ``is_dynamic=True`` when getting the config. + quantizer = X86InductorQuantizer() + quantizer.set_global(xiq.get_default_x86_inductor_quantization_config(is_qat=True)) + prepared_model = prepare_qat_pt2e(exported_model, quantizer) + + # train omitted + + converted_model = convert_pt2e(prepared_model) + # we have a model with aten ops doing integer computations when possible + + # move the quantized model to eval mode, equivalent to `m.eval()` + torch.ao.quantization.move_exported_model_to_eval(converted_model) + + # Lower the model into Inductor + with torch.no_grad(): + optimized_model = torch.compile(converted_model) + _ = optimized_model(*example_inputs) + +Please note that the Inductor ``freeze`` feature is not enabled by default. +To use this feature, you need to run example code with ``TORCHINDUCTOR_FREEZING=1``. + +For example: + +:: + + TORCHINDUCTOR_FREEZING=1 python example_x86inductorquantizer_qat.py + +Conclusion +------------ + +With this tutorial, we introduce how to use Inductor with X86 CPU in PyTorch 2 Quantization.
Users can learn about +how to use ``X86InductorQuantizer`` to quantize a model and lower it into the inductor with X86 CPU devices. diff --git a/prototype_source/pt2e_quantizer.rst b/prototype_source/pt2e_quantizer.rst index 5305760cde9..be6d6949edd 100644 --- a/prototype_source/pt2e_quantizer.rst +++ b/prototype_source/pt2e_quantizer.rst @@ -8,7 +8,7 @@ Prerequisites: Required: -- `Torchdynamo concepts in PyTorch `__ +- `Torchdynamo concepts in PyTorch `__ - `Quantization concepts in PyTorch `__ @@ -302,6 +302,76 @@ functions that are used in the example: `get_bias_qspec `__ can be used to get the ``QuantizationSpec`` from ``QuantizationConfig`` for a specific pattern. +A Note on IR for PT2E Quantization Flow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +IR means the intermediate representation of the model, for example, ``torch`` IR (``torch.nn`` modules, ``torch.nn.functional`` ops) or ``aten`` IR (``torch.ops.aten.linear``, ...). PT2E Quantization Flow is using pre autograd aten IR (the output of `torch.export` API) so that we support training. As is shown before, we need to match the operator or operator patterns before we can attach annotations on them, So the question is how do we match the pattern? + +Motivation: Problem of Matching ``aten`` IR directly +-------------------------------------------------------- + +The most straightforward way might be matching ``aten`` IR directly. + +Example:: + + for n in gm.graph.nodes: + if n.op != "call_function" or n.target not in [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + ]: + continue + relu_node = n + maybe_conv_node = n.args[0] + if ( + not isinstance(maybe_conv_node, Node) + or maybe_conv_node.op != "call_function" + or maybe_conv_node.target + not in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + ] + ): + continue + + # annotate conv and relu nodes + ... 
+ +However, one problem with using this IR is that the representation might change if the PyTorch implementation of modules or functional ops changes. But this could be unexpected since modeling users typically assume that when the eager mode model code doesn't change, they should get the same model representation after program capture as well. One concrete effect of this problem is that if a ``Quantizer`` does annotations based on recognizing ``aten`` IR patterns, then it may fail to recognize the pattern after a PyTorch version update, and the same eager mode floating point model may be left unquantized. + +Recommendation: Use ``SubgraphMatcherWithNameNodeMap`` for pattern matching +----------------------------------------------------------------------------- +Because of this, we recommend people to recognize the pattern through ``SubgraphMatcherWithNameNodeMap`` (an improved version of ``SubgraphMatcher`` that makes it easier to query the nodes that people want to annotate), through capturing a ``torch`` IR pattern (with the same program capture used for capturing the floating point model), instead of using the ``aten`` IR pattern directly. + +Example:: + + def conv_relu_pattern(input, weight, bias): + conv = torch.nn.functional.conv2d(input, weight, bias) + output = torch.nn.functional.relu(conv) + # returns an additional dict that includes a map from name to node that we want to annotate + return relu, {"input": input, "weight": weight, "bias": bias, "output": output} + + matcher = SubgraphMatcherWithNameNodeMap(conv_relu_pattern) + matches = matcher.match(model) + for match in matches: + # find input and output of the pattern + # annotate the nodes + name_node_map = match.name_node_map + input_node = name_node_map["input"] + weight_node = name_node_map["weight"] + bias_node = name_node_map["bias"] + output_node = name_node_map["relu"] + input_node.users[0].meta["quantization_annotation"] = ... + weight_node.users[0].meta["quantization_annotation"] = ...
+ bias_node.users[0].meta["quantization_annotation"] = ... + output_node.meta["quantization_annotation"] = ... + +With this, the ``Quantizer`` will still be valid even when the implementation for nn modules and functionals changes, the ``aten`` IR for floating point model will change, but since we capture the pattern again instead of hardcoding the ``aten`` IR for the pattern, we'll get the updated ``aten`` IR as well and will still be able to match the pattern. + +One caveat is that if inputs of the pattern has multiple users, we don't have a good way to identify which user node we want to annotate except for checking the aten op target. + +Another caveat is that we need to make sure we have an exhaustive list of examples (e.g. 2D, 3D, 4D inputs, real v.s. symbolic inputs, training=True v.s. training=False etc.) for the pattern to make sure cover different possible ``aten`` IR outcomes captured from the ``torch`` IR pattern. + +Note: We may provide some (pattern, list of example_inputs) or some pre-generated matcher object so people can just use them directly in the future. + Conclusion ^^^^^^^^^^^^^^^^^^^ diff --git a/prototype_source/python_extension_autoload.rst b/prototype_source/python_extension_autoload.rst new file mode 100644 index 00000000000..ee7af5d49ef --- /dev/null +++ b/prototype_source/python_extension_autoload.rst @@ -0,0 +1,184 @@ +Autoloading Out-of-Tree Extension +================================= + +**Author:** `Yuanhao Ji `__ + +The extension autoloading mechanism enables PyTorch to automatically +load out-of-tree backend extensions without explicit import statements. This +feature is beneficial for users as it enhances their +experience and enables them to follow the familiar PyTorch device +programming model without having to explicitly load or import device-specific +extensions. Additionally, it facilitates effortless +adoption of existing PyTorch applications with zero-code changes on +out-of-tree devices. 
For further details, refer to the +`[RFC] Autoload Device Extension `_. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to use out-of-tree extension autoloading in PyTorch + * Review examples with Intel Gaudi HPU, Huawei Ascend NPU + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.5 or later + +.. note:: + + This feature is enabled by default and can be disabled by using + ``export TORCH_DEVICE_BACKEND_AUTOLOAD=0``. + If you get an error like this: "Failed to load the backend extension", + this error is independent of PyTorch; you should disable this feature + and ask the out-of-tree extension maintainer for help. + +How to apply this mechanism to out-of-tree extensions? +------------------------------------------------------ + +For instance, suppose you have a backend named ``foo`` and a corresponding package named ``torch_foo``. Ensure that +your package is compatible with PyTorch 2.5 or later and includes the following snippet in its ``__init__.py`` file: + +.. code-block:: python + + def _autoload(): + print("Check things are working with `torch.foo.is_available()`.") + +Then, the only thing you need to do is define an entry point within your Python package: + +.. code-block:: python + + setup( + name="torch_foo", + version="1.0", + entry_points={ + "torch.backends": [ + "torch_foo = torch_foo:_autoload", + ], + } + ) + +Now you can import the ``torch_foo`` module by simply adding the ``import torch`` statement without the need to add ``import torch_foo``: + +.. code-block:: python + + >>> import torch + Check things are working with `torch.foo.is_available()`. + >>> torch.foo.is_available() + True + +In some cases, you might encounter issues with circular imports. The examples below demonstrate how you can address them.
+ +Examples +^^^^^^^^ + +In this example, we will be using Intel Gaudi HPU and Huawei Ascend NPU to determine how to +integrate your out-of-tree extension with PyTorch using the autoloading feature. + +`habana_frameworks.torch`_ is a Python package that enables users to run +PyTorch programs on Intel Gaudi by using the PyTorch ``HPU`` device key. + +.. _habana_frameworks.torch: https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html + +``habana_frameworks.torch`` is a submodule of ``habana_frameworks``, we add an entry point to +``__autoload()`` in ``habana_frameworks/setup.py``: + +.. code-block:: diff + + setup( + name="habana_frameworks", + version="2.5", + + entry_points={ + + 'torch.backends': [ + + "device_backend = habana_frameworks:__autoload", + + ], + + } + ) + +In ``habana_frameworks/init.py``, we use a global variable to track if our module has been loaded: + +.. code-block:: python + + import os + + is_loaded = False # A member variable of habana_frameworks module to track if our module has been imported + + def __autoload(): + # This is an entrypoint for pytorch autoload mechanism + # If the following condition is true, that means our backend has already been loaded, either explicitly + # or by the autoload mechanism and importing it again should be skipped to avoid circular imports + global is_loaded + if is_loaded: + return + import habana_frameworks.torch + +In ``habana_frameworks/torch/init.py``, we prevent circular imports by updating the state of the global variable: + +.. code-block:: python + + import os + + # This is to prevent torch autoload mechanism from causing circular imports + import habana_frameworks + + habana_frameworks.is_loaded = True + +`torch_npu`_ enables users to run PyTorch programs on Huawei Ascend NPU, it +leverages the ``PrivateUse1`` device key and exposes the device name +as ``npu`` to the end users. + +.. 
_torch_npu: https://github.com/Ascend/pytorch + +We define an entry point in `torch_npu/setup.py`_: + +.. _torch_npu/setup.py: https://github.com/Ascend/pytorch/blob/master/setup.py#L618 + +.. code-block:: diff + + setup( + name="torch_npu", + version="2.5", + + entry_points={ + + 'torch.backends': [ + + 'torch_npu = torch_npu:_autoload', + + ], + + } + ) + +Unlike ``habana_frameworks``, ``torch_npu`` uses the environment variable ``TORCH_DEVICE_BACKEND_AUTOLOAD`` +to control the autoloading process. For example, we set it to ``0`` to disable autoloading to prevent circular imports: + +.. code-block:: python + + # Disable autoloading before running 'import torch' + os.environ['TORCH_DEVICE_BACKEND_AUTOLOAD'] = '0' + + import torch + +How it works +------------ + +.. image:: ../_static/img/python_extension_autoload_impl.png + :alt: Autoloading implementation + :align: center + +Autoloading is implemented based on Python's `Entrypoints +`_ +mechanism. We discover and load all of the specific entry points +in ``torch/__init__.py`` that are defined by out-of-tree extensions. + +As shown above, after installing ``torch_foo``, your Python module can be imported +when loading the entrypoint that you have defined, and then you can do some necessary work when +calling it. + +See the implementation in this pull request: `[RFC] Add support for device extension autoloading +`_. + +Conclusion +---------- + +In this tutorial, we learned about the out-of-tree extension autoloading mechanism in PyTorch, which automatically +loads backend extensions eliminating the need to add additional import statements. We also learned how to apply +this mechanism to out-of-tree extensions by defining an entry point and how to prevent circular imports. +We also reviewed an example on how to use the autoloading mechanism with Intel Gaudi HPU and Huawei Ascend NPU. 
diff --git a/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst b/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst new file mode 100644 index 00000000000..43fd190e995 --- /dev/null +++ b/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst @@ -0,0 +1,10 @@ +Quantization in PyTorch 2.0 Export Tutorial +=========================================== + +This tutorial has been moved. + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/prototype_source/semi_structured_sparse.rst b/prototype_source/semi_structured_sparse.rst index 463a5f5d7b2..c7b82fd43cd 100644 --- a/prototype_source/semi_structured_sparse.rst +++ b/prototype_source/semi_structured_sparse.rst @@ -1,5 +1,5 @@ (prototype) Accelerating BERT with semi-structured (2:4) sparsity -================================================================ +================================================================= **Author**: `Jesse Cai `_ Like other forms of sparsity, **semi-structured sparsity** is a model optimization technique that seeks to reduce the memory overhead and latency of a neural network at the expense of some model accuracy. @@ -315,6 +315,7 @@ Now that those are defined, we just need one additional helper function, which w We will get started by loading our model and tokenizer, and then setting up our dataset. .. code:: python + # load model model_name = "bert-base-cased" tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) @@ -344,6 +345,7 @@ Running the following code gives me an F1 score of 86.9. 
This is quite close to training_args = transformers.TrainingArguments( "trainer", num_train_epochs=1, + lr_scheduler_type="constant", per_device_train_batch_size=64, per_device_eval_batch_size=512, ) @@ -446,7 +448,7 @@ We will also evaluate the model to show the accuracy degradation of zero-shot pr with torch.inference_mode(): predictions = trainer.predict(tokenized_squad_dataset["validation"]) pruned = compute_metrics( - *predictions.predictions + *predictions.predictions, tokenized_squad_dataset["validation"], squad_dataset["validation"], ) @@ -475,7 +477,7 @@ Once we've reached a satisfied state, we can call ``squash_mask`` to fuse the ma # [ 0.0000, 0.0225, -0.0395, -0.0000, ..., -0.0000, 0.0684, -0.0344, -0.0000]], device='cuda:0', requires_grad=True) Accelerating 2:4 sparse models for inference -------------------------------------------- +--------i------------------------------------ Now that we have a model in this format, we can accelerate it for inference just like in the QuickStart Guide. .. code:: python @@ -498,7 +500,7 @@ Now that we have a model in this format, we can accelerate it for inference just print("sparse eval metrics: ", metrics_sparse) sparse_perf = measure_execution_time( model, - batch_sizes_perf_cuda, + batch_sizes, tokenized_squad_dataset["validation"], ) print("sparse perf metrics: ", sparse_perf) diff --git a/prototype_source/torchscript_freezing.py b/prototype_source/torchscript_freezing.py index ca21451d6e8..b5b467a247e 100644 --- a/prototype_source/torchscript_freezing.py +++ b/prototype_source/torchscript_freezing.py @@ -2,6 +2,8 @@ Model Freezing in TorchScript ============================= +.. warning:: TorchScript is no longer in active development. + In this tutorial, we introduce the syntax for *model freezing* in TorchScript. Freezing is the process of inlining Pytorch module parameters and attributes values into the TorchScript internal representation. 
Parameter and attribute diff --git a/prototype_source/tracing_based_selective_build.rst b/prototype_source/tracing_based_selective_build.rst index 811ca1cf897..a1b56072051 100644 --- a/prototype_source/tracing_based_selective_build.rst +++ b/prototype_source/tracing_based_selective_build.rst @@ -1,201 +1,10 @@ (prototype) Tracing-based Selective Build Mobile Interpreter in Android and iOS =============================================================================== +This tutorial has been replaced with a newer tutorial on this topic: https://pytorch.org/executorch/stable/kernel-library-selective-build.html -*Author*: Chen Lai , Dhruv Matani +Redirecting in 3 seconds... -.. warning:: - Tracing-based selective build a prototype feature to minimize library size. Since the traced result relies on the model input and traced environment, if the tracer runs in a different environment than mobile interpreter, the operator list might be different from the actual used operator list and missing operators error might raise. +.. raw:: html -Introduction ------------- - - -This tutorial introduces a new way to custom build mobile interpreter to further optimize mobile interpreter size. It restricts the set of operators included in the compiled binary to only the set of operators actually needed by target models. It is a technique to reduce the binary size of PyTorch for mobile deployments. Tracing Based Selective Build runs a model with specific representative inputs, and records which operators were called. The build then includes just those operators. - - -Following are the processes to use tracing-based selective approach to build a custom mobile interpreter. - -1. *Prepare model with bundled input* - -.. code:: python - - import numpy as np - import torch - import torch.jit - import torch.utils - import torch.utils.bundled_inputs - from PIL import Image - from torchvision import transforms - - # Step 1. 
Get the model - model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scripted_module = torch.jit.script(model) - # Export full jit version model (not compatible lite interpreter), leave it here for comparison - scripted_module.save("deeplabv3_scripted.pt") - # Export lite interpreter version model (compatible with lite interpreter) - # path = "" - - scripted_module._save_for_lite_interpreter(f"${path}/deeplabv3_scripted.ptl") - - model_file = f"${path}/deeplabv3_scripted.ptl" - - # Step 2. Prepare inputs for the model - input_image_1 = Image.open(f"${path}/dog.jpg") - preprocess = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - input_tensor_1 = preprocess(input_image_1) - input_batch_1 = input_tensor_1.unsqueeze(0) # create a mini-batch as expected by the model - - scripted_module = torch.jit.load(model_file) - scripted_module.forward(input_batch_1) # optional, to validate the model can run with the input_batch_1 - - input_image_2 = Image.open(f"${path}/deeplab.jpg") - input_tensor_2 = preprocess(input_image_2) - input_batch_2 = input_tensor_2.unsqueeze(0) # create a mini-batch as expected by the model - - scripted_module = torch.jit.load(model_file) - scripted_module.forward(input_batch_2) # optional, to validate the model can run with the input_batch_2 - - # Step 3. Bundle the model with the prepared input from step2. Can bundle as many input as possible. - bundled_model_input = [ - (torch.utils.bundled_inputs.bundle_large_tensor(input_batch_1), ), - (torch.utils.bundled_inputs.bundle_large_tensor(input_batch_2), )] - bundled_model = torch.utils.bundled_inputs.bundle_inputs(scripted_module, bundled_model_input) - bundled_model._save_for_lite_interpreter(f"${path}/deeplabv3_scripted_with_bundled_input.ptl") - -2. Build tracer - -.. 
code:: shell - - MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ MAX_JOBS=16 TRACING_BASED=1 python setup.py develop - -3. Run tracer with the model with bundled input - -.. code:: shell - - ./build/bin/model_tracer --model_input_path ${path}/deeplabv3_scripted_with_bundled_input.ptl --build_yaml_path ${path}/deeplabv3_scripted.yaml - - - -Android -------- - -Get the Image Segmentation demo app in Android: https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation - -1. **Tracing-based build libtorch lite for android**: Build libtorch for android for all 4 android abis (``armeabi-v7a``, ``arm64-v8a``, ``x86``, ``x86_64``) by running - -.. code-block:: bash - - SELECTED_OP_LIST=${path}/deeplabv3_scripted.yaml TRACING_BASED=1 ./scripts/build_pytorch_android.sh - -if it will be tested on Pixel 4 emulator with ``x86``, use cmd ``BUILD_LITE_INTERPRETER=1 ./scripts/build_pytorch_android.sh x86`` to specify abi to save build time. - -.. code-block:: bash - - SELECTED_OP_LIST=${path}/deeplabv3_scripted.yaml TRACING_BASED=1 ./scripts/build_pytorch_android.sh x86 - - -After the build finish, it will show the library path: - -.. code-block:: bash - - BUILD SUCCESSFUL in 55s - 134 actionable tasks: 22 executed, 112 up-to-date - + find /Users/chenlai/pytorch/android -type f -name '*aar' - + xargs ls -lah - -rw-r--r-- 1 chenlai staff 13M Feb 11 11:48 /Users/chenlai/pytorch/android/pytorch_android/build/outputs/aar/pytorch_android-release.aar - -rw-r--r-- 1 chenlai staff 36K Feb 9 16:45 /Users/chenlai/pytorch/android/pytorch_android_torchvision/build/outputs/aar/pytorch_android_torchvision-release.aar - -2. **Use the PyTorch Android libraries built from source in the ImageSegmentation app**: Create a folder `libs` in the path, the path from repository root will be `ImageSegmentation/app/libs`. Copy `pytorch_android-release` to the path ``ImageSegmentation/app/libs/pytorch_android-release.aar``. 
Copy `pytorch_android_torchvision` (downloaded from `Pytorch Android Torchvision Nightly `_) to the path ``ImageSegmentation/app/libs/pytorch_android_torchvision.aar``. Update the `dependencies` part of ``ImageSegmentation/app/build.gradle`` to - -.. code:: gradle - - dependencies { - implementation 'androidx.appcompat:appcompat:1.2.0' - implementation 'androidx.constraintlayout:constraintlayout:2.0.2' - testImplementation 'junit:junit:4.12' - androidTestImplementation 'androidx.test.ext:junit:1.1.2' - androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' - - - implementation(name:'pytorch_android-release', ext:'aar') - implementation(name:'pytorch_android_torchvision', ext:'aar') - - implementation 'com.android.support:appcompat-v7:28.0.0' - implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3' - } - -Update `all projects` part in ``ImageSegmentation/build.gradle`` to - - -.. code:: gradle - - allprojects { - repositories { - google() - jcenter() - flatDir { - dirs 'libs' - } - } - } - - -3. **Test app**: Build and run the `ImageSegmentation` app in Android Studio - - -iOS ---- - -Get ImageSegmentation demo app in iOS: https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation - - -1. **Build libtorch lite for iOS**: - -.. code-block:: bash - - SELECTED_OP_LIST=${path}/deeplabv3_scripted.yaml TRACING_BASED=1 IOS_PLATFORM=SIMULATOR ./scripts/build_ios.sh - - -2. **Remove Cocoapods from the project** (this step is only needed if you ran `pod install`): - - -.. code-block:: bash - - pod deintegrate - - -3. **Link ImageSegmentation demo app with the custom built library**: - -Open your project in XCode, go to your project Target’s **Build Phases - Link Binaries With Libraries**, click the **+** sign and add all the library files located in `build_ios/install/lib`. Navigate to the project **Build Settings**, set the value **Header Search Paths** to `build_ios/install/include` and **Library Search Paths** to `build_ios/install/lib`. 
-In the build settings, search for **other linker flags**. Add a custom linker flag below `-all_load`. -Finally, disable bitcode for your target by selecting the Build Settings, searching for Enable Bitcode, and set the value to **No**. - - -4. **Build and test the app in Xcode.** - - - -Conclusion ----------- - -In this tutorial, we demonstrated a new way to custom build PyTorch's efficient mobile interpreter - tracing-based selective build, in an Android and iOS app. - -We walked through an Image Segmentation example to show how to bundle inputs to a model, generated operator list by tracing the model with bundled input, and build a custom torch library from source with the operator list from tracing result. - -The custom build is still under development, and we will continue improving its size in the future. Note, however, that the APIs are subject to change in future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue here `. - -Learn More - - -- To learn more about PyTorch Mobile, please refer to PyTorch Mobile Home Page - -* To learn more about Image Segmentation, please refer to the Image Segmentation DeepLabV3 on Android Recipe _ + diff --git a/prototype_source/vulkan_workflow.rst b/prototype_source/vulkan_workflow.rst index 7cd3a5c9864..c7fbe34f5cd 100644 --- a/prototype_source/vulkan_workflow.rst +++ b/prototype_source/vulkan_workflow.rst @@ -1,3 +1,6 @@ +.. note:: + PyTorch Vulkan Backend is no longer maintained. Please review the `ExecuTorch Vulkan Delegate `_ implementation instead. + PyTorch Vulkan Backend User Workflow ==================================== @@ -182,7 +185,7 @@ Python API ``.vulkan()`` at the moment of writing of this tutorial is not exposed to Python API, but it is planned to be there. 
Android Java API ---------------- +---------------- For Android API to run model on Vulkan backend we have to specify this during model loading: diff --git a/recipes_source/android_native_app_with_custom_op.rst b/recipes_source/android_native_app_with_custom_op.rst index c03940b21ff..c9dbc093b21 100644 --- a/recipes_source/android_native_app_with_custom_op.rst +++ b/recipes_source/android_native_app_with_custom_op.rst @@ -1,735 +1,10 @@ Making Native Android Application that uses PyTorch prebuilt libraries ====================================================================== -**Author**: `Ivan Kobzarev `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -In this recipe, you will learn: +Redirecting in 3 seconds... - - How to make an Android Application that uses LibTorch API from native code (C++). +.. raw:: html - - How to use within this application TorchScript models with custom operators. - -The full setup of this app you can find in `PyTorch Android Demo Application Repository `_. - - -Setup -~~~~~ - -You will need a Python 3 environment with the following packages (and their dependencies) installed: - -- PyTorch 1.6 - -For Android development, you will need to install: - -- Android NDK - -:: - - wget https://dl.google.com/android/repository/android-ndk-r19c-linux-x86_64.zip - unzip android-ndk-r19c-linux-x86_64.zip - export ANDROID_NDK=$(pwd)/android-ndk-r19c - - -- Android SDK - -:: - - wget https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip - unzip sdk-tools-linux-3859397.zip -d android_sdk - export ANDROID_HOME=$(pwd)/android_sdk - - - -- Gradle 4.10.3 - -Gradle is the most widely used build system for android applications, and we will need it to build our application. Download it and add to the path to use ``gradle`` in the command line. - -.. 
code-block:: shell - - wget https://services.gradle.org/distributions/gradle-4.10.3-bin.zip - unzip gradle-4.10.3-bin.zip - export GRADLE_HOME=$(pwd)/gradle-4.10.3 - export PATH="${GRADLE_HOME}/bin/:${PATH}" - -- JDK - -Gradle requires JDK, you need to install it and set environment variable ``JAVA_HOME`` to point to it. -For example you can install OpenJDK, following `instructions `_. - -- OpenCV SDK for Android - -Our custom operator will be implemented using the OpenCV library. To use it for Android, we need to download OpenCV SDK for Android with prebuilt libraries. -Download from `OpenCV releases page `_. Unzip it and set the environment variable ``OPENCV_ANDROID_SDK`` to it. - - -Preparing TorchScript Model With Custom C++ Operator -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TorchScript allows using custom C++ operators, to read about it with details you can read -`the dedicated tutorial `_. - -As a result, you can script the model that uses custom op, that uses OpenCV ``cv::warpPerspective`` function. - -.. 
code-block:: python - - import torch - import torch.utils.cpp_extension - - print(torch.version.__version__) - op_source = """ - #include - #include - - torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data_ptr()); - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data_ptr()); - - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{64, 64}); - - torch::Tensor output = - torch::from_blob(output_mat.ptr(), /*sizes=*/{64, 64}); - return output.clone(); - } - - static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); - """ - - torch.utils.cpp_extension.load_inline( - name="warp_perspective", - cpp_sources=op_source, - extra_ldflags=["-lopencv_core", "-lopencv_imgproc"], - is_python_module=False, - verbose=True, - ) - - print(torch.ops.my_ops.warp_perspective) - - - @torch.jit.script - def compute(x, y): - if bool(x[0][0] == 42): - z = 5 - else: - z = 10 - x = torch.ops.my_ops.warp_perspective(x, torch.eye(3)) - return x.matmul(y) + z - - - compute.save("compute.pt") - - -This snippet generates ``compute.pt`` file which is TorchScript model that uses custom op ``my_ops.warp_perspective``. - -You need to have installed OpenCV for development to run it. -For Linux systems that can be done using the next commands: -CentOS: - -.. code-block:: shell - - yum install opencv-devel - -Ubuntu: - -.. code-block:: shell - - apt-get install libopencv-dev - -Making Android Application -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -After we succeeded in having ``compute.pt``, we want to use this TorchScript model within Android application. Using general TorchScript models (without custom operators) on Android, using Java API, you can find `here `_. 
We can not use this approach for our case, as our model uses a custom operator(``my_ops.warp_perspective``), default TorchScript execution will fail to find it. - -Registration of ops is not exposed to PyTorch Java API, thus we need to build Android Application with native part (C++) and using LibTorch C++ API to implement and register the same custom operator for Android. As our operator uses the OpenCV library - we will use prebuilt OpenCV Android libraries and use the same functions from OpenCV. - -Let's start creating Android application in ``NativeApp`` folder. - -.. code-block:: shell - - mkdir NativeApp - cd NativeApp - -Android Application Build Setup -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Android Application build consists of the main gradle part and native build CMake part. -All the listings here are the full file listing, that if to recreate the whole structure, -you will be able to build and install the result Android Application without any code additions. - -Gradle Build Setup ------------------- -We will need to add gradle setup files: build.gradle, gradle.properties, settings.gradle. -More about Android Gradle build configurations you can find `here `_. - -``NativeApp/settings.gradle`` - -.. code-block:: gradle - - include ':app' - - -``NativeApp/gradle.properties`` - -.. code-block:: gradle - - android.useAndroidX=true - android.enableJetifier=true - - -``NativeApp/build.gradle`` - -.. code-block:: gradle - - buildscript { - repositories { - google() - jcenter() - } - dependencies { - classpath 'com.android.tools.build:gradle:3.5.0' - } - } - - allprojects { - repositories { - google() - jcenter() - } - } - - -In ``NativeApp/build.gradle`` we specify Android gradle plugin version `3.5.0`. This version is not recent. Still, we use it as PyTorch android gradle builds use this version. - -``NativeApp/settings.gradle`` shows that out project contains only one module - ``app``, which will be our Android Application. - -.. 
code-block:: shell - - mkdir app - cd app - - -``NativeApp/app/build.gradle`` - -.. code-block:: gradle - - apply plugin: 'com.android.application' - - repositories { - jcenter() - maven { - url "https://oss.sonatype.org/content/repositories/snapshots" - } - } - - android { - configurations { - extractForNativeBuild - } - compileSdkVersion 28 - buildToolsVersion "29.0.2" - defaultConfig { - applicationId "org.pytorch.nativeapp" - minSdkVersion 21 - targetSdkVersion 28 - versionCode 1 - versionName "1.0" - externalNativeBuild { - cmake { - arguments "-DANDROID_STL=c++_shared" - } - } - } - buildTypes { - release { - minifyEnabled false - } - } - externalNativeBuild { - cmake { - path "CMakeLists.txt" - } - } - sourceSets { - main { - jniLibs.srcDirs = ['src/main/jniLibs'] - } - } - } - - dependencies { - implementation 'com.android.support:appcompat-v7:28.0.0' - - implementation 'org.pytorch:pytorch_android:1.6.0-SNAPSHOT' - extractForNativeBuild 'org.pytorch:pytorch_android:1.6.0-SNAPSHOT' - } - - task extractAARForNativeBuild { - doLast { - configurations.extractForNativeBuild.files.each { - def file = it.absoluteFile - copy { - from zipTree(file) - into "$buildDir/$file.name" - include "headers/**" - include "jni/**" - } - } - } - } - - tasks.whenTaskAdded { task -> - if (task.name.contains('externalNativeBuild')) { - task.dependsOn(extractAARForNativeBuild) - } - } - -This gradle build script registers dependencies on pytorch_android snapshots, -that are published on nightly channels. - -As they are published to nexus sonatype repository - we need to register that repository: -``https://oss.sonatype.org/content/repositories/snapshots``. - -In our application we need to use LibTorch C++ API in our application native build part. For this, we need access to prebuilt binaries and headers. They are prepacked in PyTorch Android builds, which is published in Maven repositories. 
- -To use PyTorch Android prebuilt libraries from gradle dependencies (which is aar files) - -we should add registration for configuration ``extractForNativeBuild``, -add this configuration in dependencies and put its definition in the end. - -``extractForNativeBuild`` task will call ``extractAARForNativeBuild`` task that unpacks pytorch_android aar -to gradle build directory. - -Pytorch_android aar contains LibTorch headers in ``headers`` folder -and prebuilt libraries for different Android abis in ``jni`` folder: -``$ANDROID_ABI/libpytorch_jni.so``, ``$ANDROID_ABI/libfbjni.so``. -We will use them for our native build. - -The native build is registered in this ``build.gradle`` with lines: - -.. code-block:: gradle - - android { - ... - externalNativeBuild { - cmake { - path "CMakeLists.txt" - } - } - ... - defaultConfig { - externalNativeBuild { - cmake { - arguments "-DANDROID_STL=c++_shared" - } - } - } - -We will use ``CMake`` configuration for a native build. Here we also specify that we will dynamically link with STL, as we have several libraries. More about this, you can find `here `_. - - -Native Build CMake Setup ------------------------- - -The native build will be configured in ``NativeApp/app/CMakeLists.txt``: - -.. 
code-block:: cmake - - cmake_minimum_required(VERSION 3.4.1) - set(TARGET pytorch_nativeapp) - project(${TARGET} CXX) - set(CMAKE_CXX_STANDARD 14) - - set(build_DIR ${CMAKE_SOURCE_DIR}/build) - - set(pytorch_testapp_cpp_DIR ${CMAKE_CURRENT_LIST_DIR}/src/main/cpp) - file(GLOB pytorch_testapp_SOURCES - ${pytorch_testapp_cpp_DIR}/pytorch_nativeapp.cpp - ) - - add_library(${TARGET} SHARED - ${pytorch_testapp_SOURCES} - ) - - file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers") - file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}") - - target_compile_options(${TARGET} PRIVATE - -fexceptions - ) - - set(BUILD_SUBDIR ${ANDROID_ABI}) - - find_library(PYTORCH_LIBRARY pytorch_jni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH) - find_library(FBJNI_LIBRARY fbjni - PATHS ${PYTORCH_LINK_DIRS} - NO_CMAKE_FIND_ROOT_PATH) - - # OpenCV - if(NOT DEFINED ENV{OPENCV_ANDROID_SDK}) - message(FATAL_ERROR "Environment var OPENCV_ANDROID_SDK is not set") - endif() - - set(OPENCV_INCLUDE_DIR "$ENV{OPENCV_ANDROID_SDK}/sdk/native/jni/include") - - target_include_directories(${TARGET} PRIVATE - "${OPENCV_INCLUDE_DIR}" - ${PYTORCH_INCLUDE_DIRS}) - - set(OPENCV_LIB_DIR "$ENV{OPENCV_ANDROID_SDK}/sdk/native/libs/${ANDROID_ABI}") - - find_library(OPENCV_LIBRARY opencv_java4 - PATHS ${OPENCV_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH) - - target_link_libraries(${TARGET} - ${PYTORCH_LIBRARY} - ${FBJNI_LIBRARY} - ${OPENCV_LIBRARY} - log) - -Here we register only one source file ``pytorch_nativeapp.cpp``. - -On the previous step in ``NativeApp/app/build.gradle``, the task ``extractAARForNativeBuild`` extracts headers and native libraries to build directory. We set ``PYTORCH_INCLUDE_DIRS`` and ``PYTORCH_LINK_DIRS`` to them. - -After that, we find libraries ``libpytorch_jni.so`` and ``libfbjni.so`` and add them to the linking of our target. 
- -As we plan to use OpenCV functions to implement our custom operator ``my_ops::warp_perspective`` - we need to link to ``libopencv_java4.so``. It is packaged in OpenCV SDK for Android, that was downloaded on the Setup step. -In this configuration, we find it by environment variable ``OPENCV_ANDROID_SDK``. - -We also link with ``log`` library to be able to log our results to Android LogCat. - -As we link to OpenCV Android SDK's ``libopencv_java4.so``, we should copy it to ``NativeApp/app/src/main/jniLibs/${ANDROID_ABI}`` - -.. code-block:: shell - - cp -R $OPENCV_ANDROID_SDK/sdk/native/libs/* NativeApp/app/src/main/jniLibs/ - - -Adding the model file to the application ----------------------------------------- - -To package the TorschScript model ``compute.pt`` within our application we should copy it to assets folder: - -.. code-block:: shell - - mkdir -p NativeApp/app/src/main/assets - cp compute.pt NativeApp/app/src/main/assets - - -Android Application Manifest ----------------------------- - -Every Android application has a manifest. -Here we specify the application name, package, main activity. - -``NativeApp/app/src/main/AndroidManifest.xml``: - -.. code-block:: default - - - - - - - - - - - - - - - - -Sources -------- - -Java Code ---------- - -Now we are ready to implement our MainActivity in - -``NativeApp/app/src/main/java/org/pytorch/nativeapp/MainActivity.java``: - -.. 
code-block:: java - - package org.pytorch.nativeapp; - - import android.content.Context; - import android.os.Bundle; - import android.util.Log; - import androidx.appcompat.app.AppCompatActivity; - import java.io.File; - import java.io.FileOutputStream; - import java.io.IOException; - import java.io.InputStream; - import java.io.OutputStream; - - public class MainActivity extends AppCompatActivity { - - private static final String TAG = "PyTorchNativeApp"; - - public static String assetFilePath(Context context, String assetName) { - File file = new File(context.getFilesDir(), assetName); - if (file.exists() && file.length() > 0) { - return file.getAbsolutePath(); - } - - try (InputStream is = context.getAssets().open(assetName)) { - try (OutputStream os = new FileOutputStream(file)) { - byte[] buffer = new byte[4 * 1024]; - int read; - while ((read = is.read(buffer)) != -1) { - os.write(buffer, 0, read); - } - os.flush(); - } - return file.getAbsolutePath(); - } catch (IOException e) { - Log.e(TAG, "Error process asset " + assetName + " to file path"); - } - return null; - } - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - final String modelFileAbsoluteFilePath = - new File(assetFilePath(this, "compute.pt")).getAbsolutePath(); - NativeClient.loadAndForwardModel(modelFileAbsoluteFilePath); - } - } - - -In the previous step, when we copied our ``compute.pt`` to ``NativeApp/app/src/main/assets`` that file became an Android application asset, which will be packed in application. Android system provides only stream access to it. -To use this module from LibTorch, we need to materialize it as a file on the disk. ``assetFilePath`` function copies data from the asset input stream, writes it on the disk, and returns absolute file path for it. - -``OnCreate`` method of Activity is called just after Activity creation. 
In this method, we call ``assertFilePath`` and call ``NativeClient`` class that will dispatch it to native code through JNI call. - -``NativeClient`` is a helper class with an internal private class ``NativePeer``, which is responsible for working with the native part of our application. It has a static block that will load ``libpytorch_nativeapp.so``, that is build with ``CMakeLists.txt`` that we added on the previous step. The static block will be executed with the first reference of ``NativePeer`` class. It happens in ``NativeClient#loadAndForwardModel``. - -``NativeApp/app/src/main/java/org/pytorch/nativeapp/NativeClient.java``: - -.. code-block:: java - - package org.pytorch.nativeapp; - - public final class NativeClient { - - public static void loadAndForwardModel(final String modelPath) { - NativePeer.loadAndForwardModel(modelPath); - } - - private static class NativePeer { - static { - System.loadLibrary("pytorch_nativeapp"); - } - - private static native void loadAndForwardModel(final String modelPath); - } - } - -``NativePeer#loadAndForwardModel`` is declared as ``native``, it does not have definition in Java. Call to this method will be re-dispatched through JNI to C++ method in our ``libpytorch_nativeapp.so``, in ``NativeApp/app/src/main/cpp/pytorch_nativeapp.cpp``. - -Native code ------------ - -Now we are ready to write a native part of our application. - -``NativeApp/app/src/main/cpp/pytorch_nativeapp.cpp``: - -.. code-block:: cpp - - #include - #include - #include - #include - #include - #include - #define ALOGI(...) \ - __android_log_print(ANDROID_LOG_INFO, "PyTorchNativeApp", __VA_ARGS__) - #define ALOGE(...) 
\ - __android_log_print(ANDROID_LOG_ERROR, "PyTorchNativeApp", __VA_ARGS__) - - #include "jni.h" - - #include - #include - - namespace pytorch_nativeapp { - namespace { - torch::Tensor warp_perspective(torch::Tensor image, torch::Tensor warp) { - cv::Mat image_mat(/*rows=*/image.size(0), - /*cols=*/image.size(1), - /*type=*/CV_32FC1, - /*data=*/image.data_ptr()); - cv::Mat warp_mat(/*rows=*/warp.size(0), - /*cols=*/warp.size(1), - /*type=*/CV_32FC1, - /*data=*/warp.data_ptr()); - - cv::Mat output_mat; - cv::warpPerspective(image_mat, output_mat, warp_mat, /*dsize=*/{8, 8}); - - torch::Tensor output = - torch::from_blob(output_mat.ptr(), /*sizes=*/{8, 8}); - return output.clone(); - } - - static auto registry = - torch::RegisterOperators("my_ops::warp_perspective", &warp_perspective); - - template void log(const char *m, T t) { - std::ostringstream os; - os << t << std::endl; - ALOGI("%s %s", m, os.str().c_str()); - } - - struct JITCallGuard { - torch::autograd::AutoGradMode no_autograd_guard{false}; - torch::AutoNonVariableTypeMode non_var_guard{true}; - torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard{false}; - }; - } // namespace - - static void loadAndForwardModel(JNIEnv *env, jclass, jstring jModelPath) { - const char *modelPath = env->GetStringUTFChars(jModelPath, 0); - assert(modelPath); - JITCallGuard guard; - torch::jit::Module module = torch::jit::load(modelPath); - module.eval(); - torch::Tensor x = torch::randn({4, 8}); - torch::Tensor y = torch::randn({8, 5}); - log("x:", x); - log("y:", y); - c10::IValue t_out = module.forward({x, y}); - log("result:", t_out); - env->ReleaseStringUTFChars(jModelPath, modelPath); - } - } // namespace pytorch_nativeapp - - JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) { - JNIEnv *env; - if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6) != JNI_OK) { - return JNI_ERR; - } - - jclass c = env->FindClass("org/pytorch/nativeapp/NativeClient$NativePeer"); - if (c == nullptr) { - return JNI_ERR; - } - - static 
const JNINativeMethod methods[] = { - {"loadAndForwardModel", "(Ljava/lang/String;)V", - (void *)pytorch_nativeapp::loadAndForwardModel}, - }; - int rc = env->RegisterNatives(c, methods, - sizeof(methods) / sizeof(JNINativeMethod)); - - if (rc != JNI_OK) { - return rc; - } - - return JNI_VERSION_1_6; - } - - -This listing is quite long, and a few things intermixed here, we will follow control flow to understand how this code works. -The first place where the control flow arrives is ``JNI_OnLoad``. -This function is called after loading the library. It is responsible for registering native method, which is called when ``NativePeer#loadAndForwardModel`` called, here it is ``pytorch_nativeapp::loadAndForwardModel`` function. - -``pytorch_nativeapp::loadAndForwardModel`` takes as an argument model path. -First, we extract its ``const char*`` value and loading the module with ``torch::jit::load``. - -To load TorchScript model for mobile, we need to set these guards, because mobile build doesn't support -features like autograd for smaller build size, placed in ``struct JITCallGuard`` in this example. -It may change in the future. You can track the latest changes keeping an eye on the -`source in PyTorch GitHub `_. - -Implementation of method ``warp_perspective`` and registration of it is entirely the same as -in `tutorial for desktop build `_. - -Building the app ----------------- - -To specify to gradle where is Android SDK and Android NDK, we need to fill ``NativeApp/local.properties``. - -.. code-block:: shell - - cd NativeApp - echo "sdk.dir=$ANDROID_HOME" >> NativeApp/local.properties - echo "ndk.dir=$ANDROID_NDK" >> NativeApp/local.properties - - -To build the result ``apk`` file we run: - -.. code-block:: shell - - cd NativeApp - gradle app:assembleDebug - -To install the app on the connected device: - -.. code-block:: shell - - cd NativeApp - gradle app::installDebug - -After that, you can run the app on the device by clicking on PyTorchNativeApp icon. 
-Or you can do it from the command line: - -.. code-block:: shell - - adb shell am start -n org.pytorch.nativeapp/.MainActivity - -If you check the android logcat: - -.. code-block:: shell - - adb logcat -v brief | grep PyTorchNativeApp - - -You should see logs with tag 'PyTorchNativeApp' that prints x, y, and the result of the model forward, which we print with ``log`` function in ``NativeApp/app/src/main/cpp/pytorch_nativeapp.cpp``. - -:: - - I/PyTorchNativeApp(26968): x: -0.9484 -1.1757 -0.5832 0.9144 0.8867 1.0933 -0.4004 -0.3389 - I/PyTorchNativeApp(26968): -1.0343 1.5200 -0.7625 -1.5724 -1.2073 0.4613 0.2730 -0.6789 - I/PyTorchNativeApp(26968): -0.2247 -1.2790 1.0067 -0.9266 0.6034 -0.1941 0.7021 -1.5368 - I/PyTorchNativeApp(26968): -0.3803 -0.0188 0.2021 -0.7412 -0.2257 0.5044 0.6592 0.0826 - I/PyTorchNativeApp(26968): [ CPUFloatType{4,8} ] - I/PyTorchNativeApp(26968): y: -1.0084 1.8733 0.5435 0.1087 -1.1066 - I/PyTorchNativeApp(26968): -1.9926 1.1047 0.5311 -0.4944 1.9178 - I/PyTorchNativeApp(26968): -1.5451 0.8867 1.0473 -1.7571 0.3909 - I/PyTorchNativeApp(26968): 0.4039 0.5085 -0.2776 0.4080 0.9203 - I/PyTorchNativeApp(26968): 0.3655 1.4395 -1.4467 -0.9837 0.3335 - I/PyTorchNativeApp(26968): -0.0445 0.8039 -0.2512 -1.3122 0.6543 - I/PyTorchNativeApp(26968): -1.5819 0.0525 1.5680 -0.6442 -1.3090 - I/PyTorchNativeApp(26968): -1.6197 -0.0773 -0.5967 -0.1105 -0.3122 - I/PyTorchNativeApp(26968): [ CPUFloatType{8,5} ] - I/PyTorchNativeApp(26968): result: 16.0274 9.0330 6.0124 9.8644 11.0493 - I/PyTorchNativeApp(26968): 8.7633 6.9657 12.3469 10.3159 12.0683 - I/PyTorchNativeApp(26968): 12.4529 9.4559 11.7038 7.8396 6.9716 - I/PyTorchNativeApp(26968): 8.5279 9.1780 11.3849 8.4368 9.1480 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - I/PyTorchNativeApp(26968): 10.0000 10.0000 10.0000 10.0000 10.0000 - 
I/PyTorchNativeApp(26968): [ CPUFloatType{8,5} ] - - - -The full setup of this app you can find in `PyTorch Android Demo Application Repository `_. + diff --git a/recipes_source/bundled_inputs.rst b/recipes_source/bundled_inputs.rst index bae2a67832f..1bdf5c7b7d2 100644 --- a/recipes_source/bundled_inputs.rst +++ b/recipes_source/bundled_inputs.rst @@ -10,8 +10,11 @@ This tutorial introduces the steps to use PyTorch's utility to bundle example or The interface of the model remains unchanged (other than adding a few methods), so it can still be safely deployed to production. The advantage of this standardized interface is that tools that run models can use it instead of having some sort of external file (or worse, document) that tells you how to run the model properly. -Common case, bundling an input to a model that only uses 'forward' for inference +Common case ------------------- + +One of the common cases—bundling an input to a model that only uses 'forward' for inference. + 1. **Prepare model**: Convert your model to TorchScript through either tracing or scripting .. code:: python @@ -52,8 +55,11 @@ Common case, bundling an input to a model that only uses 'forward' for inference print(bundled_model(*sample_inputs[0])) -Uncommon case, bundling and retrieving inputs for functions beyond 'forward' -------------------- +Uncommon case +-------------- + +An uncommon case would be bundling and retrieving inputs for functions beyond 'forward'. + 1. **Prepare model**: Convert your model to TorchScript through either tracing or scripting .. 
code:: python diff --git a/recipes_source/compiling_optimizer.rst b/recipes_source/compiling_optimizer.rst new file mode 100644 index 00000000000..951495ca4fa --- /dev/null +++ b/recipes_source/compiling_optimizer.rst @@ -0,0 +1,94 @@ +(beta) Compiling the optimizer with torch.compile +========================================================================================== + +**Author:** `Michael Lazos `_ + +The optimizer is a key algorithm for training any deep learning model. +Since it is responsible for updating every model parameter, it can often +become the bottleneck in training performance for large models. In this recipe, +we will apply ``torch.compile`` to the optimizer to observe the GPU performance +improvement. + +.. note:: + + This tutorial requires PyTorch 2.2.0 or later. + +Model Setup +~~~~~~~~~~~~~~~~~~~~~ +For this example, we'll use a simple sequence of linear layers. +Since we are only benchmarking the optimizer, the choice of model doesn't matter +because optimizer performance is a function of the number of parameters. + +Depending on what machine you are using, your exact results may vary. + +.. code-block:: python + + import torch + + model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] + ) + input = torch.rand(1024, device="cuda") + output = model(input) + output.sum().backward() + +Setting up and running the optimizer benchmark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In this example, we'll use the Adam optimizer +and create a helper function to wrap the step() +in ``torch.compile()``. + +.. note:: + + ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + +.. 
code-block:: python + + # exit cleanly if we are on a device that doesn't support torch.compile + if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + + + opt = torch.optim.Adam(model.parameters(), lr=0.01) + + + @torch.compile(fullgraph=False) + def fn(): + opt.step() + + + # Let's define a helpful benchmarking function: + import torch.utils.benchmark as benchmark + + + def benchmark_torch_function_in_microseconds(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + return t0.blocked_autorange().mean * 1e6 + + + # Warmup runs to compile the function + for _ in range(5): + fn() + + eager_runtime = benchmark_torch_function_in_microseconds(opt.step) + compiled_runtime = benchmark_torch_function_in_microseconds(fn) + + assert eager_runtime > compiled_runtime + + print(f"eager runtime: {eager_runtime}us") + print(f"compiled runtime: {compiled_runtime}us") + +Sample Results: + +* Eager runtime: 747.2437149845064us +* Compiled runtime: 392.07384741178us + +See Also +~~~~~~~~~ + +* For an in-depth technical overview, see +`Compiling the optimizer with PT2 `__ diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py new file mode 100644 index 00000000000..c0402729403 --- /dev/null +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -0,0 +1,117 @@ +""" +(beta) Running the compiled optimizer with an LR Scheduler +============================================================ + +**Author:** `Michael Lazos `_ +""" + +######################################################### +# The optimizer is a key algorithm for training any deep learning model. +# In this example, we will show how to pair the optimizer, which has been compiled using ``torch.compile``, +# with the LR schedulers to accelerate training convergence. +# +# .. 
note:: +# +# This tutorial requires PyTorch 2.3.0 or later. + +##################################################################### +# Model Setup +# ~~~~~~~~~~~~~~~~~~~~~ +# For this example, we'll use a simple sequence of linear layers. +# + +import torch + +# Create simple model +model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] +) +input = torch.rand(1024, device="cuda") + +# run forward pass +output = model(input) + +# run backward to populate the grads for our optimizer below +output.sum().backward() + + +##################################################################### +# Setting up and running the compiled optimizer with LR Scheduler +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In this section, we'll use the Adam optimizer with LinearLR Scheduler +# and create a helper function to wrap the ``step()`` call for each of them +# in ``torch.compile()``. +# +# .. note:: +# +# ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. + + +# exit cleanly if we are on a device that doesn't support ``torch.compile`` +if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + +# !!! IMPORTANT !!! Wrap the lr in a Tensor if we are pairing +# the optimizer with an LR Scheduler. +# Without this, torch.compile will recompile as the value of the LR +# changes. +opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) +sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + sched.step() + + +# Warmup runs to compile the function +for _ in range(5): + fn() + print(opt.param_groups[0]["lr"]) + + +###################################################################### +# Extension: What happens with a non-tensor LR?
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the +# LR in a tensor. + +# No longer wrap the LR in a tensor here +opt = torch.optim.Adam(model.parameters(), lr=0.01) +sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + sched.step() + +# Setup logging to view recompiles +torch._logging.set_logs(recompiles=True) + +# Warmup runs to compile the function +# We will now recompile on each iteration +# as the value of the lr is mutated. +for _ in range(5): + fn() + + +###################################################################### +# With this example, we can see that we recompile the optimizer a few times +# due to the guard failure on the ``lr`` in ``param_groups[0]``. + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# +# In this tutorial we showed how to pair the optimizer compiled with ``torch.compile`` +# with an LR Scheduler to accelerate training convergence. We used a model consisting +# of a simple sequence of linear layers with the Adam optimizer paired +# with a LinearLR scheduler to demonstrate the LR changing across iterations. +# +# See also: +# +# * `Compiled optimizer tutorial `__ - an intro into the compiled optimizer. +# * `Compiling the optimizer with PT2 `__ - deeper technical details on the compiled optimizer. 
diff --git a/recipes_source/distributed_async_checkpoint_recipe.rst b/recipes_source/distributed_async_checkpoint_recipe.rst new file mode 100644 index 00000000000..a7194f6c589 --- /dev/null +++ b/recipes_source/distributed_async_checkpoint_recipe.rst @@ -0,0 +1,291 @@ +Asynchronous Saving with Distributed Checkpoint (DCP) +===================================================== + +**Author:** `Lucas Pasqualin `__, `Iris Zhang `__, `Rodrigo Kumpera `__, `Chien-Chin Huang `__ + +Checkpointing is often a bottleneck in the critical path for distributed training workloads, incurring larger and larger costs as both model and world sizes grow. +One excellent strategy for offsetting this cost is to checkpoint in parallel, asynchronously. Below, we expand the save example +from the `Getting Started with Distributed Checkpoint Tutorial `__ +to show how this can be integrated quite easily with ``torch.distributed.checkpoint.async_save``. + + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How to use DCP to generate checkpoints in parallel + * Effective strategies to optimize performance + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch v2.4.0 or later + * `Getting Started with Distributed Checkpoint Tutorial `__ + + +Asynchronous Checkpointing Overview +------------------------------------ +Before getting started with Asynchronous Checkpointing, it's important to understand its differences and limitations as compared to synchronous checkpointing. +Specifically: + +* Memory requirements - Asynchronous checkpointing works by first copying models into internal CPU-buffers. + This is helpful since it ensures model and optimizer weights are not changing while the model is still checkpointing, + but does raise CPU memory by a factor of ``checkpoint_size_per_rank X number_of_ranks``.
Additionally, users should take care to understand + the memory constraints of their systems. Specifically, pinned memory implies the usage of ``page-lock`` memory, which can be scarce as compared to + ``pageable`` memory. + +* Checkpoint Management - Since checkpointing is asynchronous, it is up to the user to manage concurrently run checkpoints. In general, users can + employ their own management strategies by handling the future object returned from ``async_save``. For most users, we recommend limiting + checkpoints to one asynchronous request at a time, avoiding additional memory pressure per request. + + + +.. code-block:: python + + import os + + import torch + import torch.distributed as dist + import torch.distributed.checkpoint as dcp + import torch.multiprocessing as mp + import torch.nn as nn + + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict + from torch.distributed.checkpoint.stateful import Stateful + from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType + + CHECKPOINT_DIR = "checkpoint" + + + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_state_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to handle calling distributed state dict methods on the model + and optimizer.
+ """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + + class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(16, 16) + self.relu = nn.ReLU() + self.net2 = nn.Linear(16, 8) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + + def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355 " + + # initialize the process group + dist.init_process_group("nccl", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + + def cleanup(): + dist.destroy_process_group() + + + def run_fsdp_checkpoint_save_example(rank, world_size): + print(f"Running basic FSDP checkpoint saving example on rank {rank}.") + setup(rank, world_size) + + # create a model and move it to GPU with id rank + model = ToyModel().to(rank) + model = FSDP(model) + + loss_fn = nn.MSELoss() + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + + checkpoint_future = None + for step in range(10): + optimizer.zero_grad() + model(torch.rand(8, 16, device="cuda")).sum().backward() + optimizer.step() + + # waits for checkpointing to finish if one exists, avoiding queuing more then one checkpoint request at a time + if checkpoint_future is not None: + checkpoint_future.result() + + state_dict = { "app": AppState(model, optimizer) } + checkpoint_future = dcp.async_save(state_dict, 
checkpoint_id=f"{CHECKPOINT_DIR}_step{step}") + + cleanup() + + + if __name__ == "__main__": + world_size = torch.cuda.device_count() + print(f"Running async checkpoint example on {world_size} devices.") + mp.spawn( + run_fsdp_checkpoint_save_example, + args=(world_size,), + nprocs=world_size, + join=True, + ) + + +Even more performance with Pinned Memory +----------------------------------------- +If the above optimization is still not performant enough, you can take advantage of an additional optimization for GPU models which utilizes a pinned memory buffer for checkpoint staging. +Specifically, this optimization attacks the main overhead of asynchronous checkpointing, which is the in-memory copying to checkpointing buffers. By maintaining a pinned memory buffer between +checkpoint requests users can take advantage of direct memory access to speed up this copy. + +.. note:: + The main drawback of this optimization is the persistence of the buffer in between checkpointing steps. Without + the pinned memory optimization (as demonstrated above), any checkpointing buffers are released as soon as + checkpointing is finished. With the pinned memory implementation, this buffer is maintained between steps, + leading to the same + peak memory pressure being sustained through the application life. + + +.. 
code-block:: python + + import os + + import torch + import torch.distributed as dist + import torch.distributed.checkpoint as dcp + import torch.multiprocessing as mp + import torch.nn as nn + + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict + from torch.distributed.checkpoint.stateful import Stateful + from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType + from torch.distributed.checkpoint import StorageWriter + + CHECKPOINT_DIR = "checkpoint" + + + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_stat_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to hande calling distributed state dict methods on the model + and optimizer. + """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(model, optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + + class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(16, 16) + self.relu = nn.ReLU() + self.net2 = nn.Linear(16, 8) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + + def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355 " + + # initialize the process 
group + dist.init_process_group("nccl", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + + def cleanup(): + dist.destroy_process_group() + + + def run_fsdp_checkpoint_save_example(rank, world_size): + print(f"Running basic FSDP checkpoint saving example on rank {rank}.") + setup(rank, world_size) + + # create a model and move it to GPU with id rank + model = ToyModel().to(rank) + model = FSDP(model) + + loss_fn = nn.MSELoss() + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + + # The storage writer defines our 'staging' strategy, where staging is considered the process of copying + # checkpoints to in-memory buffers. By setting `cached_state_dict=True`, we enable efficient memory copying + # into a persistent buffer with pinned memory enabled. + # Note: It's important that the writer persists in between checkpointing requests, since it maintains the + # pinned memory buffer. + writer = StorageWriter(cached_state_dict=True) + checkpoint_future = None + for step in range(10): + optimizer.zero_grad() + model(torch.rand(8, 16, device="cuda")).sum().backward() + optimizer.step() + + state_dict = { "app": AppState(model, optimizer) } + if checkpoint_future is not None: + # waits for checkpointing to finish, avoiding queuing more then one checkpoint request at a time + checkpoint_future.result() + dcp.async_save(state_dict, storage_writer=writer, checkpoint_id=f"{CHECKPOINT_DIR}_step{step}") + + cleanup() + + + if __name__ == "__main__": + world_size = torch.cuda.device_count() + print(f"Running fsdp checkpoint example on {world_size} devices.") + mp.spawn( + run_fsdp_checkpoint_save_example, + args=(world_size,), + nprocs=world_size, + join=True, + ) + + +Conclusion +---------- +In conclusion, we have learned how to use DCP's :func:`async_save` API to generate checkpoints off the critical training path. 
We've also learned about the +additional memory and concurrency overhead introduced by using this API, as well as additional optimizations which utilize pinned memory to speed things up +even further. + +- `Saving and loading models tutorial `__ +- `Getting started with FullyShardedDataParallel tutorial `__ diff --git a/recipes_source/distributed_checkpoint_recipe.rst b/recipes_source/distributed_checkpoint_recipe.rst index c20dca6639c..950839966c5 100644 --- a/recipes_source/distributed_checkpoint_recipe.rst +++ b/recipes_source/distributed_checkpoint_recipe.rst @@ -1,7 +1,7 @@ Getting Started with Distributed Checkpoint (DCP) ===================================================== -**Author**: `Iris Zhang `__, `Rodrigo Kumpera `__, `Chien-Chin Huang `__ +**Author**: `Iris Zhang `__, `Rodrigo Kumpera `__, `Chien-Chin Huang `__, `Lucas Pasqualin `__ .. note:: |edit| View and edit this tutorial in `github `__. @@ -22,13 +22,18 @@ In this tutorial, we show how to use DCP APIs with a simple FSDP wrapped model. How DCP works -------------- -:func:`torch.distributed.checkpoint` enables saving and loading models from multiple ranks in parallel. -In addition, checkpointing automatically handles fully-qualified-name (FQN) mappings across models and optimizers, enabling load-time resharding across differing cluster topologies. +:func:`torch.distributed.checkpoint` enables saving and loading models from multiple ranks in parallel. You can use this module to save on any number of ranks in parallel, +and then re-shard across differing cluster topologies at load time. + +Additionally, through the use of modules in :func:`torch.distributed.checkpoint.state_dict`, +DCP offers support for gracefully handling ``state_dict`` generation and loading in distributed settings. +This includes managing fully-qualified-name (FQN) mappings across models and optimizers, and setting default parameters for PyTorch provided parallelisms.
DCP is different from :func:`torch.save` and :func:`torch.load` in a few significant ways: * It produces multiple files per checkpoint, with at least one per rank. * It operates in place, meaning that the model should allocate its data first and DCP uses that storage instead. +* DCP offers special handling of Stateful objects (formally defined in `torch.distributed.checkpoint.stateful`), automatically calling both `state_dict` and `load_state_dict` methods if they are defined. .. note:: The code in this tutorial runs on an 8-GPU server, but it can be easily @@ -42,7 +47,7 @@ Here we use a toy model wrapped with FSDP for demonstration purposes. Similarly, Saving ~~~~~~ -Now, let’s create a toy module, wrap it with FSDP, feed it with some dummy input data, and save it. +Now, let's create a toy module, wrap it with FSDP, feed it with some dummy input data, and save it. .. code-block:: python @@ -50,16 +55,48 @@ Now, let’s create a toy module, wrap it with FSDP, feed it with some dummy inp import torch import torch.distributed as dist - import torch.distributed.checkpoint as DCP + import torch.distributed.checkpoint as dcp import torch.multiprocessing as mp import torch.nn as nn from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict + from torch.distributed.checkpoint.stateful import Stateful from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType CHECKPOINT_DIR = "checkpoint" + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_stat_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to hande calling distributed state dict methods on the model + and optimizer. 
+ """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + class ToyModel(nn.Module): def __init__(self): super(ToyModel, self).__init__() @@ -99,20 +136,8 @@ Now, let’s create a toy module, wrap it with FSDP, feed it with some dummy inp model(torch.rand(8, 16, device="cuda")).sum().backward() optimizer.step() - # set FSDP StateDictType to SHARDED_STATE_DICT so we can use DCP to checkpoint sharded model state dict - # note that we do not support FSDP StateDictType.LOCAL_STATE_DICT - FSDP.set_state_dict_type( - model, - StateDictType.SHARDED_STATE_DICT, - ) - state_dict = { - "model": model.state_dict(), - } - - DCP.save_state_dict( - state_dict=state_dict, - storage_writer=DCP.FileSystemWriter(CHECKPOINT_DIR), - ) + state_dict = { "app": AppState(model, optimizer) } + dcp.save(state_dict, checkpoint_id=CHECKPOINT_DIR) cleanup() @@ -152,16 +177,47 @@ The reason that we need the ``state_dict`` prior to loading is: import torch import torch.distributed as dist - import torch.distributed.checkpoint as DCP + import torch.distributed.checkpoint as dcp + from torch.distributed.checkpoint.stateful import Stateful + from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict import torch.multiprocessing as mp import torch.nn as nn from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp.fully_sharded_data_parallel 
import StateDictType CHECKPOINT_DIR = "checkpoint" + class AppState(Stateful): + """This is a useful wrapper for checkpointing the Application State. Since this object is compliant + with the Stateful protocol, DCP will automatically call state_dict/load_stat_dict as needed in the + dcp.save/load APIs. + + Note: We take advantage of this wrapper to hande calling distributed state dict methods on the model + and optimizer. + """ + + def __init__(self, model, optimizer=None): + self.model = model + self.optimizer = optimizer + + def state_dict(self): + # this line automatically manages FSDP FQN's, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT + model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer) + return { + "model": model_state_dict, + "optim": optimizer_state_dict + } + + def load_state_dict(self, state_dict): + # sets our state dicts on the model and optimizer, now that we've loaded + set_state_dict( + self.model, + self.optimizer, + model_state_dict=state_dict["model"], + optim_state_dict=state_dict["optim"] + ) + class ToyModel(nn.Module): def __init__(self): super(ToyModel, self).__init__() @@ -194,21 +250,13 @@ The reason that we need the ``state_dict`` prior to loading is: model = ToyModel().to(rank) model = FSDP(model) - FSDP.set_state_dict_type( - model, - StateDictType.SHARDED_STATE_DICT, - ) - # different from ``torch.load()``, DCP requires model state_dict prior to loading to get - # the allocated storage and sharding information. 
- state_dict = { - "model": model.state_dict(), - } + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) - DCP.load_state_dict( + state_dict = { "app": AppState(model, optimizer)} + dcp.load( state_dict=state_dict, - storage_reader=DCP.FileSystemReader(CHECKPOINT_DIR), + checkpoint_id=CHECKPOINT_DIR, ) - model.load_state_dict(state_dict["model"]) cleanup() @@ -224,7 +272,8 @@ The reason that we need the ``state_dict`` prior to loading is: ) If you would like to load the saved checkpoint into a non-FSDP wrapped model in a non-distributed setup, perhaps for inference, you can also do that with DCP. -By default, DCP saves and loads a distributed ``state_dict`` in Single Program Multiple Data(SPMD) style. To load without a distributed setup, please set ``no_dist`` to ``True`` when loading with DCP. +By default, DCP saves and loads a distributed ``state_dict`` in Single Program Multiple Data(SPMD) style. However if no process group is initialized, DCP infers +the intent is to save or load in "non-distributed" style, meaning entirely in the current process. .. note:: Distributed checkpoint support for Multi-Program Multi-Data is still under development. @@ -234,7 +283,7 @@ By default, DCP saves and loads a distributed ``state_dict`` in Single Program M import os import torch - import torch.distributed.checkpoint as DCP + import torch.distributed.checkpoint as dcp import torch.nn as nn @@ -259,11 +308,10 @@ By default, DCP saves and loads a distributed ``state_dict`` in Single Program M "model": model.state_dict(), } - # turn no_dist to be true to load in non-distributed setting - DCP.load_state_dict( + # since no progress group is initialized, DCP will disable any collectives. 
+ dcp.load( state_dict=state_dict, - storage_reader=DCP.FileSystemReader(CHECKPOINT_DIR), - no_dist=True, + checkpoint_id=CHECKPOINT_DIR, ) model.load_state_dict(state_dict["model"]) @@ -272,9 +320,47 @@ By default, DCP saves and loads a distributed ``state_dict`` in Single Program M run_checkpoint_load_example() +Formats +---------- +One drawback not yet mentioned is that DCP saves checkpoints in a format which is inherently different from those generated using torch.save. +This can be an issue when users wish to share models with users used to the torch.save format, or in general just want to add format flexibility +to their applications. For this case, we provide the ``format_utils`` module in ``torch.distributed.checkpoint.format_utils``. + +A command line utility is provided for the user's convenience, which follows the following format: + +.. code-block:: bash + + python -m torch.distributed.checkpoint.format_utils + +In the command above, ``mode`` is one of ``torch_to_dcp`` or ``dcp_to_torch``. + + +Alternatively, methods are also provided for users who may wish to convert checkpoints directly. + +.. code-block:: python + + import os + + import torch + import torch.distributed.checkpoint as DCP + from torch.distributed.checkpoint.format_utils import dcp_to_torch_save, torch_save_to_dcp + + CHECKPOINT_DIR = "checkpoint" + TORCH_SAVE_CHECKPOINT_DIR = "torch_save_checkpoint.pth" + + # convert dcp model to torch.save (assumes checkpoint was generated as above) + dcp_to_torch_save(CHECKPOINT_DIR, TORCH_SAVE_CHECKPOINT_DIR) + + # converts the torch.save model back to DCP + dcp_to_torch_save(TORCH_SAVE_CHECKPOINT_DIR, f"{CHECKPOINT_DIR}_new") + + + Conclusion ---------- -In conclusion, we have learned how to use DCP's :func:`save_state_dict` and :func:`load_state_dict` APIs, as well as how they are different form :func:`torch.save` and :func:`torch.load`.
+In conclusion, we have learned how to use DCP's :func:`save` and :func:`load` APIs, as well as how they are different from :func:`torch.save` and :func:`torch.load`. +Additionally, we've learned how to use :func:`get_state_dict` and :func:`set_state_dict` to automatically manage parallelism-specific FQN's and defaults during state dict +generation and loading. For more information, please see the following: diff --git a/recipes_source/distributed_comm_debug_mode.rst b/recipes_source/distributed_comm_debug_mode.rst new file mode 100644 index 00000000000..dc1a6e3e565 --- /dev/null +++ b/recipes_source/distributed_comm_debug_mode.rst @@ -0,0 +1,210 @@ +Getting Started with ``CommDebugMode`` +===================================================== + +**Author**: `Anshul Sinha `__ + + +In this tutorial, we will explore how to use ``CommDebugMode`` with PyTorch's +DistributedTensor (DTensor) for debugging by tracking collective operations in distributed training environments. + +Prerequisites +--------------------- + +* Python 3.8 - 3.11 +* PyTorch 2.2 or later + + +What is ``CommDebugMode`` and why is it useful +---------------------------------------------------- +As the size of models continues to increase, users are seeking to leverage various combinations +of parallel strategies to scale up distributed training. However, the lack of interoperability +between existing solutions poses a significant challenge, primarily due to the absence of a +unified abstraction that can bridge these different parallelism strategies. To address this +issue, PyTorch has proposed `DistributedTensor(DTensor) +`_ +which abstracts away the complexities of tensor communication in distributed training, +providing a seamless user experience.
However, when dealing with existing parallelism solutions and +developing parallelism solutions using the unified abstraction like DTensor, the lack of transparency +about what and when the collective communications happens under the hood could make it challenging +for advanced users to identify and resolve issues. To address this challenge, ``CommDebugMode``, a +Python context manager will serve as one of the primary debugging tools for DTensors, enabling +users to view when and why collective operations are happening when using DTensors, effectively +addressing this issue. + + +Using ``CommDebugMode`` +------------------------ + +Here is how you can use ``CommDebugMode``: + +.. code-block:: python + + # The model used in this example is a MLPModule applying Tensor Parallel + comm_mode = CommDebugMode() + with comm_mode: + output = model(inp) + + # print the operation level collective tracing information + print(comm_mode.generate_comm_debug_tracing_table(noise_level=0)) + + # log the operation level collective tracing information to a file + comm_mode.log_comm_debug_tracing_table_to_file( + noise_level=1, file_name="transformer_operation_log.txt" + ) + + # dump the operation level collective tracing information to json file, + # used in the visual browser below + comm_mode.generate_json_dump(noise_level=2) + +This is what the output looks like for a MLPModule at noise level 0: + +.. code-block:: python + + Expected Output: + Global + FORWARD PASS + *c10d_functional.all_reduce: 1 + MLPModule + FORWARD PASS + *c10d_functional.all_reduce: 1 + MLPModule.net1 + MLPModule.relu + MLPModule.net2 + FORWARD PASS + *c10d_functional.all_reduce: 1 + +To use ``CommDebugMode``, you must wrap the code running the model in ``CommDebugMode`` and call the API that +you want to use to display the data. You can also use a ``noise_level`` argument to control the verbosity +level of displayed information. Here is what each noise level displays: + +| 0. 
Prints module-level collective counts +| 1. Prints DTensor operations (not including trivial operations), module sharding information +| 2. Prints tensor operations (not including trivial operations) +| 3. Prints all operations + +In the example above, you can see that the collective operation, all_reduce, occurs once in the forward pass +of the ``MLPModule``. Furthermore, you can use ``CommDebugMode`` to pinpoint that the all-reduce operation happens +in the second linear layer of the ``MLPModule``. + + +Below is the interactive module tree visualization that you can use to upload your own JSON dump: + +.. raw:: html + + + + + + + CommDebugMode Module Tree + + + +
    +
    + Drag file here +
    + +
    +
    + + + + +Conclusion +------------------------------------------ + +In this recipe, we have learned how to use ``CommDebugMode`` to debug Distributed Tensors and +parallelism solutions that uses communication collectives with PyTorch. You can use your own +JSON outputs in the embedded visual browser. + +For more detailed information about ``CommDebugMode``, see +`comm_mode_features_example.py +`_ diff --git a/recipes_source/distributed_device_mesh.rst b/recipes_source/distributed_device_mesh.rst new file mode 100644 index 00000000000..17ce3c4859e --- /dev/null +++ b/recipes_source/distributed_device_mesh.rst @@ -0,0 +1,179 @@ +Getting Started with DeviceMesh +===================================================== + +**Author**: `Iris Zhang `__, `Wanchao Liang `__ + +.. note:: + |edit| View and edit this tutorial in `github `__. + +Prerequisites: + +- `Distributed Communication Package - torch.distributed `__ +- Python 3.8 - 3.11 +- PyTorch 2.2 + + +Setting up distributed communicators, i.e. NVIDIA Collective Communication Library (NCCL) communicators, for distributed training can pose a significant challenge. For workloads where users need to compose different parallelisms, +users would need to manually set up and manage NCCL communicators (for example, :class:`ProcessGroup`) for each parallelism solution. This process could be complicated and susceptible to errors. +:class:`DeviceMesh` can simplify this process, making it more manageable and less prone to errors. + +What is DeviceMesh +------------------ +:class:`DeviceMesh` is a higher level abstraction that manages :class:`ProcessGroup`. It allows users to effortlessly +create inter-node and intra-node process groups without worrying about how to set up ranks correctly for different sub process groups. +Users can also easily manage the underlying process_groups/devices for multi-dimensional parallelism via :class:`DeviceMesh`. + +.. 
figure:: /_static/img/distributed/device_mesh.png + :width: 100% + :align: center + :alt: PyTorch DeviceMesh + +Why DeviceMesh is Useful +------------------------ +DeviceMesh is useful when working with multi-dimensional parallelism (i.e. 3-D parallel) where parallelism composability is required. For example, when your parallelism solutions require both communication across hosts and within each host. +The image above shows that we can create a 2D mesh that connects the devices within each host, and connects each device with its counterpart on the other hosts in a homogenous setup. + +Without DeviceMesh, users would need to manually set up NCCL communicators, cuda devices on each process before applying any parallelism, which could be quite complicated. +The following code snippet illustrates a hybrid sharding 2-D Parallel pattern setup without :class:`DeviceMesh`. +First, we need to manually calculate the shard group and replicate group. Then, we need to assign the correct shard and +replicate group to each rank. + +.. code-block:: python + + import os + + import torch + import torch.distributed as dist + + # Understand world topology + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + print(f"Running example on {rank=} in a world with {world_size=}") + + # Create process groups to manage 2-D like parallel pattern + dist.init_process_group("nccl") + torch.cuda.set_device(rank) + + # Create shard groups (e.g. 
(0, 1, 2, 3), (4, 5, 6, 7)) + # and assign the correct shard group to each rank + num_node_devices = torch.cuda.device_count() + shard_rank_lists = list(range(0, num_node_devices // 2)), list(range(num_node_devices // 2, num_node_devices)) + shard_groups = ( + dist.new_group(shard_rank_lists[0]), + dist.new_group(shard_rank_lists[1]), + ) + current_shard_group = ( + shard_groups[0] if rank in shard_rank_lists[0] else shard_groups[1] + ) + + # Create replicate groups (for example, (0, 4), (1, 5), (2, 6), (3, 7)) + # and assign the correct replicate group to each rank + current_replicate_group = None + shard_factor = len(shard_rank_lists[0]) + for i in range(num_node_devices // 2): + replicate_group_ranks = list(range(i, num_node_devices, shard_factor)) + replicate_group = dist.new_group(replicate_group_ranks) + if rank in replicate_group_ranks: + current_replicate_group = replicate_group + +To run the above code snippet, we can leverage PyTorch Elastic. Let's create a file named ``2d_setup.py``. +Then, run the following `torch elastic/torchrun `__ command. + +.. code-block:: python + + torchrun --nproc_per_node=8 --rdzv_id=100 --rdzv_endpoint=localhost:29400 2d_setup.py + +.. note:: + For simplicity of demonstration, we are simulating 2D parallel using only one node. Note that this code snippet can also be used when running on multi hosts setup. + +With the help of :func:`init_device_mesh`, we can accomplish the above 2D setup in just two lines, and we can still +access the underlying :class:`ProcessGroup` if needed. + + +.. code-block:: python + + from torch.distributed.device_mesh import init_device_mesh + mesh_2d = init_device_mesh("cuda", (2, 4), mesh_dim_names=("replicate", "shard")) + + # Users can access the underlying process group thru `get_group` API. + replicate_group = mesh_2d.get_group(mesh_dim="replicate") + shard_group = mesh_2d.get_group(mesh_dim="shard") + +Let's create a file named ``2d_setup_with_device_mesh.py``. 
+Then, run the following `torch elastic/torchrun `__ command. + +.. code-block:: python + + torchrun --nproc_per_node=8 2d_setup_with_device_mesh.py + + +How to use DeviceMesh with HSDP +------------------------------- + +Hybrid Sharding Data Parallel (HSDP) is a 2D strategy to perform FSDP within a host and DDP across hosts. + +Let's see an example of how DeviceMesh can assist with applying HSDP to your model with a simple setup. With DeviceMesh, +users would not need to manually create and manage shard group and replicate group. + +.. code-block:: python + + import torch + import torch.nn as nn + + from torch.distributed.device_mesh import init_device_mesh + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy + + + class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + + # HSDP: MeshShape(2, 4) + mesh_2d = init_device_mesh("cuda", (2, 4)) + model = FSDP( + ToyModel(), device_mesh=mesh_2d, sharding_strategy=ShardingStrategy.HYBRID_SHARD + ) + +Let's create a file named ``hsdp.py``. +Then, run the following `torch elastic/torchrun `__ command. + +.. code-block:: python + + torchrun --nproc_per_node=8 hsdp.py + +How to use DeviceMesh for your custom parallel solutions +-------------------------------------------------------- +When working with large scale training, you might have more complex custom parallel training composition. For example, you may need to slice out submeshes for different parallelism solutions. +DeviceMesh allows users to slice child mesh from the parent mesh and re-use the NCCL communicators already created when the parent mesh is initialized. + +..
code-block:: python + + from torch.distributed.device_mesh import init_device_mesh + mesh_3d = init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("replicate", "shard", "tp")) + + # Users can slice child meshes from the parent mesh. + hsdp_mesh = mesh_3d["replicate", "shard"] + tp_mesh = mesh_3d["tp"] + + # Users can access the underlying process group through the `get_group` API. + replicate_group = hsdp_mesh["replicate"].get_group() + shard_group = hsdp_mesh["shard"].get_group() + tp_group = tp_mesh.get_group() + + +Conclusion +---------- +In conclusion, we have learned about :class:`DeviceMesh` and :func:`init_device_mesh`, as well as how +they can be used to describe the layout of devices across the cluster. + +For more information, please see the following: + +- `2D parallel combining Tensor/Sequence Parallel with FSDP `__ +- `Composable PyTorch Distributed with PT2 `__ diff --git a/recipes_source/distributed_optim_torchscript.rst b/recipes_source/distributed_optim_torchscript.rst index c5bac179f61..2e68f035170 100644 --- a/recipes_source/distributed_optim_torchscript.rst +++ b/recipes_source/distributed_optim_torchscript.rst @@ -1,8 +1,7 @@ Distributed Optimizer with TorchScript support ============================================================== -.. note:: Distributed Optimizer with TorchScript support is introduced in PyTorch 1.8 - as a beta feature. This API is subject to change. +.. warning:: TorchScript is no longer in active development.
In this recipe, you will learn: diff --git a/recipes_source/inference_tuning_on_aws_graviton.rst b/recipes_source/inference_tuning_on_aws_graviton.rst new file mode 100644 index 00000000000..08d3515ce9a --- /dev/null +++ b/recipes_source/inference_tuning_on_aws_graviton.rst @@ -0,0 +1,368 @@ +(Beta) PyTorch Inference Performance Tuning on AWS Graviton Processors +====================================================================== + +**Author**: `Sunita Nadampalli `_ + +`AWS Graviton `_ is a series of ARM-based processors designed by AWS. AWS Graviton3 processors are optimized for Machine Learning (ML) workloads, including support for ``bfloat16``, Scalable Vector Extension (SVE) and twice the Single Instruction Multiple Data (SIMD) bandwidth compared to Graviton2. + +PyTorch provides native reference ATen kernels for the machine learning operators like convolutions, matmul, relu, etc. These operators can be accelerated with platform specific kernel implementations from Basic Linear Algebra Subprograms (BLAS) libraries. On AWS Graviton CPUs, MKLDNN with Arm Compute Library (`ACL `_) and `OpenBLAS `_ libraries provide optimized implementations for a subset of the operators. Both these libraries are integrated into PyTorch with PyTorch 2.0 version. + +In this tutorial we will cover how to achieve the best inference performance for a linear layer neural network on AWS Graviton3 CPUs (`AWS c7g instance `_) with ``bfloat16`` kernels and with the right backend selection. + +Contents +-------- +1. Basic Usage +2. Speed up inference with Bfloat16 fast math kernels +3. Improve inference performance with OpenBLAS for smaller batch dimensions +4. Optimize memory allocation overhead with Linux Transparent huge pages +5. Conclusion + +.. note:: + To successfully run this tutorial and reproduce the speedup numbers shown below, you need an instance from the Graviton3 family (``c7g/r7g/m7g``) of hardware. For this tutorial, we used the `c7g.xl (4vcpu) instance `_ .
+ +Basic Usage +--------------- + +PyTorch natively supports AWS Graviton3 optimizations starting with PyTorch 2.0 version. +Please refer to this `blog `_ for more details on the optimizations. + +1. Install PyTorch by running the following command: + + .. code-block:: + + python3 -m pip install torch + +2. We will start by importing the required dependencies and defining the device we will run on: + +.. code-block:: python + + import torch + import torch.nn as nn + from torch.profiler import profile, record_function, ProfilerActivity + + # AWS Graviton3 cpu + device = ("cpu") + print(f"Using {device} device") + + +3. Given linear layers are at the heart of several neural networks, including transformers, we take a linear layer for this demo. We define our neural network by subclassing ``nn.Module``, and initializing the layers in ``__init__``. We construct the network with typical large language model parameters to match a real-world scenario: + +.. code-block:: python + + class MyNeuralNetwork(nn.Module): + def __init__(self): + super().__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(4096, 4096), + nn.ReLU(), + nn.Linear(4096, 11008), + nn.ReLU(), + nn.Linear(11008, 10), + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return logits + +4. Let's create an instance of ``MyNeuralNetwork``, and move it to the device: + +.. code-block:: python + + model = MyNeuralNetwork().to(device) + print(model) + +Next, let's get the prediction probabilities by passing the logits through an instance of the ``nn.Softmax`` module: + +.. code-block:: python + + X = torch.rand(1, 64, 64, device=device) + logits = model(X) + pred_probab = nn.Softmax(dim=1)(logits) + y_pred = pred_probab.argmax(1) + print(f"Predicted class: {y_pred}") + +output: + +.. code-block:: + + Predicted class: tensor([2]) + +Our network functionality is verified. Next, we will profile the performance.
Let's check two different scenarios: small and large batch dimensions. + +**Scenario 1:** A larger batch dimension, for example 256: + +.. code-block:: python + + # warm it up first and loop over multiple times to have enough execution time + + X = torch.rand(256, 64, 64, device=device) + + with torch.set_grad_enabled(False): + for _ in range(50): + model(X) #Warmup + with profile(activities=[ProfilerActivity.CPU]) as prof: + with record_function("mymodel_inference"): + for _ in range(100): + model(X) + + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + + +The following is the profiler output with the default PyTorch configuration: + +.. table:: + :widths: auto + + ====================== ============ =========== ============= =========== ============ ============ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============ =========== ============= =========== ============ ============ + aten::addmm 97.61% 15.813s 98.61% 15.977s 53.255ms 300 + aten::clamp_min 1.09% 177.032ms 1.09% 177.032ms 885.160us 200 + aten::copy 1.00% 162.054ms 1.00% 162.054ms 540.180us 300 + mymodel_inference 0.22% 35.738ms 100.00% 16.201s 16.201s 1 + aten::linear 0.02% 2.955ms 98.66% 15.985s 53.282ms 300 + aten::t 0.01% 2.421ms 0.03% 5.043ms 16.810us 300 + aten::relu 0.01% 2.356ms 1.11% 179.388ms 896.940us 200 + ====================== ============ =========== ============= =========== ============ ============ + +**Self CPU time total:** 16.201s + + +Speed up Inference with ``bfloat16`` Fast Math Kernels +---------------------------------------------------------- + +AWS Graviton3 processors support `bfloat16 MMLA instructions `_. Arm Compute Library (`ACL `_) provides optimized ``bfloat16`` General Matrix Multiplication (GEMM) kernels for AWS Graviton processors, which are integrated into PyTorch via the MKLDNN backend starting with PyTorch 2.0. The inference performance can be optimized with the fast math GEMM kernels.
The fast math mode is not enabled by default because these kernels perform GEMM in ``bfloat16`` precision instead of ``float``, and hence results in a slight drop in the model inference accuracy. However, the accuracy drop is within the ``cosine similarity`` threshold defined for ``bfloat16`` backend in ``torchbench`` test suite, and hence acceptable for majority of the applications. To enable the fast math GEMM kernels, set the following environment variable: + +.. code-block:: bash + + $ export DNNL_DEFAULT_FPMATH_MODE=BF16 + + +When you run the above inference script, you should see the following profiler output with the MKLDNN fast math mode enabled: + +.. table:: + :widths: auto + + ====================== ============ ============ ============ ============ ============ ============ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============ ============ ============ ============ ============ ============ + aten::addmm 95.61% 6.943s 97.10% 7.052s 23.507ms 300 + aten::clamp_min 2.31% 167.653ms 2.31% 167.653ms 838.265us 200 + aten::copy 1.48% 107.593ms 1.48% 107.593ms 358.643us 300 + mymodel_inference 0.43% 31.167ms 100.00% 7.262s 7.262s 1 + aten::linear 0.04% 2.911ms 97.21% 7.060s 23.533ms 300 + aten::t 0.03% 2.414ms 0.07% 4.892ms 16.307us 300 + aten::relu 0.03% 2.281ms 2.34% 169.934ms 849.670us 200 + ====================== ============ ============ ============ ============ ============ ============ + +**Self CPU time total:** 7.262s + + +This is around ``2x (7.262s vs 16.201s)`` performance improvement with the ``bfloat16`` fastmath kernels. Next, let’s look at the smaller batch dimension scenario. + +**Scenario 2:** A smaller batch dimension, for example, 32: + +.. 
code-block:: python + + X = torch.rand(32, 64, 64, device=device) + with torch.set_grad_enabled(False): + for _ in range(50): + model(X) #Warmup + with profile(activities=[ProfilerActivity.CPU]) as prof: + with record_function("mymodel_inference"): + for _ in range(100): + model(X) + + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + + +You should see the following profiler output when the above script is run with the PyTorch default configuration: + +.. table:: + :widths: auto + + ====================== ============= ============ ============ ============ ============ ============ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============= ============ ============ ============ ============ ============ + aten::addmm 95.51% 5.821s 97.04% 5.914s 19.713ms 300 + aten::clamp_min 2.33% 142.244ms 2.33% 142.244ms 711.220us 200 + aten::copy 1.51% 92.322ms 1.51% 92.322ms 307.740us 300 + mymodel_inference 0.45% 27.713ms 100.00% 6.094s 6.094s 1 + aten::linear 0.04% 2.495ms 97.16% 5.921s 19.736ms 300 + aten::t 0.03% 2.131ms 0.07% 4.441ms 14.803us 300 + aten::relu 0.03% 1.942ms 2.37% 144.186ms 720.930us 200 + ====================== ============= ============ ============ ============ ============ ============ + +**Self CPU time total:** 6.094s + + +The following output is the profiler output when run with the MKLDNN fast math mode enabled: + +.. code-block:: bash + + $ export DNNL_DEFAULT_FPMATH_MODE=BF16 + +.. 
table:: + :widths: auto + + ====================== ============ ============ ============ ============ ============ ============= + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============ ============ ============ ============ ============ ============= + aten::addmm 93.31% 3.848s 95.66% 3.944s 13.148ms 300 + aten::clamp_min 3.43% 141.309ms 3.43% 141.309ms 706.545us 200 + aten::copy 2.33% 95.916ms 2.33% 95.916ms 319.720us 300 + mymodel_inference 0.67% 27.431ms 100.00% 4.123s 4.123s 1 + aten::linear 0.06% 2.471ms 95.83% 3.951s 13.170ms 300 + aten::t 0.05% 2.027ms 0.10% 4.243ms 14.143us 300 + aten::relu 0.05% 1.928ms 3.47% 143.237ms 716.185us 200 + ====================== ============ ============ ============ ============ ============ ============= + +**Self CPU time total:** 4.123s + +The MKLDNN fast math mode yields approximately a **1.47x (4.123s vs 6.094s)** performance improvement for smaller batch dimensions. Although this improvement is noteworthy, the overall performance still leaves room for improvement. This is because of the runtime overhead (weights reorders and kernel launch time) from oneDNN and ACL backend outweighing the compute benefits from the ACL GEMM kernels for the smaller batch compute. + + +Improve Inference Performance with OpenBLAS for Smaller Batch Dimensions +------------------------------------------------------------------------ + +The inference performance for smaller batch dimensions can be improved by offloading the smaller shapes from MKLDNN to OpenBLAS backend. We are working on making the backend selection automatic, with robust heuristics, for the future releases. Till the heuristics are implemented, the smaller shapes can be offloaded to OpenBLAS by increasing the threshold for MKLDNN backend selection. In the following example, we use ``64`` as the threshold, so that input with ``batch dimension of 32`` is not dispatched to MKLDNN. Instead, it is dispatched to OpenBLAS. + +.. 
code-block:: bash + + $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64 + +Here is the profiler output with OpenBLAS backend: + +.. table:: + :widths: auto + + ====================== ============ ============ ============ ============= ============ ============= + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============ ============ ============ ============= ============ ============= + aten::addmm 96.25% 1.958s 97.51% 1.984s 6.612ms 300 + aten::clamp_min 1.28% 26.124ms 1.28% 26.124ms 130.620us 200 + aten::copy 1.23% 24.951ms 1.23% 24.951ms 83.170us 300 + mymodel_inference 0.86% 17.423ms 100.00% 2.034s 2.034s 1 + aten::linear 0.08% 1.691ms 97.74% 1.988s 6.628ms 300 + aten::t 0.07% 1.520ms 0.14% 2.945ms 9.817us 300 + aten::relu 0.06% 1.258ms 1.35% 27.382ms 136.910us 200 + ====================== ============ ============ ============ ============= ============ ============= + +**Self CPU time total:** 2.034s + + +As you can see above, switching to OpenBLAS doubled the performance **(2.034s vs 4.123s)** compared to the default MKLDNN backend configuration. This becomes significant for even smaller batch dimensions, for example, for a batch dimension of 10: + +.. code-block:: python + + X = torch.rand(10, 64, 64, device=device) + with torch.set_grad_enabled(False): + for _ in range(50): + model(X) #Warmup + with profile(activities=[ProfilerActivity.CPU]) as prof: + with record_function("mymodel_inference"): + for _ in range(100): + model(X) + + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + + +The following is the profiler output with MKLDNN fast math mode: + +.. 
table:: + :widths: auto + + ====================== ============ ============ ============ ============ ============= ============= + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============ ============ ============ ============ ============= ============= + aten::addmm 87.81% 3.613s 91.90% 3.781s 12.604ms 300 + aten::clamp_min 7.18% 295.437ms 7.18% 295.437ms 1.477ms 200 + aten::copy 4.07% 167.516ms 4.07% 167.516ms 558.387us 300 + mymodel_inference 0.67% 27.708ms 100.00% 4.115s 4.115s 1 + aten::linear 0.06% 2.499ms 92.06% 3.788s 12.627ms 300 + aten::t 0.05% 1.982ms 0.11% 4.385ms 14.617us 300 + aten::relu 0.05% 1.932ms 7.23% 297.369ms 1.487ms 200 + ====================== ============ ============ ============ ============ ============= ============= + +**Self CPU time total:** 4.115s + + +and the following is the profiler output with the OpenBLAS backend: + +.. code-block:: bash + + $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64 + +.. table:: + :widths: auto + + ====================== ============= ============ ============ ============ ============= ============ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============= ============ ============ ============ ============= ============ + aten::addmm 92.66% 1.179s 95.23% 1.211s 4.038ms 300 + aten::clamp_min 2.83% 36.060ms 2.83% 36.060ms 180.300us 200 + aten::copy 2.52% 32.013ms 2.52% 32.013ms 106.710us 300 + mymodel_inference 1.38% 17.521ms 100.00% 1.272s 1.272s 1 + aten::linear 0.14% 1.750ms 95.60% 1.216s 4.054ms 300 + aten::t 0.12% 1.475ms 0.24% 3.033ms 10.110us 300 + aten::relu 0.10% 1.285ms 2.94% 37.345ms 186.725us 200 + ====================== ============= ============ ============ ============ ============= ============ + +**Self CPU time total:** 1.272s + + +Here we observed **3.2x (1.272s vs 4.115s)** performance improvement by tuning the backend thresholds appropriately. 
+ + +Optimize Memory Allocation Overhead with Linux Transparent Huge Pages (THP) +--------------------------------------------------------------------------- + +We also observed that for these larger networks, tensor memory allocations take a significant portion of the inference latency. This can be optimized by enabling Linux transparent huge page allocations from the PyTorch C10 memory allocator. Currently the feature is not enabled by default because it will increase the memory footprint marginally. Set the following environment variable to enable it: + +.. code-block:: bash + + $ export THP_MEM_ALLOC_ENABLE=1 + +For the batch dimension of 256 and with MKLDNN fast math mode: + +.. code-block:: python + + X = torch.rand(256, 64, 64, device=device) + with torch.set_grad_enabled(False): + for _ in range(50): + model(X) #Warmup + with profile(activities=[ProfilerActivity.CPU]) as prof: + with record_function("mymodel_inference"): + for _ in range(100): + model(X) + + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + + +The following is the profiler output with THP memory allocations enabled: + +.. 
table:: + :widths: auto + + ====================== ============ ============ ============ ============ ============== ============ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + ====================== ============ ============ ============ ============ ============== ============ + aten::addmm 91.31% 6.115s 94.39% 6.321s 21.069ms 300 + aten::clamp_min 4.82% 322.568ms 4.82% 322.568ms 1.613ms 200 + aten::copy 3.06% 204.602ms 3.06% 204.602ms 682.007us 300 + mymodel_inference 0.61% 40.777ms 100.00% 6.697s 6.697s 1 + aten::linear 0.05% 3.082ms 94.51% 6.329s 21.097ms 300 + aten::relu 0.04% 2.547ms 4.85% 325.115ms 1.626ms 200 + ====================== ============ ============ ============ ============ ============== ============ + +**Self CPU time total:** 6.697s + +This is an additional **1.08x or 8% (6.697s vs 7.262s)** improvement on top of the already optimized MKLDNN fast math mode measured above. + + +Conclusion +------------ + +In this tutorial, we covered PyTorch inference on AWS Graviton3 instances by covering the basic usage, demonstrating speedups with fast math kernels, comparing different backends for different batch dimensions, and how to optimize tensor memory allocation latencies with Linux transparent huge pages. The recommendation is to use MKLDNN backend with Bfloat16 fastmath mode and THP memory allocations for larger tensor shapes and to use OpenBLAS backend for smaller tensor shapes. We hope that you will give it a try! diff --git a/recipes_source/intel_extension_for_pytorch.rst b/recipes_source/intel_extension_for_pytorch.rst index 03416102d2b..7632ee73f3c 100644 --- a/recipes_source/intel_extension_for_pytorch.rst +++ b/recipes_source/intel_extension_for_pytorch.rst @@ -12,8 +12,8 @@ easy GPU acceleration for Intel discrete GPUs with PyTorch*. Intel® Extension for PyTorch* has been released as an open–source project at `Github `_. -- Source code for CPU is available at `master branch `_. 
-- Source code for GPU is available at `xpu-master branch `_. +- Source code for CPU is available at `main branch `_. +- Source code for GPU is available at `xpu-main branch `_. Features -------- diff --git a/recipes_source/intel_neural_compressor_for_pytorch.rst b/recipes_source/intel_neural_compressor_for_pytorch.rst index 67f1a7f333e..02ce3d7b378 100755 --- a/recipes_source/intel_neural_compressor_for_pytorch.rst +++ b/recipes_source/intel_neural_compressor_for_pytorch.rst @@ -115,7 +115,7 @@ In this tutorial, the LeNet model is used to demonstrate how to deal with *Intel return F.log_softmax(x, dim=1) model = Net() - model.load_state_dict(torch.load('./lenet_mnist_model.pth')) + model.load_state_dict(torch.load('./lenet_mnist_model.pth', weights_only=True)) The pretrained model weight `lenet_mnist_model.pth` comes from `here `_. diff --git a/recipes_source/loading_data_recipe.rst b/recipes_source/loading_data_recipe.rst new file mode 100644 index 00000000000..6ecd54b928a --- /dev/null +++ b/recipes_source/loading_data_recipe.rst @@ -0,0 +1,8 @@ +Loading data in PyTorch +======================= + +The content is deprecated. See `Datasets & DataLoaders `__ instead. + +.. raw:: html + + diff --git a/recipes_source/mobile_interpreter.rst b/recipes_source/mobile_interpreter.rst index 135eed1d51e..e6d2056e1a6 100644 --- a/recipes_source/mobile_interpreter.rst +++ b/recipes_source/mobile_interpreter.rst @@ -1,198 +1,10 @@ (beta) Efficient mobile interpreter in Android and iOS ================================================================== -**Author**: `Chen Lai `_, `Martin Yuan `_ +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------- +Redirecting in 3 seconds... -This tutorial introduces the steps to use PyTorch's efficient interpreter on iOS and Android. We will be using an Image Segmentation demo application as an example. +.. 
raw:: html -This application will take advantage of the pre-built interpreter libraries available for Android and iOS, which can be used directly with Maven (Android) and CocoaPods (iOS). It is important to note that the pre-built libraries are the available for simplicity, but further size optimization can be achieved with by utilizing PyTorch's custom build capabilities. - -.. note:: If you see the error message: `PytorchStreamReader failed locating file bytecode.pkl: file not found ()`, likely you are using a torch script model that requires the use of the PyTorch JIT interpreter (a version of our PyTorch interpreter that is not as size-efficient). In order to leverage our efficient interpreter, please regenerate the model by running: `module._save_for_lite_interpreter(${model_path})`. - - - If `bytecode.pkl` is missing, likely the model is generated with the api: `module.save(${model_psth})`. - - The api `_load_for_lite_interpreter(${model_psth})` can be helpful to validate model with the efficient mobile interpreter. - -Android -------------------- -Get the Image Segmentation demo app in Android: https://github.com/pytorch/android-demo-app/tree/master/ImageSegmentation - -1. **Prepare model**: Prepare the mobile interpreter version of model by run the script below to generate the scripted model `deeplabv3_scripted.pt` and `deeplabv3_scripted.ptl` - -.. 
code:: python - - import torch - from torch.utils.mobile_optimizer import optimize_for_mobile - model = torch.hub.load('pytorch/vision:v0.7.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - - scripted_module = torch.jit.script(model) - # Export full jit version model (not compatible mobile interpreter), leave it here for comparison - scripted_module.save("deeplabv3_scripted.pt") - # Export mobile interpreter version model (compatible with mobile interpreter) - optimized_scripted_module = optimize_for_mobile(scripted_module) - optimized_scripted_module._save_for_lite_interpreter("deeplabv3_scripted.ptl") - -2. **Use the PyTorch Android library in the ImageSegmentation app**: Update the `dependencies` part of ``ImageSegmentation/app/build.gradle`` to - -.. code:: gradle - - repositories { - maven { - url "https://oss.sonatype.org/content/repositories/snapshots" - } - } - - dependencies { - implementation 'androidx.appcompat:appcompat:1.2.0' - implementation 'androidx.constraintlayout:constraintlayout:2.0.2' - testImplementation 'junit:junit:4.12' - androidTestImplementation 'androidx.test.ext:junit:1.1.2' - androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' - implementation 'org.pytorch:pytorch_android_lite:1.9.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.9.0' - - implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3' - } - - - -3. **Update model loader api**: Update ``ImageSegmentation/app/src/main/java/org/pytorch/imagesegmentation/MainActivity.java`` by - - 4.1 Add new import: `import org.pytorch.LiteModuleLoader` - - 4.2 Replace the way to load pytorch lite model - -.. code:: java - - // mModule = Module.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.pt")); - mModule = LiteModuleLoader.load(MainActivity.assetFilePath(getApplicationContext(), "deeplabv3_scripted.ptl")); - -4. 
**Test app**: Build and run the `ImageSegmentation` app in Android Studio - -iOS -------------------- -Get ImageSegmentation demo app in iOS: https://github.com/pytorch/ios-demo-app/tree/master/ImageSegmentation - -1. **Prepare model**: Same as Android. - -2. **Build the project with Cocoapods and prebuilt interpreter** Update the `PodFile` and run `pod install`: - -.. code-block:: podfile - - target 'ImageSegmentation' do - # Comment the next line if you don't want to use dynamic frameworks - use_frameworks! - - # Pods for ImageSegmentation - pod 'LibTorch_Lite', '~>1.9.0' - end - -3. **Update library and API** - - 3.1 Update ``TorchModule.mm``: To use the custom built libraries project, use `` (in ``TorchModule.mm``): - -.. code-block:: swift - - #import - // If it's built from source with xcode, comment out the line above - // and use following headers - // #include - // #include - // #include - -.. code-block:: swift - - @implementation TorchModule { - @protected - // torch::jit::script::Module _impl; - torch::jit::mobile::Module _impl; - } - - - (nullable instancetype)initWithFileAtPath:(NSString*)filePath { - self = [super init]; - if (self) { - try { - _impl = torch::jit::_load_for_mobile(filePath.UTF8String); - // _impl = torch::jit::load(filePath.UTF8String); - // _impl.eval(); - } catch (const std::exception& exception) { - NSLog(@"%s", exception.what()); - return nil; - } - } - return self; - } - -3.2 Update ``ViewController.swift`` - -.. code-block:: swift - - // if let filePath = Bundle.main.path(forResource: - // "deeplabv3_scripted", ofType: "pt"), - // let module = TorchModule(fileAtPath: filePath) { - // return module - // } else { - // fatalError("Can't find the model file!") - // } - if let filePath = Bundle.main.path(forResource: - "deeplabv3_scripted", ofType: "ptl"), - let module = TorchModule(fileAtPath: filePath) { - return module - } else { - fatalError("Can't find the model file!") - } - -4. Build and test the app in Xcode. 
- -How to use mobile interpreter + custom build ------------------------------------------- -A custom PyTorch interpreter library can be created to reduce binary size, by only containing the operators needed by the model. In order to do that follow these steps: - -1. To dump the operators in your model, say `deeplabv3_scripted`, run the following lines of Python code: - -.. code-block:: python - - # Dump list of operators used by deeplabv3_scripted: - import torch, yaml - model = torch.jit.load('deeplabv3_scripted.ptl') - ops = torch.jit.export_opnames(model) - with open('deeplabv3_scripted.yaml', 'w') as output: - yaml.dump(ops, output) - -In the snippet above, you first need to load the ScriptModule. Then, use export_opnames to return a list of operator names of the ScriptModule and its submodules. Lastly, save the result in a yaml file. The yaml file can be generated for any PyTorch 1.4.0 or above version. You can do that by checking the value of `torch.__version__`. - -2. To run the build script locally with the prepared yaml list of operators, pass in the yaml file generate from the last step into the environment variable SELECTED_OP_LIST. Also in the arguments, specify BUILD_PYTORCH_MOBILE=1 as well as the platform/architechture type. - -**iOS**: Take the simulator build for example, the command should be: - -.. code-block:: bash - - SELECTED_OP_LIST=deeplabv3_scripted.yaml BUILD_PYTORCH_MOBILE=1 IOS_PLATFORM=SIMULATOR ./scripts/build_ios.sh - -**Android**: Take the x86 build for example, the command should be: - -.. code-block:: bash - - SELECTED_OP_LIST=deeplabv3_scripted.yaml ./scripts/build_pytorch_android.sh x86 - - - -Conclusion ----------- - -In this tutorial, we demonstrated how to use PyTorch's efficient mobile interpreter, in an Android and iOS app. - -We walked through an Image Segmentation example to show how to dump the model, build a custom torch library from source and use the new api to run model. 
- -Our efficient mobile interpreter is still under development, and we will continue improving its size in the future. Note, however, that the APIs are subject to change in future versions. - -Thanks for reading! As always, we welcome any feedback, so please create an issue `here ` - if you have any. - -Learn More ----------- - -- To learn more about PyTorch Mobile, please refer to `PyTorch Mobile Home Page `_ -- To learn more about Image Segmentation, please refer to the `Image Segmentation DeepLabV3 on Android Recipe `_ + diff --git a/recipes_source/mobile_perf.rst b/recipes_source/mobile_perf.rst index aae1447cbf8..8835ddecc6d 100644 --- a/recipes_source/mobile_perf.rst +++ b/recipes_source/mobile_perf.rst @@ -1,356 +1,10 @@ Pytorch Mobile Performance Recipes ================================== -Introduction ----------------- -Performance (aka latency) is crucial to most, if not all, -applications and use-cases of ML model inference on mobile devices. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Today, PyTorch executes the models on the CPU backend pending availability -of other hardware backends such as GPU, DSP, and NPU. +Redirecting in 3 seconds... -In this recipe, you will learn: +.. raw:: html -- How to optimize your model to help decrease execution time (higher performance, lower latency) on the mobile device. -- How to benchmark (to check if optimizations helped your use case). - - -Model preparation ------------------ - -We will start with preparing to optimize your model to help decrease execution time -(higher performance, lower latency) on the mobile device. - - -Setup -^^^^^^^ - -First we need to installed pytorch using conda or pip with version at least 1.5.0. 
- -:: - - conda install pytorch torchvision -c pytorch - -or - -:: - - pip install torch torchvision - -Code your model: - -:: - - import torch - from torch.utils.mobile_optimizer import optimize_for_mobile - - class AnnotatedConvBnReLUModel(torch.nn.Module): - def __init__(self): - super(AnnotatedConvBnReLUModel, self).__init__() - self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float) - self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float) - self.relu = torch.nn.ReLU(inplace=True) - self.quant = torch.quantization.QuantStub() - self.dequant = torch.quantization.DeQuantStub() - - def forward(self, x): - x = x.contiguous(memory_format=torch.channels_last) - x = self.quant(x) - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - x = self.dequant(x) - return x - - model = AnnotatedConvBnReLUModel() - - -``torch.quantization.QuantStub`` and ``torch.quantization.DeQuantStub()`` are no-op stubs, which will be used for quantization step. - - -1. Fuse operators using ``torch.quantization.fuse_modules`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Do not be confused that fuse_modules is in the quantization package. -It works for all ``torch.nn.Module``. - -``torch.quantization.fuse_modules`` fuses a list of modules into a single module. -It fuses only the following sequence of modules: - -- Convolution, Batch normalization -- Convolution, Batch normalization, Relu -- Convolution, Relu -- Linear, Relu - -This script will fuse Convolution, Batch Normalization and Relu in previously declared model. - -:: - - torch.quantization.fuse_modules(model, [['conv', 'bn', 'relu']], inplace=True) - - -2. Quantize your model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can find more about PyTorch quantization in -`the dedicated tutorial `_. - -Quantization of the model not only moves computation to int8, -but also reduces the size of your model on a disk. 
-That size reduction helps to reduce disk read operations during the first load of the model and decreases the amount of RAM. -Both of those resources can be crucial for the performance of mobile applications. -This code does quantization, using stub for model calibration function, you can find more about it `here `__. - -:: - - model.qconfig = torch.quantization.get_default_qconfig('qnnpack') - torch.quantization.prepare(model, inplace=True) - # Calibrate your model - def calibrate(model, calibration_data): - # Your calibration code here - return - calibrate(model, []) - torch.quantization.convert(model, inplace=True) - - - -3. Use torch.utils.mobile_optimizer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Torch mobile_optimizer package does several optimizations with the scripted model, -which will help to conv2d and linear operations. -It pre-packs model weights in an optimized format and fuses ops above with relu -if it is the next operation. - -First we script the result model from previous step: - -:: - - torchscript_model = torch.jit.script(model) - -Next we call ``optimize_for_mobile`` and save model on the disk. - -:: - - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "model.pt") - -4. Prefer Using Channels Last Tensor memory format -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Channels Last(NHWC) memory format was introduced in PyTorch 1.4.0. It is supported only for four-dimensional tensors. This memory format gives a better memory locality for most operators, especially convolution. Our measurements showed a 3x speedup of MobileNetV2 model compared with the default Channels First(NCHW) format. - -At the moment of writing this recipe, PyTorch Android java API does not support using inputs in Channels Last memory format. But it can be used on the TorchScript model level, by adding the conversion to it for model inputs. - -.. 
code-block:: python - - def forward(self, x): - x = x.contiguous(memory_format=torch.channels_last) - ... - - -This conversion is zero cost if your input is already in Channels Last memory format. After it, all operators will work preserving ChannelsLast memory format. - -5. Android - Reusing tensors for forward -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This part of the recipe is Android only. - -Memory is a critical resource for android performance, especially on old devices. -Tensors can need a significant amount of memory. -For example, standard computer vision tensor contains 1*3*224*224 elements, -assuming that data type is float and will need 588Kb of memory. - -:: - - FloatBuffer buffer = Tensor.allocateFloatBuffer(1*3*224*224); - Tensor tensor = Tensor.fromBlob(buffer, new long[]{1, 3, 224, 224}); - - -Here we allocate native memory as ``java.nio.FloatBuffer`` and creating ``org.pytorch.Tensor`` which storage will be pointing to the memory of the allocated buffer. - -For most of the use cases, we do not do model forward only once, repeating it with some frequency or as fast as possible. - -If we are doing new memory allocation for every module forward - that will be suboptimal. -Instead of this, we can reuse the same memory that we allocated on the previous step, fill it with new data, and run module forward again on the same tensor object. - -You can check how it looks in code in `pytorch android application example `_. 
- -:: - - protected AnalysisResult analyzeImage(ImageProxy image, int rotationDegrees) { - if (mModule == null) { - mModule = Module.load(moduleFileAbsoluteFilePath); - mInputTensorBuffer = - Tensor.allocateFloatBuffer(3 * 224 * 224); - mInputTensor = Tensor.fromBlob(mInputTensorBuffer, new long[]{1, 3, 224, 224}); - } - - TensorImageUtils.imageYUV420CenterCropToFloatBuffer( - image.getImage(), rotationDegrees, - 224, 224, - TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, - TensorImageUtils.TORCHVISION_NORM_STD_RGB, - mInputTensorBuffer, 0); - - Tensor outputTensor = mModule.forward(IValue.from(mInputTensor)).toTensor(); - } - -Member fields ``mModule``, ``mInputTensorBuffer`` and ``mInputTensor`` are initialized only once -and buffer is refilled using ``org.pytorch.torchvision.TensorImageUtils.imageYUV420CenterCropToFloatBuffer``. - -6. Load time optimization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -**Available since Pytorch 1.13** - -PyTorch Mobile also supports a FlatBuffer-based file format that is faster -to load. Both flatbuffer and pickle-based model file can be load with the -same ``_load_for_lite_interpreter`` (Python) or ``_load_for_mobile``(C++) API. - -To use the FlatBuffer format, instead of creating the model file with -``model._save_for_lite_interpreter('path/to/file.ptl')``, you can run the following command: - - -One can save using - -:: - - model._save_for_lite_interpreter('path/to/file.ptl', _use_flatbuffer=True) - - -The extra argument ``_use_flatbuffer`` makes a FlatBuffer file instead of a -zip file. The created file will be faster to load. 
- -For example, using ResNet-50 and running the following script: - -:: - - import torch - from torch.jit import mobile - import time - model = torch.hub.load('pytorch/vision:v0.10.0', 'deeplabv3_resnet50', pretrained=True) - model.eval() - jit_model = torch.jit.script(model) - - jit_model._save_for_lite_interpreter('/tmp/jit_model.ptl') - jit_model._save_for_lite_interpreter('/tmp/jit_model.ff', _use_flatbuffer=True) - - import timeit - print('Load ptl file:') - print(timeit.timeit('from torch.jit import mobile; mobile._load_for_lite_interpreter("/tmp/jit_model.ptl")', - number=20)) - print('Load flatbuffer file:') - print(timeit.timeit('from torch.jit import mobile; mobile._load_for_lite_interpreter("/tmp/jit_model.ff")', - number=20)) - - - -you would get the following result: - -:: - - Load ptl file: - 0.5387594579999999 - Load flatbuffer file: - 0.038842832999999466 - -While speed ups on actual mobile devices will be smaller, you can still expect -3x - 6x load time reductions. - -### Reasons to avoid using a FlatBuffer-based mobile model - -However, FlatBuffer format also has some limitations that you might want to consider: - -* It is only available in PyTorch 1.13 or later. Therefore, client devices compiled - with earlier PyTorch versions might not be able to load it. -* The Flatbuffer library imposes a 4GB limit for file sizes. So it is not suitable - for large models. - -Benchmarking ------------- - -The best way to benchmark (to check if optimizations helped your use case) - is to measure your particular use case that you want to optimize, as performance behavior can vary in different environments. - -PyTorch distribution provides a way to benchmark naked binary that runs the model forward, -this approach can give more stable measurements rather than testing inside the application. - - -Android - Benchmarking Setup -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This part of the recipe is Android only. 
- -For this you first need to build benchmark binary: - -:: - - - rm -rf build_android - BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh -DBUILD_BINARY=ON - -You should have arm64 binary at: ``build_android/bin/speed_benchmark_torch``. -This binary takes ``--model=``, ``--input_dim="1,3,224,224"`` as dimension information for the input and ``--input_type="float"`` as the type of the input as arguments. - -Once you have your android device connected, -push speedbenchark_torch binary and your model to the phone: - -:: - - adb push /data/local/tmp - adb push /data/local/tmp - - -Now we are ready to benchmark your model: - -:: - - adb shell "/data/local/tmp/speed_benchmark_torch --model=/data/local/tmp/model.pt" --input_dims="1,3,224,224" --input_type="float" - ----- output ----- - Starting benchmark. - Running warmup runs. - Main runs. - Main run finished. Microseconds per iter: 121318. Iters per second: 8.24281 - - -iOS - Benchmarking Setup -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For iOS, we'll be using our `TestApp `_ as the benchmarking tool. - -To begin with, let's apply the ``optimize_for_mobile`` method to our python script located at `TestApp/benchmark/trace_model.py `_. Simply modify the code as below. - -:: - - import torch - import torchvision - from torch.utils.mobile_optimizer import optimize_for_mobile - - model = torchvision.models.mobilenet_v2(pretrained=True) - model.eval() - example = torch.rand(1, 3, 224, 224) - traced_script_module = torch.jit.trace(model, example) - torchscript_model_optimized = optimize_for_mobile(traced_script_module) - torch.jit.save(torchscript_model_optimized, "model.pt") - -Now let's run ``python trace_model.py``. If everything works well, we should be able to generate our optimized model in the benchmark directory. - -Next, we're going to build the PyTorch libraries from source. 
- -:: - - BUILD_PYTORCH_MOBILE=1 IOS_ARCH=arm64 ./scripts/build_ios.sh - -Now that we have the optimized model and PyTorch ready, it's time to generate our XCode project and do benchmarking. To do that, we'll be using a ruby script - `setup.rb` which does the heavy lifting jobs of setting up the XCode project. - -:: - - ruby setup.rb - -Now open the `TestApp.xcodeproj` and plug in your iPhone, you're ready to go. Below is an example result from iPhoneX - -:: - - TestApp[2121:722447] Main runs - TestApp[2121:722447] Main run finished. Milliseconds per iter: 28.767 - TestApp[2121:722447] Iters per second: : 34.762 - TestApp[2121:722447] Done. + diff --git a/recipes_source/model_preparation_android.rst b/recipes_source/model_preparation_android.rst index 55ef7d9735c..22c0e17df31 100644 --- a/recipes_source/model_preparation_android.rst +++ b/recipes_source/model_preparation_android.rst @@ -1,85 +1,10 @@ Model Preparation for Android Recipe ===================================== -This recipe demonstrates how to prepare a PyTorch MobileNet v2 image classification model for Android apps, and how to set up Android projects to use the mobile-ready model file. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------------ +Redirecting in 3 seconds... -After a PyTorch model is trained or a pre-trained model is made available, it is normally not ready to be used in mobile apps yet. It needs to be quantized (see the `Quantization Recipe `_), converted to TorchScript so Android apps can load it, and optimized for mobile apps. Furthermore, Android apps need to be set up correctly to enable the use of PyTorch Mobile libraries, before they can load and use the model for inference. +.. raw:: html -Pre-requisites ------------------ - -PyTorch 1.6.0 or 1.7.0 - -torchvision 0.6.0 or 0.7.0 - -Android Studio 3.5.1 or above with NDK installed - -Steps ------------------ - -1. 
Get Pretrained and Quantized MobileNet v2 Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To get the MobileNet v2 quantized model, simply do: - -:: - - import torchvision - - model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) - -2. Script and Optimize the Model for Mobile Apps -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Use either the `script` or `trace` method to convert the quantized model to the TorchScript format: - -:: - - import torch - - dummy_input = torch.rand(1, 3, 224, 224) - torchscript_model = torch.jit.trace(model_quantized, dummy_input) - -or - -:: - - torchscript_model = torch.jit.script(model_quantized) - - -.. warning:: - The `trace` method only scripts the code path executed during the trace, so it will not work properly for models that include decision branches. See the `Script and Optimize for Mobile Recipe `_ for more details. - -Then optimize the TorchScript formatted model for mobile and save it: - -:: - - from torch.utils.mobile_optimizer import optimize_for_mobile - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "mobilenetv2_quantized.pt") - -With the total 7 or 8 (depending on if the `script` or `trace` method is called to get the TorchScript format of the model) lines of code in the two steps above, we have a model ready to be added to mobile apps. - -3. Add the Model and PyTorch Library on Android -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* In your current or a new Android Studio project, open the build.gradle file, and add the following two lines (the second one is required only if you plan to use a TorchVision model): - -:: - - implementation 'org.pytorch:pytorch_android:1.6.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' - -* Drag and drop the model file `mobilenetv2_quantized.pt` to your project's assets folder. - -That's it! 
Now you can build your Android app with the PyTorch library and the model ready to use. To actually write code to use the model, refer to the PyTorch Mobile `Android Quickstart with a HelloWorld Example `_ and `Android Hackathon Example `_. - -Learn More ------------------ - -1. `PyTorch Mobile site `_ - -2. `Introduction to TorchScript `_ + diff --git a/recipes_source/model_preparation_ios.rst b/recipes_source/model_preparation_ios.rst index 2fbacd7fa68..cbb4927eaeb 100644 --- a/recipes_source/model_preparation_ios.rst +++ b/recipes_source/model_preparation_ios.rst @@ -1,95 +1,10 @@ Model Preparation for iOS Recipe ===================================== -This recipe demonstrates how to prepare a PyTorch MobileNet v2 image classification model for iOS apps, and how to set up an iOS project to use the mobile-ready model file. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ------------------ +Redirecting in 3 seconds... -After a PyTorch model is trained or a pre-trained model is made available, it is normally not ready to be used in mobile apps yet. It needs to be quantized (see `Quantization Recipe `_ for more details), converted to TorchScript so iOS apps can load it and optimized for mobile apps (see `Script and Optimize for Mobile Recipe `_). Furthermore, iOS apps need to be set up correctly to enable the use of PyTorch Mobile libraries, before they can load and use the model for inference. +.. raw:: html -Pre-requisites ------------------ - -PyTorch 1.6.0 or 1.7.0 - -torchvision 0.6.0 or 0.7.0 - -Xcode 11 or 12 - -Steps ------------------ - -1. Get Pretrained and Quantized MobileNet v2 Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To get the MobileNet v2 quantized model, simply do: - -:: - - import torchvision - - model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True) - -2. 
Script and Optimize the Model for Mobile Apps -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Use either the script or trace method to convert the quantized model to the TorchScript format: - -:: - - import torch - - dummy_input = torch.rand(1, 3, 224, 224) - torchscript_model = torch.jit.trace(model_quantized, dummy_input) - -or - -:: - - torchscript_model = torch.jit.script(model_quantized) - -.. warning:: - The `trace` method only scripts the code path executed during the trace, so it will not work properly for models that include decision branches. See the `Script and Optimize for Mobile Recipe `_ for more details. - - -Then optimize the TorchScript formatted model for mobile and save it: - -:: - - from torch.utils.mobile_optimizer import optimize_for_mobile - torchscript_model_optimized = optimize_for_mobile(torchscript_model) - torch.jit.save(torchscript_model_optimized, "mobilenetv2_quantized.pt") - -With the total 7 or 8 (depending on if the script or trace method is called to get the TorchScript format of the model) lines of code in the two steps above, we have a model ready to be added to mobile apps. - -3. Add the Model and PyTorch Library on iOS -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To use the mobile-ready model `mobilenetv2_quantized.pt` in an iOS app, either create a new Xcode project or in your existing Xcode project, then follow the steps below: - -* Open a Mac Terminal, cd to your iOS app's project folder; - -* If your iOS app does not use Cocoapods yet, run `pod init` first to generate the `Podfile` file. - -* Edit `Podfile` either from Xcode or any editor, and add the following line under the target: - -:: - - pod 'LibTorch', '~>1.6.1' - -* Run `pod install` from the Terminal and then open your project's xcworkspace file; - -* Save the two files `TorchModule.h` and `TorchModule.mm` from `here `_ and drag and drop them to your project. 
If your project is Swift based, a message box with the title "Would you like to configure an Objective-C bridging header?" will show up; click the "Create Bridging Header" button to create a Swift to Objective-c bridging header file, and add `#import "TorchModule.h"` to the header file `-Bridging-Header.h`; - -* Drag and drop the model file `mobilenetv2_quantized.pt` to the project. - -After these steps, you can successfully build and run your Xcode project. To actually write code to use the model, refer to the PyTorch Mobile `iOS Code Walkthrough `_ and two complete ready-to-run sample iOS apps `HelloWorld `_ and `iOS Hackathon Example `_. - - -Learn More ------------------ - -1. `PyTorch Mobile site `_ - -2. `Introduction to TorchScript `_ + diff --git a/recipes_source/profile_with_itt.rst b/recipes_source/profile_with_itt.rst index 7ddb1ab3fee..566fd614f22 100644 --- a/recipes_source/profile_with_itt.rst +++ b/recipes_source/profile_with_itt.rst @@ -58,6 +58,10 @@ Launch Intel® VTune™ Profiler To verify the functionality, you need to start an Intel® VTune™ Profiler instance. Please check the `Intel® VTune™ Profiler User Guide `__ for steps to launch Intel® VTune™ Profiler. +.. note:: + Users can also use the web server UI by following the `Intel® VTune™ Profiler Web Server UI Guide `__. + For example: ``vtune-backend --web-port=8080 --allow-remote-access --enable-server-profiling`` + Once you get the Intel® VTune™ Profiler GUI launched, you should see a user interface as below: .. figure:: /_static/img/itt_tutorial/vtune_start.png @@ -66,8 +70,8 @@ Once you get the Intel® VTune™ Profiler GUI launched, you should see a user i Three sample results are available on the left side navigation bar under `sample (matrix)` project. If you do not want profiling results appear in this default sample project, you can create a new project via the button `New Project...` under the blue `Configure Analysis...` button.
To start a new profiling, click the blue `Configure Analysis...` button to initiate configuration of the profiling. -Configure Profiling -~~~~~~~~~~~~~~~~~~~ +Configure Profiling for CPU +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Once you click the `Configure Analysis...` button, you should see the screen below: @@ -77,6 +81,16 @@ Once you click the `Configure Analysis...` button, you should see the screen bel The right side of the windows is split into 3 parts: `WHERE` (top left), `WHAT` (bottom left), and `HOW` (right). With `WHERE`, you can assign a machine where you want to run the profiling on. With `WHAT`, you can set the path of the application that you want to profile. To profile a PyTorch script, it is recommended to wrap all manual steps, including activating a Python environment and setting required environment variables, into a bash script, then profile this bash script. In the screenshot above, we wrapped all steps into the `launch.sh` bash script and profile `bash` with the parameter to be ``. On the right side `HOW`, you can choose whatever type that you would like to profile. Intel® VTune™ Profiler provides a bunch of profiling types that you can choose from. Details can be found at `Intel® VTune™ Profiler User Guide `__. + +Configure Profiling for XPU +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Select the GPU Offload profiling type instead of Hotspots, then follow the same instructions as for CPU to launch the application. + +.. figure:: /_static/img/itt_tutorial/vtune_xpu_config.png + :width: 100% + :align: center + + Read Profiling Result ~~~~~~~~~~~~~~~~~~~~~ @@ -101,6 +115,18 @@ As illustrated on the right side navigation bar, brown portions in the timeline Of course there are much more enriched sets of profiling features that Intel® VTune™ Profiler provides to help you understand a performance issue. When you understand the root cause of a performance issue, you can get it fixed. More detailed usage instructions are available at `Intel® VTune™ Profiler User Guide `__.
+Read XPU Profiling Result +~~~~~~~~~~~~~~~~~~~~~~~~~ + +After a successful profiling run with ITT, you can open the `Platform` tab of the profiling result to see labels in the Intel® VTune™ Profiler timeline. + +.. figure:: /_static/img/itt_tutorial/vtune_xpu_timeline.png + :width: 100% + :align: center + + +The timeline shows the main thread as a `python` thread on the top. Labeled PyTorch operators and customized regions are shown in the main thread row. All operators starting with `aten::` are operators labeled implicitly by the ITT feature in PyTorch. The timeline also shows the GPU Computing Queue on the top, where users can see the different XPU kernels dispatched into the GPU queue. + A short sample code showcasing how to use PyTorch ITT APIs ---------------------------------------------------------- @@ -128,8 +154,12 @@ The topology is formed by two operators, `Conv2d` and `Linear`. Three iterations return x def main(): - m = ITTSample() + m = ITTSample() + # uncomment the line below for XPU + # m = m.to("xpu") x = torch.rand(10, 3, 244, 244) + # uncomment the line below for XPU + # x = x.to("xpu") with torch.autograd.profiler.emit_itt(): for i in range(3) # Labeling a region with pair of range_push and range_pop diff --git a/recipes_source/ptmobile_recipes_summary.rst b/recipes_source/ptmobile_recipes_summary.rst index cddee940f2a..fdf9f58e43d 100644 --- a/recipes_source/ptmobile_recipes_summary.rst +++ b/recipes_source/ptmobile_recipes_summary.rst @@ -1,40 +1,10 @@ Summary of PyTorch Mobile Recipes ===================================== -This summary provides a top level overview of recipes for PyTorch Mobile to help developers choose which recipes to follow for their PyTorch-powered mobile app development. +PyTorch Mobile is no longer actively supported. Please check out `ExecuTorch `__. -Introduction ---------------- +Redirecting in 3 seconds...
-When a PyTorch model is trained or retrained, or when a pre-trained model is available, for mobile deployment, follow the the recipes outlined in this summary so mobile apps can successfully use the model. +.. raw:: html -Pre-requisites ----------------- - -PyTorch 1.6.0 or 1.7.0 - -(Optional) torchvision 0.6.0 or 0.7.0 - -For iOS development: Xcode 11 or 12 - -For Android development: Android Studio 3.5.1 or above (with NDK installed); or Android SDK, NDK, Gradle, JDK. - -New Recipes for PyTorch Mobile --------------------------------- - -* (Recommended) To fuse a list of PyTorch modules into a single module to reduce the model size before quantization, read the `Fuse Modules recipe `_. - -* (Recommended) To reduce the model size and make it run faster without losing much on accuracy, read the `Quantization Recipe `_. - -* (Must) To convert the model to TorchScipt and (optional) optimize it for mobile apps, read the `Script and Optimize for Mobile Recipe `_. - -* (Must for iOS development) To add the model in an iOS project and use PyTorch pod for iOS, read the `Model preparation for iOS Recipe `_. - -* (Must for Android development) To add the model in an Android project and use the PyTorch library for Android, read the `Model preparation for Android Recipe `_. - - -Learn More ------------------ - -1. `PyTorch Mobile site `_ -2. `PyTorch Mobile Performance Recipes `_ + diff --git a/recipes_source/quantization.rst b/recipes_source/quantization.rst index a8719c19813..ac9cd48fe8c 100644 --- a/recipes_source/quantization.rst +++ b/recipes_source/quantization.rst @@ -81,7 +81,7 @@ The full documentation of the `quantize_dynamic` API call is `here `_ # are chosen based on numerical properties, but also on experience. 
diff --git a/recipes_source/recipes/defining_a_neural_network.py b/recipes_source/recipes/defining_a_neural_network.py index 07d91edcd8c..da58a1c5752 100644 --- a/recipes_source/recipes/defining_a_neural_network.py +++ b/recipes_source/recipes/defining_a_neural_network.py @@ -17,7 +17,7 @@ In this recipe, we will use ``torch.nn`` to define a neural network intended for the `MNIST -dataset `__. +dataset `__. Setup ----- diff --git a/recipes_source/recipes/dynamic_quantization.py b/recipes_source/recipes/dynamic_quantization.py index cdb3d22da72..e69d7bfd02e 100644 --- a/recipes_source/recipes/dynamic_quantization.py +++ b/recipes_source/recipes/dynamic_quantization.py @@ -24,7 +24,7 @@ reduction without losing a lot of accuracy. What is dynamic quantization? -------------- +----------------------------- Quantizing a network means converting it to use a reduced precision integer representation for the weights and/or activations. This saves on @@ -162,16 +162,9 @@ def forward(self,inputs,hidden): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Now we get to the fun part. First we create an instance of the model -# called ``float\_lstm`` then we are going to quantize it. We're going to use -# the -# -# :: -# -# torch.quantization.quantize_dynamic() -# -# function here (`see -# documentation `__) -# which takes the model, then a list of the submodules which we want to +# called ``float_lstm`` then we are going to quantize it. We're going to use +# the `torch.quantization.quantize_dynamic `__ function, which takes the model, then a list of the submodules +# which we want to # have quantized if they appear, then the datatype we are targeting. This # function returns a quantized version of the original model as a new # module. 
@@ -281,7 +274,7 @@ def print_size_of_model(model, label=""): ###################################################################### -# Learn More +# Learn More # ------------ # We've explained what dynamic quantization is, what benefits it brings, # and you have used the ``torch.quantization.quantize_dynamic()`` function @@ -292,20 +285,10 @@ def print_size_of_model(model, label=""): # # # Additional Resources -# ========= -# Documentation -# ~~~~~~~~~~~~~~ -# -# `Quantization API Documentaion `_ -# -# Tutorials -# ~~~~~~~~~~~~~~ -# -# `(beta) Dynamic Quantization on BERT `_ -# -# `(beta) Dynamic Quantization on an LSTM Word Language Model `_ +# -------------------- # -# Blogs -# ~~~~~~~~~~~~~~ -# `Introduction to Quantization on PyTorch `_ +# * `Quantization API Documentation `_ +# * `(beta) Dynamic Quantization on BERT `_ +# * `(beta) Dynamic Quantization on an LSTM Word Language Model `_ +# * `Introduction to Quantization on PyTorch `_ # diff --git a/recipes_source/recipes/loading_data_recipe.py b/recipes_source/recipes/loading_data_recipe.py deleted file mode 100644 index 63efbdc01ce..00000000000 --- a/recipes_source/recipes/loading_data_recipe.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Loading data in PyTorch -======================= -PyTorch features extensive neural network building blocks with a simple, -intuitive, and stable API. PyTorch includes packages to prepare and load -common datasets for your model. - -Introduction ------------- -At the heart of PyTorch data loading utility is the -`torch.utils.data.DataLoader `__ -class. It represents a Python iterable over a dataset. Libraries in -PyTorch offer built-in high-quality datasets for you to use in -`torch.utils.data.Dataset `__. -These datasets are currently available in: - -* `torchvision `__ -* `torchaudio `__ -* `torchtext `__ - -with more to come.
-Using the ``yesno`` dataset from ``torchaudio.datasets.YESNO``, we will -demonstrate how to effectively and efficiently load data from a PyTorch -``Dataset`` into a PyTorch ``DataLoader``. -""" - - - -###################################################################### -# Setup -# ----- -# Before we begin, we need to install ``torchaudio`` to have access to the -# dataset. - -# pip install torchaudio - -####################################################### -# To run in Google Colab, uncomment the following line: - -# !pip install torchaudio - -############################# -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Access the data in the dataset -# 3. Loading the data -# 4. Iterate over the data -# 5. [Optional] Visualize the data -# -# -# 1. Import necessary libraries for loading our data -# --------------------------------------------------------------- -# -# For this recipe, we will use ``torch`` and ``torchaudio``. Depending on -# what built-in datasets you use, you can also install and import -# ``torchvision`` or ``torchtext``. -# - -import torch -import torchaudio - - -###################################################################### -# 2. Access the data in the dataset -# --------------------------------------------------------------- -# -# The ``yesno`` dataset in ``torchaudio`` features sixty recordings of one -# individual saying yes or no in Hebrew; with each recording being eight -# words long (`read more here `__). -# -# ``torchaudio.datasets.YESNO`` creates a dataset for ``yesno``. -torchaudio.datasets.YESNO( - root='./', - url='http://www.openslr.org/resources/1/waves_yesno.tar.gz', - folder_in_archive='waves_yesno', - download=True) - -########################################################################### -# Each item in the dataset is a tuple of the form: (waveform, sample_rate, -# labels). 
-# -# You must set a ``root`` for the ``yesno`` dataset, which is where the -# training and testing dataset will exist. The other parameters are -# optional, with their default values shown. Here is some additional -# useful info on the other parameters: - -# * ``download``: If true, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not downloaded again. -# -# Let’s access our ``yesno`` data: -# - -# A data point in ``yesno`` is a tuple (waveform, sample_rate, labels) where labels -# is a list of integers with 1 for yes and 0 for no. -yesno_data = torchaudio.datasets.YESNO('./', download=True) - -# Pick data point number 3 to see an example of the the ``yesno_data``: -n = 3 -waveform, sample_rate, labels = yesno_data[n] -print("Waveform: {}\nSample rate: {}\nLabels: {}".format(waveform, sample_rate, labels)) - - -###################################################################### -# When using this data in practice, it is best practice to provision the -# data into a “training” dataset and a “testing” dataset. This ensures -# that you have out-of-sample data to test the performance of your model. -# -# 3. Loading the data -# --------------------------------------------------------------- -# -# Now that we have access to the dataset, we must pass it through -# ``torch.utils.data.DataLoader``. The ``DataLoader`` combines the dataset -# and a sampler, returning an iterable over the dataset. -# - -data_loader = torch.utils.data.DataLoader(yesno_data, - batch_size=1, - shuffle=True) - - -###################################################################### -# 4. Iterate over the data -# --------------------------------------------------------------- -# -# Our data is now iterable using the ``data_loader``. This will be -# necessary when we begin training our model! 
You will notice that now -# each data entry in the ``data_loader`` object is converted to a tensor -# containing tensors representing our waveform, sample rate, and labels. -# - -for data in data_loader: - print("Data: ", data) - print("Waveform: {}\nSample rate: {}\nLabels: {}".format(data[0], data[1], data[2])) - break - - -###################################################################### -# 5. [Optional] Visualize the data -# --------------------------------------------------------------- -# -# You can optionally visualize your data to further understand the output -# from your ``DataLoader``. -# - -import matplotlib.pyplot as plt - -print(data[0][0].numpy()) - -plt.figure() -plt.plot(waveform.t().numpy()) - - -###################################################################### -# Congratulations! You have successfully loaded data in PyTorch. -# -# Learn More -# ---------- -# -# Take a look at these other recipes to continue your learning: -# -# - `Defining a Neural Network `__ -# - `What is a state_dict in PyTorch `__ diff --git a/recipes_source/recipes/module_load_state_dict_tips.py b/recipes_source/recipes/module_load_state_dict_tips.py index 3a5eea41c41..70e9830cb3c 100644 --- a/recipes_source/recipes/module_load_state_dict_tips.py +++ b/recipes_source/recipes/module_load_state_dict_tips.py @@ -2,6 +2,7 @@ Tips for Loading an ``nn.Module`` from a Checkpoint =================================================== +**Author:** `Mikayla Gawarecki `_ If you're loading a checkpoint and want to reduce compute and memory as much as possible, this tutorial shares some recommended practices. In particular, we will discuss @@ -38,7 +39,7 @@ def forward(self, x): # to ``torch.load``, the ``torch.device()`` context manager and the ``assign`` # keyword argument to ``nn.Module.load_state_dict()``. 
-state_dict = torch.load('checkpoint.pth', mmap=True) +state_dict = torch.load('checkpoint.pth', mmap=True, weights_only=True) with torch.device('meta'): meta_m = SomeModule(1000) meta_m.load_state_dict(state_dict, assign=True) @@ -46,7 +47,7 @@ def forward(self, x): ############################################################################# # Compare the snippet below to the one above: -state_dict = torch.load('checkpoint.pth') +state_dict = torch.load('checkpoint.pth', weights_only=True) m = SomeModule(1000) m.load_state_dict(state_dict) @@ -70,7 +71,7 @@ def forward(self, x): # * Waiting for the entire checkpoint to be loaded into RAM before performing, for example, some per-tensor processing. start_time = time.time() -state_dict = torch.load('checkpoint.pth') +state_dict = torch.load('checkpoint.pth', weights_only=True) end_time = time.time() print(f"loading time without mmap={end_time - start_time}") @@ -83,7 +84,7 @@ def forward(self, x): # storages will be memory-mapped. start_time = time.time() -state_dict = torch.load('checkpoint.pth', mmap=True) +state_dict = torch.load('checkpoint.pth', mmap=True, weights_only=True) end_time = time.time() print(f"loading time with mmap={end_time - start_time}") @@ -152,8 +153,13 @@ def my_processing_function(key, device): # ``nn.Module.parameters()``, the optimizer must be initialized after the module # is loaded from state dict if ``assign=True`` is passed. +# As of PyTorch 2.3.0, one can use ``torch.__future__.set_swap_module_params_on_conversion`` to +# avoid this caveat. This `recipe `_ +# provides more details. + new_m.load_state_dict(state_dict, assign=True) -# This MUST be done AFTER the load_state_dict with assign. +# Before 2.3.0, this MUST be done AFTER the load_state_dict with assign. 
+# In versions >= 2.3.0, one can consider setting ``torch.__future__.set_swap_module_params_on_conversion`` opt = torch.optim.SGD(new_m.parameters(), lr=1e-3) ############################################################################### diff --git a/recipes_source/recipes/profiler_recipe.py b/recipes_source/recipes/profiler_recipe.py index a88ea87feca..4d43726e71f 100644 --- a/recipes_source/recipes/profiler_recipe.py +++ b/recipes_source/recipes/profiler_recipe.py @@ -16,7 +16,7 @@ ----- To install ``torch`` and ``torchvision`` use the following command: -:: +.. code-block:: sh pip install torch torchvision @@ -34,8 +34,7 @@ # 4. Using profiler to analyze memory consumption # 5. Using tracing functionality # 6. Examining stack traces -# 7. Visualizing data as a flame graph -# 8. Using profiler to analyze long-running jobs +# 7. Using profiler to analyze long-running jobs # # 1. Import all necessary libraries # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -71,10 +70,10 @@ # - ``ProfilerActivity.CPU`` - PyTorch operators, TorchScript functions and # user-defined code labels (see ``record_function`` below); # - ``ProfilerActivity.CUDA`` - on-device CUDA kernels; +# - ``ProfilerActivity.XPU`` - on-device XPU kernels; # - ``record_shapes`` - whether to record shapes of the operator inputs; # - ``profile_memory`` - whether to report amount of memory consumed by # model's Tensors; -# - ``use_cuda`` - whether to measure execution time of CUDA kernels. # # Note: when using CUDA, profiler also shows the runtime CUDA events # occurring on the host. 
@@ -121,7 +120,8 @@ # aten::mean 332.000us 2.631ms 125.286us 21 # aten::select 1.668ms 2.292ms 8.988us 255 # --------------------------------- ------------ ------------ ------------ ------------ -# Self CPU time total: 57.549ms +# Self CPU time total: 57.549ms +# ###################################################################### # Here we see that, as expected, most of the time is spent in convolution (and specifically in ``mkldnn_convolution`` @@ -138,7 +138,7 @@ ######################################################################################## # The output might look like this (omitting some columns): # -# :: +# .. code-block:: sh # # --------------------------------- ------------ ------------------------------------------- # Name CPU total Input Shapes @@ -155,22 +155,34 @@ # aten::conv2d 4.751ms [[5,256,14,14], [256,256,3,3], [], ..., []] # --------------------------------- ------------ ------------------------------------------- # Self CPU time total: 57.549ms +# ###################################################################### # Note the occurrence of ``aten::convolution`` twice with different input shapes.
###################################################################### -# Profiler can also be used to analyze performance of models executed on GPUs: - -model = models.resnet18().cuda() -inputs = torch.randn(5, 3, 224, 224).cuda() - -with profile(activities=[ - ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof: +# Profiler can also be used to analyze performance of models executed on GPUs and XPUs: +# Users could switch between cpu, cuda and xpu +if torch.cuda.is_available(): + device = 'cuda' +elif torch.xpu.is_available(): + device = 'xpu' +else: + print('Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices') + import sys + sys.exit(0) + +activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA, ProfilerActivity.XPU] +sort_by_keyword = device + "_time_total" + +model = models.resnet18().to(device) +inputs = torch.randn(5, 3, 224, 224).to(device) + +with profile(activities=activities, record_shapes=True) as prof: with record_function("model_inference"): model(inputs) -print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) +print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=10)) ###################################################################### # (Note: the first use of CUDA profiling may bring an extra overhead.) @@ -178,7 +190,7 @@ ###################################################################### # The resulting table output (omitting some columns): # -# :: +# .. 
code-block:: sh # # ------------------------------------------------------- ------------ ------------ # Name Self CUDA CUDA total @@ -196,6 +208,37 @@ # ------------------------------------------------------- ------------ ------------ # Self CPU time total: 23.015ms # Self CUDA time total: 11.666ms +# +###################################################################### + + +###################################################################### +# (Note: the first use of XPU profiling may bring an extra overhead.) + +###################################################################### +# The resulting table output (omitting some columns): +# +# .. code-block:: sh +# +# ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ +# Name Self XPU Self XPU % XPU total XPU time avg # of Calls +# ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ +# model_inference 0.000us 0.00% 2.567ms 2.567ms 1 +# aten::conv2d 0.000us 0.00% 1.871ms 93.560us 20 +# aten::convolution 0.000us 0.00% 1.871ms 93.560us 20 +# aten::_convolution 0.000us 0.00% 1.871ms 93.560us 20 +# aten::convolution_overrideable 1.871ms 72.89% 1.871ms 93.560us 20 +# gen_conv 1.484ms 57.82% 1.484ms 74.216us 20 +# aten::batch_norm 0.000us 0.00% 432.640us 21.632us 20 +# aten::_batch_norm_impl_index 0.000us 0.00% 432.640us 21.632us 20 +# aten::native_batch_norm 432.640us 16.85% 432.640us 21.632us 20 +# conv_reorder 386.880us 15.07% 386.880us 6.448us 60 +# ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ +# Self CPU time total: 712.486ms +# Self XPU time total: 2.567ms + +# + ###################################################################### # Note the occurrence of on-device kernels in the output (e.g. ``sgemm_32x32x32_NN``).
@@ -241,7 +284,7 @@ ############################################################################# # The output might look like this (omitting some columns): # -# :: +# .. code-block:: sh # # --------------------------------- ------------ ------------ ------------ # Name CPU Mem Self CPU Mem # of Calls @@ -258,23 +301,29 @@ # aten::max_pool2d_with_indices 11.48 Mb 11.48 Mb 1 # --------------------------------- ------------ ------------ ------------ # Self CPU time total: 53.064ms +# ###################################################################### # 5. Using tracing functionality # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Profiling results can be outputted as a ``.json`` trace file: +# Tracing CUDA or XPU kernels +# Users could switch between cpu, cuda and xpu +device = 'cuda' -model = models.resnet18().cuda() -inputs = torch.randn(5, 3, 224, 224).cuda() +activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA, ProfilerActivity.XPU] -with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: +model = models.resnet18().to(device) +inputs = torch.randn(5, 3, 224, 224).to(device) + +with profile(activities=activities) as prof: model(inputs) prof.export_chrome_trace("trace.json") ###################################################################### -# You can examine the sequence of profiled operators and CUDA kernels +# You can examine the sequence of profiled operators and CUDA/XPU kernels # in Chrome trace viewer (``chrome://tracing``): # # .. 
image:: ../../_static/img/trace_img.png @@ -285,20 +334,21 @@ # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # Profiler can be used to analyze Python and TorchScript stack traces: +sort_by_keyword = "self_" + device + "_time_total" with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + activities=activities, with_stack=True, ) as prof: model(inputs) # Print aggregated stats -print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2)) +print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_by_keyword, row_limit=2)) ################################################################################# # The output might look like this (omitting some columns): # -# :: +# .. code-block:: sh # # ------------------------- ----------------------------------------------------------- # Name Source Location @@ -316,40 +366,15 @@ # ------------------------- ----------------------------------------------------------- # Self CPU time total: 34.016ms # Self CUDA time total: 11.659ms +# ###################################################################### # Note the two convolutions and the two call sites in ``torchvision/models/resnet.py`` script. # # (Warning: stack tracing adds an extra profiling overhead.) - ###################################################################### -# 7. Visualizing data as a flame graph -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Execution time (``self_cpu_time_total`` and ``self_cuda_time_total`` metrics) and stack traces -# can also be visualized as a flame graph. To do this, first export the raw data using ``export_stacks`` (requires ``with_stack=True``): - -prof.export_stacks("/tmp/profiler_stacks.txt", "self_cuda_time_total") - -###################################################################### -# We recommend using `Flamegraph tool `_ to generate an -# interactive ``.svg`` file: -# -# .. 
code-block:: sh -# -# git clone https://github.com/brendangregg/FlameGraph -# cd FlameGraph -# ./flamegraph.pl --title "CUDA time" --countname "us." /tmp/profiler_stacks.txt > perf_viz.svg - -###################################################################### -# -# .. image:: ../../_static/img/perf_viz.png -# :scale: 25 % - - -###################################################################### -# 8. Using profiler to analyze long-running jobs +# 7. Using profiler to analyze long-running jobs # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # PyTorch profiler offers an additional API to handle long-running jobs @@ -407,15 +432,17 @@ # To send the signal to the profiler that the next step has started, call ``prof.step()`` function. # The current profiler step is stored in ``prof.step_num``. # -# The following example shows how to use all of the concepts above: +# The following example shows how to use all of the concepts above for CUDA and XPU Kernels: + +sort_by_keyword = "self_" + device + "_time_total" def trace_handler(p): - output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=10) + output = p.key_averages().table(sort_by=sort_by_keyword, row_limit=10) print(output) p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json") with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + activities=activities, schedule=torch.profiler.schedule( wait=1, warmup=1, @@ -426,14 +453,12 @@ def trace_handler(p): model(inputs) p.step() - ###################################################################### # Learn More # ---------- # # Take a look at the following recipes/tutorials to continue your learning: # -# - `PyTorch Benchmark `_ -# - `PyTorch Profiler with TensorBoard `_ tutorial -# - `Visualizing models, data, and training with TensorBoard `_ tutorial +# - `PyTorch Benchmark `_ +# - `Visualizing models, data, and training with TensorBoard `_ tutorial # diff --git 
a/recipes_source/recipes/save_load_across_devices.py b/recipes_source/recipes/save_load_across_devices.py deleted file mode 100644 index cd311a62365..00000000000 --- a/recipes_source/recipes/save_load_across_devices.py +++ /dev/null @@ -1,182 +0,0 @@ -""" -Saving and loading models across devices in PyTorch -=================================================== - -There may be instances where you want to save and load your neural -networks across different devices. - -Introduction ------------- - -Saving and loading models across devices is relatively straightforward -using PyTorch. In this recipe, we will experiment with saving and -loading models across CPUs and GPUs. - -Setup ------ - -In order for every code block to run properly in this recipe, you must -first change the runtime to “GPU” or higher. Once you do, we need to -install ``torch`` if it isn’t already available. - -:: - - pip install torch - -""" - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Save on a GPU, load on a CPU -# 4. Save on a GPU, load on a GPU -# 5. Save on a CPU, load on a GPU -# 6. Saving and loading ``DataParallel`` models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. 
-# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Save on GPU, Load on CPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a CPU that was trained with a GPU, pass -# ``torch.device('cpu')`` to the ``map_location`` argument in the -# ``torch.load()`` function. -# - -# Specify a path to save to -PATH = "model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device('cpu') -model = Net() -model.load_state_dict(torch.load(PATH, map_location=device)) - - -###################################################################### -# In this case, the storages underlying the tensors are dynamically -# remapped to the CPU device using the ``map_location`` argument. -# -# 4. Save on GPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on GPU, simply -# convert the initialized model to a CUDA optimized model using -# ``model.to(torch.device('cuda'))``. -# -# Be sure to use the ``.to(torch.device('cuda'))`` function on all model -# inputs to prepare the data for the model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -model.load_state_dict(torch.load(PATH)) -model.to(device) - - -###################################################################### -# Note that calling ``my_tensor.to(device)`` returns a new copy of -# ``my_tensor`` on GPU. 
It does NOT overwrite ``my_tensor``. Therefore, -# remember to manually overwrite tensors: -# ``my_tensor = my_tensor.to(torch.device('cuda'))``. -# -# 5. Save on CPU, Load on GPU -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# When loading a model on a GPU that was trained and saved on CPU, set the -# ``map_location`` argument in the ``torch.load()`` function to -# ``cuda:device_id``. This loads the model to a given GPU device. -# -# Be sure to call ``model.to(torch.device('cuda'))`` to convert the -# model’s parameter tensors to CUDA tensors. -# -# Finally, also be sure to use the ``.to(torch.device('cuda'))`` function -# on all model inputs to prepare the data for the CUDA optimized model. -# - -# Save -torch.save(net.state_dict(), PATH) - -# Load -device = torch.device("cuda") -model = Net() -# Choose whatever GPU device number you want -model.load_state_dict(torch.load(PATH, map_location="cuda:0")) -# Make sure to call input = input.to(device) on any input tensors that you feed to the model -model.to(device) - - -###################################################################### -# 6. Saving ``torch.nn.DataParallel`` Models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# ``torch.nn.DataParallel`` is a model wrapper that enables parallel GPU -# utilization. -# -# To save a ``DataParallel`` model generically, save the -# ``model.module.state_dict()``. This way, you have the flexibility to -# load the model any way you want to any device you want. -# - -# Save -torch.save(net.module.state_dict(), PATH) - -# Load to whatever device you want - - -###################################################################### -# Congratulations! You have successfully saved and loaded models across -# devices in PyTorch. 
-# diff --git a/recipes_source/recipes/save_load_across_devices.rst b/recipes_source/recipes/save_load_across_devices.rst new file mode 100644 index 00000000000..fbda1562201 --- /dev/null +++ b/recipes_source/recipes/save_load_across_devices.rst @@ -0,0 +1,10 @@ +Save Load Across Devices +======================== + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py deleted file mode 100644 index 31b14f3a28a..00000000000 --- a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Saving and loading a general checkpoint in PyTorch -================================================== -Saving and loading a general checkpoint model for inference or -resuming training can be helpful for picking up where you last left off. -When saving a general checkpoint, you must save more than just the -model’s state_dict. It is important to also save the optimizer’s -state_dict, as this contains buffers and parameters that are updated as -the model trains. Other items that you may want to save are the epoch -you left off on, the latest recorded training loss, external -``torch.nn.Embedding`` layers, and more, based on your own algorithm. - -Introduction ------------- -To save multiple checkpoints, you must organize them in a dictionary and -use ``torch.save()`` to serialize the dictionary. A common PyTorch -convention is to save these checkpoints using the ``.tar`` file -extension. To load the items, first initialize the model and optimizer, -then load the dictionary locally using torch.load(). From here, you can -easily access the saved items by simply querying the dictionary as you -would expect. 
- -In this recipe, we will explore how to save and load multiple -checkpoints. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -:: - - pip install torch - - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save the general checkpoint -# 5. Load the general checkpoint -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. 
Save the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Additional information -EPOCH = 5 -PATH = "model.pt" -LOSS = 0.4 - -torch.save({ - 'epoch': EPOCH, - 'model_state_dict': net.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - 'loss': LOSS, - }, PATH) - - -###################################################################### -# 5. Load the general checkpoint -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the model and optimizer, then load the -# dictionary locally. -# - -model = Net() -optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH) -model.load_state_dict(checkpoint['model_state_dict']) -optimizer.load_state_dict(checkpoint['optimizer_state_dict']) -epoch = checkpoint['epoch'] -loss = checkpoint['loss'] - -model.eval() -# - or - -model.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded a general -# checkpoint for inference and/or resuming training in PyTorch. -# diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst new file mode 100644 index 00000000000..b868c26a6cd --- /dev/null +++ b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.rst @@ -0,0 +1,10 @@ +Saving And Loading A General Checkpoint +======================================= + +This tutorial was deprecated. 
There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.py b/recipes_source/recipes/saving_and_loading_models_for_inference.py deleted file mode 100644 index cd24b77c1de..00000000000 --- a/recipes_source/recipes/saving_and_loading_models_for_inference.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Saving and loading models for inference in PyTorch -================================================== -There are two approaches for saving and loading models for inference in -PyTorch. The first is saving and loading the ``state_dict``, and the -second is saving and loading the entire model. - -Introduction ------------- -Saving the model’s ``state_dict`` with the ``torch.save()`` function -will give you the most flexibility for restoring the model later. This -is the recommended method for saving models, because it is only really -necessary to save the trained model’s learned parameters. -When saving and loading an entire model, you save the entire module -using Python’s -`pickle `__ module. Using -this approach yields the most intuitive syntax and involves the least -amount of code. The disadvantage of this approach is that the serialized -data is bound to the specific classes and the exact directory structure -used when the model is saved. The reason for this is because pickle does -not save the model class itself. Rather, it saves a path to the file -containing the class, which is used during load time. Because of this, -your code can break in various ways when used in other projects or after -refactors. -In this recipe, we will explore both ways on how to save and load models -for inference. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. 
- - -:: - - pip install torch - - -""" - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save and load the model via ``state_dict`` -# 5. Save and load the entire model -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -net = Net() -print(net) - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum. -# - -optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save and load the model via ``state_dict`` -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Let’s save and load our model using just ``state_dict``. 
-# - -# Specify a path -PATH = "state_dict_model.pt" - -# Save -torch.save(net.state_dict(), PATH) - -# Load -model = Net() -model.load_state_dict(torch.load(PATH)) -model.eval() - - -###################################################################### -# A common PyTorch convention is to save models using either a ``.pt`` or -# ``.pth`` file extension. -# -# Notice that the ``load_state_dict()`` function takes a dictionary -# object, NOT a path to a saved object. This means that you must -# deserialize the saved state_dict before you pass it to the -# ``load_state_dict()`` function. For example, you CANNOT load using -# ``model.load_state_dict(PATH)``. -# -# Remember too, that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# Failing to do this will yield inconsistent inference results. -# -# 5. Save and load entire model -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now let’s try the same thing with the entire model. -# - -# Specify a path -PATH = "entire_model.pt" - -# Save -torch.save(net, PATH) - -# Load -model = torch.load(PATH) -model.eval() - - -###################################################################### -# Again here, remember that you must call ``model.eval()`` to set dropout and -# batch normalization layers to evaluation mode before running inference. -# -# Congratulations! You have successfully saved and load models for -# inference in PyTorch. 
-# -# Learn More -# ---------- -# -# Take a look at these other recipes to continue your learning: -# -# - `Saving and loading a general checkpoint in PyTorch `__ -# - `Saving and loading multiple models in one file using PyTorch `__ diff --git a/recipes_source/recipes/saving_and_loading_models_for_inference.rst b/recipes_source/recipes/saving_and_loading_models_for_inference.rst new file mode 100644 index 00000000000..19e1405dd81 --- /dev/null +++ b/recipes_source/recipes/saving_and_loading_models_for_inference.rst @@ -0,0 +1,10 @@ +Saving And Loading Models For Inference +======================================= + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.py b/recipes_source/recipes/saving_multiple_models_in_one_file.py deleted file mode 100644 index aeff7803969..00000000000 --- a/recipes_source/recipes/saving_multiple_models_in_one_file.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Saving and loading multiple models in one file using PyTorch -============================================================ -Saving and loading multiple models can be helpful for reusing models -that you have previously trained. - -Introduction ------------- -When saving a model comprised of multiple ``torch.nn.Modules``, such as -a GAN, a sequence-to-sequence model, or an ensemble of models, you must -save a dictionary of each model’s state_dict and corresponding -optimizer. You can also save any other items that may aid you in -resuming training by simply appending them to the dictionary. -To load the models, first initialize the models and optimizers, then -load the dictionary locally using ``torch.load()``. From here, you can -easily access the saved items by simply querying the dictionary as you -would expect. 
-In this recipe, we will demonstrate how to save multiple models to one -file using PyTorch. - -Setup ------ -Before we begin, we need to install ``torch`` if it isn’t already -available. - -:: - - pip install torch - -""" - - - -###################################################################### -# Steps -# ----- -# -# 1. Import all necessary libraries for loading our data -# 2. Define and initialize the neural network -# 3. Initialize the optimizer -# 4. Save multiple models -# 5. Load multiple models -# -# 1. Import necessary libraries for loading our data -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For this recipe, we will use ``torch`` and its subsidiaries ``torch.nn`` -# and ``torch.optim``. -# - -import torch -import torch.nn as nn -import torch.optim as optim - - -###################################################################### -# 2. Define and initialize the neural network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# For sake of example, we will create a neural network for training -# images. To learn more see the Defining a Neural Network recipe. Build -# two variables for the models to eventually save. -# - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - -netA = Net() -netB = Net() - - -###################################################################### -# 3. Initialize the optimizer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We will use SGD with momentum to build an optimizer for each model we -# created. 
-# - -optimizerA = optim.SGD(netA.parameters(), lr=0.001, momentum=0.9) -optimizerB = optim.SGD(netB.parameters(), lr=0.001, momentum=0.9) - - -###################################################################### -# 4. Save multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Collect all relevant information and build your dictionary. -# - -# Specify a path to save to -PATH = "model.pt" - -torch.save({ - 'modelA_state_dict': netA.state_dict(), - 'modelB_state_dict': netB.state_dict(), - 'optimizerA_state_dict': optimizerA.state_dict(), - 'optimizerB_state_dict': optimizerB.state_dict(), - }, PATH) - - -###################################################################### -# 4. Load multiple models -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Remember to first initialize the models and optimizers, then load the -# dictionary locally. -# - -modelA = Net() -modelB = Net() -optimModelA = optim.SGD(modelA.parameters(), lr=0.001, momentum=0.9) -optimModelB = optim.SGD(modelB.parameters(), lr=0.001, momentum=0.9) - -checkpoint = torch.load(PATH) -modelA.load_state_dict(checkpoint['modelA_state_dict']) -modelB.load_state_dict(checkpoint['modelB_state_dict']) -optimizerA.load_state_dict(checkpoint['optimizerA_state_dict']) -optimizerB.load_state_dict(checkpoint['optimizerB_state_dict']) - -modelA.eval() -modelB.eval() -# - or - -modelA.train() -modelB.train() - - -###################################################################### -# You must call ``model.eval()`` to set dropout and batch normalization -# layers to evaluation mode before running inference. Failing to do this -# will yield inconsistent inference results. -# -# If you wish to resuming training, call ``model.train()`` to ensure these -# layers are in training mode. -# -# Congratulations! You have successfully saved and loaded multiple models -# in PyTorch. 
-# diff --git a/recipes_source/recipes/saving_multiple_models_in_one_file.rst b/recipes_source/recipes/saving_multiple_models_in_one_file.rst new file mode 100644 index 00000000000..33040e6c87b --- /dev/null +++ b/recipes_source/recipes/saving_multiple_models_in_one_file.rst @@ -0,0 +1,10 @@ +Saving Multiple Models In One File +================================== + +This tutorial was deprecated. There is a newer tutorial that covers the same topic: https://pytorch.org/tutorials/beginner/saving_loading_models.html + +Redirecting in 3 seconds... + +.. raw:: html + + diff --git a/recipes_source/recipes/swap_tensors.py b/recipes_source/recipes/swap_tensors.py new file mode 100644 index 00000000000..d3b90c6ebea --- /dev/null +++ b/recipes_source/recipes/swap_tensors.py @@ -0,0 +1,241 @@ +""" +Extension points in ``nn.Module`` for ``load_state_dict`` and tensor subclasses +=============================================================================== +**Author:** `Mikayla Gawarecki `_ + +This recipe introduces a new utility function ``torch.utils.swap_tensors`` +as well as two new extension points where it has been integrated in +``nn.Module``: + +* ``nn.Module.to()`` and related methods +* ``nn.Module.load_state_dict()`` + +.. note:: + This recipe requires PyTorch 2.3.0 or later. +""" + +############################################################################### +# ``torch.utils.swap_tensors`` +# ---------------------------- +# ``torch.utils.swap_tensors`` (hereafter referred to as ``swap_tensors``) is a +# utility function that takes in two Python tensors and swaps them. 
+ +import torch +import torch.nn as nn +t1 = torch.arange(2) +t2 = torch.arange(3) +print(f"Before swapping, t1: {t1}, t2: {t2}") +torch.utils.swap_tensors(t1, t2) +print(f"After swapping, t1: {t1}, t2: {t2}") + +################################################################################ +# More specifically, ``swap_tensors`` swaps the Python ``__class__``, ``__dict__`` +# and ``__slots__`` of the two tensors, as well as their associated ``at::Tensor``. +# +# +# Application to ``nn.Module`` +# ---------------------------- +# This utility is pertinent to ``nn.Module`` when a Python object outside +# of the module holds a reference to parameters of the module. If an ``nn.Module`` +# modifies any of its parameters out of place, the object holding references to +# the parameters will not see the change. A classic example of this is the +# optimizer, which holds a reference to the parameters of the ``nn.Module``. +# This leads to a silent correctness issue where the ``optimizer.step()`` will +# run without error but the weights of the ``nn.Module`` will not be updated. + +mod = torch.nn.Linear(1, 2, bias=False) +optimizer = torch.optim.SGD(mod.parameters()) +print(f"weight in mod: {mod.weight}") +print(f"weight in optimizer: {optimizer.param_groups[0]['params']}") +mod.weight = torch.nn.Parameter(2 * mod.weight) +print(f"weight in mod: {mod.weight}") +print(f"weight in optimizer: {optimizer.param_groups[0]['params']}") + +################################################################################ +# ``nn.Module.to()`` and related methods +# -------------------------------------- +# This includes methods that change the device of the module (such as ``nn.Module.cpu()``), +# methods that change the ``dtype`` of the module (such as ``nn.Module.float()``) +# as well as methods that allow the module to be materialized +# (such as ``nn.Module.to_empty()``). 
+# +# At first glance, it might be non-intuitive that these methods are able to +# modify the parameters of the module in-place. The existing approach has been +# to use a nasty hack dating back from the first days of PyTorch. +# +# Notably, the existing approach does not work in these cases: +# +# * when using ``__torch_dispatch__`` subclasses +# * when ``param`` and ``new_param`` do not have the same Python ``type()`` +# * For tensors with special C++ representations (such as sparse tensors and ``XLA`` tensors) +# +# In the following part of this recipe, we will define a toy ``__torch_dispatch__`` +# subclass ``MyQuantizedLinearWeight`` that represents quantized linear weights. +# This subclass will be used for illustration purposes throughout the rest of +# the tutorial. For brevity, we omit most of the ``__torch_dispatch__`` +# implementation. +aten = torch.ops.aten + +class MyQuantizedLinearWeight(torch.Tensor): + @staticmethod + def __new__(cls, elem, scale): + return torch.Tensor._make_wrapper_subclass( + cls, + elem.shape, + dtype=elem.dtype, + layout=elem.layout, + device=elem.device, + strides=elem.stride(), + storage_offset=elem.storage_offset()) + + def __init__(self, elem: torch.Tensor, scale: float): + self.elem = elem + self.scale = scale + + def __repr__(self): + return f"MyQuantizedLinearWeight({self.elem}, scale={self.scale})" + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + if func in (aten.detach.default, aten._to_copy.default): + new_elem = func(args[0].elem, *args[1:], **kwargs) + return cls(new_elem, args[0].scale) + # Implementations for certain ops would be added to ``OP_TABLE``. + # We omit this for brevity. 
+ OP_TABLE = dict() + if func in OP_TABLE: + return OP_TABLE[func](func, args, kwargs) + raise NotImplementedError(f"Unsupported function {func}") + +################################################################################# +# Let us create an ``nn.Linear`` layer of ``dtype`` ``torch.float32`` where the weight is +# a ``MyQuantizedLinearWeight`` and try to convert it to ``torch.bfloat16``. +# Observe that the weight's ``dtype`` changes as expected. However, the ``dtype`` +# of the subclass' payload (``elem``) does not change. + +m = nn.Linear(3, 5, dtype=torch.float32) +m.weight = torch.nn.Parameter(MyQuantizedLinearWeight(m.weight, 0.5)) +print(f"Before: id(m.weight)={id(m.weight)}, id(m.bias)={id(m.bias)}") +m.bfloat16() +print(f"After: id(m.weight)={id(m.weight)}, id(m.bias)={id(m.bias)}") +print(f"m.weight.dtype: {m.weight.dtype}") +print(f"m.weight.elem.dtype: {m.weight.elem.dtype}") +print(f"m.bias.dtype: {m.bias.dtype}") + +################################################################################ +# To this end, we introduce a global config +# ``torch.__future__.set_swap_module_params_on_conversion`` that will use +# ``swap_tensors`` to swap the parameters of the module while preserving +# references in place of ``.data`` setting. When this config is set, +# ``swap_tensors`` will be used during the conversion, which ensures that +# the ``dtype`` of the payload is properly converted. 
+ +torch.__future__.set_swap_module_params_on_conversion(True) +m = nn.Linear(3, 5, dtype=torch.float32) +m.weight = torch.nn.Parameter(MyQuantizedLinearWeight(m.weight, 0.5)) +print(f"Before: id(m.weight)={id(m.weight)}, id(m.bias)={id(m.bias)}") +m.bfloat16() +print(f"After: id(m.weight)={id(m.weight)}, id(m.bias)={id(m.bias)}") +print(f"m.weight.dtype: {m.weight.dtype}") +print(f"m.weight.elem.dtype: {m.weight.elem.dtype}") +print(f"m.bias.dtype: {m.bias.dtype}") +torch.__future__.set_swap_module_params_on_conversion(False) + +################################################################################ +# ``nn.Module.load_state_dict()`` +# -------------------------------- +# Depending on the value of the ``assign`` keyword argument passed +# to ``load_state_dict()``, there are two ways to load the ``state_dict``: +# +# * ``assign=False``: preserves the properties of ``module.param`` and only takes the values +# from ``state_dict['param_name']`` +# * ``assign=True``: preserves the properties and values of ``state_dict['param_name']``. +# +# +# Previously, these were implemented with in-place ``copy_`` and ``__setattr__`` respectively. +# With the existing implementation, each approach had its own limitations -- ``assign=False`` +# imposes the constraint that the type of the parameter in the ``state_dict`` must +# be the same as the type of the parameter in the module while ``assign=True`` imposes +# the constraint that anything that holds references to the module's parameters must +# be initialized after ``nn.Module.load_state_dict()``. +# +# Now, we address both constraints by adding a ``swap_tensors`` path to ``load_state_dict()`` +# and introducing a new extension point ``torch.Tensor.module_load(self, other, assign=False)``. +# When the ``swap_tensors`` path is enabled via the ``__future__`` mentioned above, +# we can use a ``__torch_function__`` handler for ``module_load`` to apply a +# custom transformation to the value in the ``state_dict``. 
The result of this +# transformation will be swapped with the parameter in the module. +# +# In the following example, we will use the ``MyQuantizedLinearWeight`` subclass +# defined above to illustrate how we can use these features to apply a +# custom quantization scheme to the weights of a linear layer when +# loading the ``state_dict``. +# +# Recall that the ``__torch_function__`` handler for ``module_load`` will be +# invoked if either ``self`` or ``other`` (in this case ``param`` or +# ``state_dict[param_key]``) are ``MyQuantizedLinearWeight`` subclasses. +# +# Assume that we expect the ``state_dict`` to contain plain tensors and the +# module to contain ``MyQuantizedLinearWeight`` parameters where we want the +# tensors in the ``state_dict`` to be transformed into the subclass. Then we +# can define a ``__torch_function__`` handler for ``torch.Tensor.module_load`` +# as such: + +@classmethod +def custom_torch_function(cls, func, types, args=(), kwargs=None): + kwargs = {} if kwargs is None else kwargs + + if func is torch.Tensor.module_load: + dest, src = args[0], args[1] + assert type(dest) == cls and type(src) == torch.Tensor + return MyQuantizedLinearWeight(src, dest.scale) + else: + with torch._C.DisableTorchFunctionSubclass(): + return func(*args, **kwargs) + +MyQuantizedLinearWeight.__torch_function__ = custom_torch_function + +################################################################################# +# First, let us create a skeleton of a model on the meta device to avoid +# materializing storages. We convert all weights in the modules to +# ``MyQuantizedLinearWeight`` subclasses while leaving biases intact. 
+ +def fn(m): + if isinstance(m, nn.Linear): + requires_grad = m.weight.requires_grad + m.weight = torch.nn.Parameter( + MyQuantizedLinearWeight(m.weight, 0.5), requires_grad=requires_grad + ) + +with torch.device("meta"): + m = nn.Linear(3, 5) + m.apply(fn) + +################################################################################# +# We can then load the ``state_dict``. Observe that we use ``assign=True`` because +# for biases, we want to preserve the properties of the tensor in the ``state_dict`` +# (for example, we do not want the bias to be on the ``meta`` device after loading). + +torch.__future__.set_swap_module_params_on_conversion(True) +print(f"Before: id(weight)={id(m.weight)}, id(bias)={id(m.bias)}") +print(f"m.state_dict() before load_state_dict():\n {m.state_dict()}") +state_dict = nn.Linear(3, 5).state_dict() +print(f"state_dict:\n {state_dict}") +m.load_state_dict(state_dict, assign=True) +print(f"After: id(weight)={id(m.weight)}, id(bias)={id(m.bias)}") +print(f"m.state_dict() after load_state_dict():\n {m.state_dict()}") + +################################################################################# +# The above is a toy example of how we can use the new extension point in +# ``nn.Module.load_state_dict()``. One can also imagine alternate scenarios such +# as when we have tensor subclasses in the ``state_dict`` and plain ``nn.Parameters``/ +# tensors in the module or when both are tensor subclasses. Based on the use +# case, we can define the ``__torch_function__`` handler for ``module_load`` +# to apply the transforms as needed. +# +# Conclusion +# ---------- +# In this recipe, we learned about ``swap_tensors``, the importance +# of preserving references for parameters in ``nn.Module`` as well as how to +# use the two new extension points that are gated by +# ``torch.__future__.set_swap_module_params_on_conversion``. 
diff --git a/recipes_source/recipes/tensorboard_with_pytorch.py b/recipes_source/recipes/tensorboard_with_pytorch.py index 3b9455b7f44..4bceda81eaf 100644 --- a/recipes_source/recipes/tensorboard_with_pytorch.py +++ b/recipes_source/recipes/tensorboard_with_pytorch.py @@ -13,14 +13,14 @@ directory. The following command will install PyTorch 1.4+ via Anaconda (recommended): -:: +.. code-block:: sh $ conda install pytorch torchvision -c pytorch or pip -:: +.. code-block:: sh $ pip install torch torchvision @@ -28,7 +28,7 @@ ###################################################################### # Using TensorBoard in PyTorch -# ----- +# ----------------------------- # # Let’s now try using TensorBoard with PyTorch! Before logging anything, # we need to create a ``SummaryWriter`` instance. @@ -45,7 +45,7 @@ ###################################################################### # Log scalars -# ----- +# ----------- # # In machine learning, it’s important to understand key metrics such as # loss and how they change during training. Scalar helps to save @@ -91,7 +91,7 @@ def train_model(iter): ###################################################################### # Run TensorBoard -# ----- +# ---------------- # # Install TensorBoard through the command line to visualize data you logged # @@ -121,44 +121,6 @@ def train_model(iter): # -###################################################################### -# Share TensorBoard dashboards -# ---------------------------- -# -# `TensorBoard.dev `_ lets you upload and share -# your ML experiment results with anyone. Use ``TensorBoard.dev`` to host, -# track, and share your TensorBoard dashboards. -# -# Install the latest version of TensorBoard to use the ``uploader``. -# -# .. code-block:: sh -# -# pip install tensorboard --upgrade -# -# Use a simple command to upload and share your TensorBoard. -# -# .. 
code-block:: sh -# -# tensorboard dev upload --logdir runs \ -# --name "My latest experiment" \ # optional -# --description "Simple comparison of several hyperparameters" # optional -# -# For help, run ``$ tensorboard dev --help``. -# -# **Note:** Uploaded TensorBoards are public and visible to everyone. -# Do not upload sensitive data. -# -# View your TensorBoard live at URL provided in your terminal. -# For example: `https://tensorboard.dev/experiment/AdYd1TgeTlaLWXx6I8JUbA `_ -# -# -# .. image:: ../../_static/img/thumbnails/tensorboard_dev.png -# :scale: 40 % -# -# -# .. note:: -# ``TensorBoard.dev`` currently supports scalars, graphs, histograms, distributions, ``hparams``, and text dashboards. - ######################################################################## # Learn More # ---------------------------- diff --git a/recipes_source/recipes/timer_quick_start.py b/recipes_source/recipes/timer_quick_start.py index b93e13dcbd2..d6b79e094c7 100644 --- a/recipes_source/recipes/timer_quick_start.py +++ b/recipes_source/recipes/timer_quick_start.py @@ -46,9 +46,7 @@ """, # Alternatively, ``globals`` can be used to pass variables from the outer scope. - # ------------------------------------------------------------------------- - # :: - # + # # globals={ # "x": torch.ones((128,)), # "y": torch.ones((128,)), @@ -176,14 +174,14 @@ # One generally doesn't care about absolute path. For instance, the full path # and function name for a multiply call is something like: # -# :: +# .. code-block:: sh # # /the/prefix/to/your/pytorch/install/dir/pytorch/build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const [/the/path/to/your/conda/install/miniconda3/envs/ab_ref/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so] # # when in reality, all of the information that we're interested in can be # represented in: # -# :: +# .. 
code-block:: sh # # build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const # diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index dd615714a24..dc1daae2584 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -94,35 +94,36 @@ # ``optimizer.zero_grad(set_to_none=True)``. ############################################################################### -# Fuse pointwise operations +# Fuse operations # ~~~~~~~~~~~~~~~~~~~~~~~~~ -# Pointwise operations (elementwise addition, multiplication, math functions - -# ``sin()``, ``cos()``, ``sigmoid()`` etc.) can be fused into a single kernel -# to amortize memory access time and kernel launch time. -# -# `PyTorch JIT `_ can fuse kernels -# automatically, although there could be additional fusion opportunities not yet -# implemented in the compiler, and not all device types are supported equally. -# -# Pointwise operations are memory-bound, for each operation PyTorch launches a -# separate kernel. Each kernel loads data from the memory, performs computation -# (this step is usually inexpensive) and stores results back into the memory. -# -# Fused operator launches only one kernel for multiple fused pointwise ops and -# loads/stores data only once to the memory. This makes JIT very useful for -# activation functions, optimizers, custom RNN cells etc. +# Pointwise operations such as elementwise addition, multiplication, and math +# functions like ``sin()``, ``cos()``, ``sigmoid()``, etc., can be combined into a +# single kernel. This fusion helps reduce memory access and kernel launch times. +# Typically, pointwise operations are memory-bound; PyTorch eager mode launches +# a separate kernel for each operation, which involves loading data from memory, +# executing the operation (often not the most time-consuming step), and writing +# the results back to memory. 
+# +# By using a fused operator, only one kernel is launched for multiple pointwise +# operations, and data is loaded and stored just once. This efficiency is +# particularly beneficial for activation functions, optimizers, and custom RNN cells etc. +# +# PyTorch 2 introduces a compile-mode facilitated by TorchInductor, an underlying compiler +# that automatically fuses kernels. TorchInductor extends its capabilities beyond simple +# element-wise operations, enabling advanced fusion of eligible pointwise and reduction +# operations for optimized performance. # # In the simplest case fusion can be enabled by applying -# `torch.jit.script `_ +# `torch.compile `_ # decorator to the function definition, for example: -@torch.jit.script -def fused_gelu(x): +@torch.compile +def gelu(x): return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) ############################################################################### # Refer to -# `TorchScript documentation `_ +# `Introduction to torch.compile `_ # for more advanced use cases. ############################################################################### @@ -194,14 +195,14 @@ def fused_gelu(x): # numactl --cpunodebind=N --membind=N python ############################################################################### -# More detailed descriptions can be found `here `_. +# More detailed descriptions can be found `here `_. ############################################################################### # Utilize OpenMP # ~~~~~~~~~~~~~~ # OpenMP is utilized to bring better performance for parallel computation tasks. # ``OMP_NUM_THREADS`` is the easiest switch that can be used to accelerate computations. It determines number of threads used for OpenMP computations. -# CPU affinity setting controls how workloads are distributed over multiple cores. It affects communication overhead, cache line invalidation overhead, or page thrashing, thus proper setting of CPU affinity brings performance benefits. 
``GOMP_CPU_AFFINITY`` or ``KMP_AFFINITY`` determines how to bind OpenMP* threads to physical processing units. Detailed information can be found `here `_. +# CPU affinity setting controls how workloads are distributed over multiple cores. It affects communication overhead, cache line invalidation overhead, or page thrashing, thus proper setting of CPU affinity brings performance benefits. ``GOMP_CPU_AFFINITY`` or ``KMP_AFFINITY`` determines how to bind OpenMP* threads to physical processing units. Detailed information can be found `here `_. ############################################################################### # With the following command, PyTorch run the task on N OpenMP threads. @@ -212,6 +213,7 @@ def fused_gelu(x): ############################################################################### # Typically, the following environment variables are used to set for CPU affinity with GNU OpenMP implementation. ``OMP_PROC_BIND`` specifies whether threads may be moved between processors. Setting it to CLOSE keeps OpenMP threads close to the primary thread in contiguous place partitions. ``OMP_SCHEDULE`` determines how OpenMP threads are scheduled. ``GOMP_CPU_AFFINITY`` binds threads to specific CPUs. +# An important tuning parameter is core pinning, which prevents threads from migrating between multiple CPUs, enhancing data locality and minimizing inter-core communication. # # .. code-block:: sh # @@ -286,7 +288,7 @@ def fused_gelu(x): traced_model(*sample_input) ############################################################################### -# While the JIT fuser for oneDNN Graph also supports inference with ``BFloat16`` datatype, +# While the JIT fuser for oneDNN Graph also supports inference with ``BFloat16`` datatype, # performance benefit with oneDNN Graph is only exhibited by machines with AVX512_BF16 # instruction set architecture (ISA). 
# The following code snippets serves as an example of using ``BFloat16`` datatype for inference with oneDNN Graph: @@ -317,6 +319,37 @@ def fused_gelu(x): # GPU specific optimizations # -------------------------- +############################################################################### +# Enable Tensor cores +# ~~~~~~~~~~~~~~~~~~~~~~~ +# Tensor cores are specialized hardware designed to compute matrix-matrix multiplication +# operations, primarily utilized in deep learning and AI workloads. Tensor cores have +# specific precision requirements which can be adjusted manually or via the Automatic +# Mixed Precision API. +# +# In particular, tensor operations can take advantage of lower-precision workloads, +# which can be controlled via ``torch.set_float32_matmul_precision``. +# The default format is set to ``'highest'``, which utilizes the tensor data type. +# However, PyTorch offers alternative precision settings: ``'high'`` and ``'medium'``. +# These options prioritize computational speed over numerical precision. + +############################################################################### +# Use CUDA Graphs +# ~~~~~~~~~~~~~~~~~~~~~~~ +# When using a GPU, work must first be launched from the CPU, and +# in some cases the context switch between CPU and GPU can lead to bad resource +# utilization. CUDA graphs are a way to keep computation within the GPU without +# paying the extra cost of kernel launches and host synchronization. + +# It can be enabled using (``mode`` is keyword-only in ``torch.compile``) +torch.compile(m, mode="reduce-overhead") +# or +torch.compile(m, mode="max-autotune") + +############################################################################### +# Support for CUDA graphs is in development; their usage can incur increased +# device memory consumption, and some models might not compile. 
+ ############################################################################### # Enable cuDNN auto-tuner # ~~~~~~~~~~~~~~~~~~~~~~~ @@ -464,8 +497,8 @@ def fused_gelu(x): # perform the required gradient all-reduce. ############################################################################### -# Match the order of layers in constructors and during the execution if using ``DistributedDataParallel``(find_unused_parameters=True) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Match the order of layers in constructors and during the execution if using ``DistributedDataParallel(find_unused_parameters=True)`` +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # `torch.nn.parallel.DistributedDataParallel `_ # with ``find_unused_parameters=True`` uses the order of layers and parameters # from model constructors to build buckets for ``DistributedDataParallel`` diff --git a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py index c04ae7d4be4..a0752bfc67d 100644 --- a/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py +++ b/recipes_source/recipes/warmstarting_model_using_parameters_from_a_different_model.py @@ -21,7 +21,7 @@ Before we begin, we need to install ``torch`` if it isn’t already available. -:: +.. code-block:: sh pip install torch @@ -124,7 +124,7 @@ def forward(self, x): # are loading into. 
# -netB.load_state_dict(torch.load(PATH), strict=False) +netB.load_state_dict(torch.load(PATH, weights_only=True), strict=False) ###################################################################### diff --git a/recipes_source/recipes/what_is_state_dict.py b/recipes_source/recipes/what_is_state_dict.py index 838d0c0d4ff..bd9b1d31b62 100644 --- a/recipes_source/recipes/what_is_state_dict.py +++ b/recipes_source/recipes/what_is_state_dict.py @@ -26,7 +26,7 @@ Before we begin, we need to install ``torch`` if it isn’t already available. -:: +.. code-block:: sh pip install torch @@ -52,6 +52,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F import torch.optim as optim diff --git a/recipes_source/recipes/zeroing_out_gradients.py b/recipes_source/recipes/zeroing_out_gradients.py index b3c25654d93..0914edbf558 100644 --- a/recipes_source/recipes/zeroing_out_gradients.py +++ b/recipes_source/recipes/zeroing_out_gradients.py @@ -33,7 +33,7 @@ Before we begin, we need to install ``torch`` and ``torchvision`` if they aren’t already available. -:: +.. code-block:: sh pip install torchvision @@ -44,23 +44,23 @@ ###################################################################### # Steps # ----- -# +# # Steps 1 through 4 set up our data and neural network for training. The # process of zeroing out the gradients happens in step 5. If you already # have your data and neural network built, skip to 5. -# +# # 1. Import all necessary libraries for loading our data # 2. Load and normalize the dataset # 3. Build the neural network # 4. Define the loss function # 5. Zero the gradients while training the network -# +# # 1. Import necessary libraries for loading our data # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # For this recipe, we will just be using ``torch`` and ``torchvision`` to # access the dataset. -# +# import torch @@ -76,10 +76,10 @@ ###################################################################### # 2. 
Load and normalize the dataset # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # PyTorch features various built-in datasets (see the Loading Data recipe # for more information). -# +# transform = transforms.Compose( [transforms.ToTensor(), @@ -102,10 +102,10 @@ ###################################################################### # 3. Build the neural network # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # We will use a convolutional neural network. To learn more see the # Defining a Neural Network recipe. -# +# class Net(nn.Module): def __init__(self): @@ -130,9 +130,9 @@ def forward(self, x): ###################################################################### # 4. Define a Loss function and optimizer # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # Let’s use a Classification Cross-Entropy loss and SGD with momentum. -# +# net = Net() criterion = nn.CrossEntropyLoss() @@ -142,14 +142,14 @@ def forward(self, x): ###################################################################### # 5. Zero the gradients while training the network # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# +# # This is when things start to get interesting. We simply have to loop # over our data iterator, and feed the inputs to the network and optimize. -# +# # Notice that for each entity of data, we zero out the gradients. This is # to ensure that we aren’t tracking any unnecessary information when we # train our neural network. -# +# for epoch in range(2): # loop over the dataset multiple times @@ -181,13 +181,13 @@ def forward(self, x): # You can also use ``model.zero_grad()``. This is the same as using # ``optimizer.zero_grad()`` as long as all your model parameters are in # that optimizer. Use your best judgment to decide which one to use. -# +# # Congratulations! You have successfully zeroed out gradients PyTorch. 
-# +# # Learn More # ---------- -# +# # Take a look at these other recipes to continue your learning: -# -# - `Loading data in PyTorch `__ +# +# - `Loading data in PyTorch `__ # - `Saving and loading models across devices in PyTorch `__ diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 10a6ca3fe33..b841d9ee759 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -30,14 +30,6 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu .. Basics -.. customcarditem:: - :header: Loading data in PyTorch - :card_description: Learn how to use PyTorch packages to prepare and load common datasets for your model. - :image: ../_static/img/thumbnails/cropped/loading-data.PNG - :link: ../recipes/recipes/loading_data_recipe.html - :tags: Basics - - .. customcarditem:: :header: Defining a Neural Network :card_description: Learn how to use PyTorch's torch.nn package to create and define a neural network for the MNIST dataset. @@ -144,6 +136,34 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/recipes/module_load_state_dict_tips.html :tags: Basics +.. customcarditem:: + :header: (beta) Using TORCH_LOGS to observe torch.compile + :card_description: Learn how to use the torch logging APIs to observe the compilation process. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_logs.html + :tags: Basics + +.. customcarditem:: + :header: Extension points in nn.Module for loading state_dict and tensor subclasses + :card_description: New extension points in nn.Module. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/recipes/swap_tensors.html + :tags: Basics + +.. customcarditem:: + :header: torch.export AOTInductor Tutorial for Python runtime + :card_description: Learn an end-to-end example of how to use AOTInductor for python runtime. 
+ :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_export_aoti_python.html + :tags: Basics + +.. customcarditem:: + :header: Demonstration of torch.export flow, common challenges and the solutions to address them + :card_description: Learn how to export models for popular usecases + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_export_challenges_solutions.html + :tags: Compiler,TorchCompile + .. Interpretability .. customcarditem:: @@ -267,6 +287,20 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/recipes/tuning_guide.html :tags: Model-Optimization +.. customcarditem:: + :header: Optimizing CPU Performance on Intel® Xeon® with run_cpu Script + :card_description: How to use run_cpu script for optimal runtime configurations on Intel® Xeon CPUs. + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/xeon_run_cpu.html + :tags: Model-Optimization + +.. customcarditem:: + :header: PyTorch Inference Performance Tuning on AWS Graviton Processors + :card_description: Tips for achieving the best inference performance on AWS Graviton CPUs + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/inference_tuning_on_aws_graviton.html + :tags: Model-Optimization + .. Leverage Advanced Matrix Extensions .. customcarditem:: @@ -276,6 +310,51 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/amx.html :tags: Model-Optimization +.. (beta) Compiling the Optimizer with torch.compile + +.. customcarditem:: + :header: (beta) Compiling the Optimizer with torch.compile + :card_description: Speed up the optimizer using torch.compile + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/compiling_optimizer.html + :tags: Model-Optimization + +.. (beta) Running the compiled optimizer with an LR Scheduler + +.. 
customcarditem:: + :header: (beta) Running the compiled optimizer with an LR Scheduler + :card_description: Speed up training with LRScheduler and torch.compiled optimizer + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/compiling_optimizer_lr_scheduler.html + :tags: Model-Optimization + +.. Using User-Defined Triton Kernels with ``torch.compile`` + +.. customcarditem:: + :header: Using User-Defined Triton Kernels with ``torch.compile`` + :card_description: Learn how to use user-defined kernels with ``torch.compile`` + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_compile_user_defined_triton_kernel_tutorial.html + :tags: Model-Optimization + +.. Compile Time Caching in ``torch.compile`` + +.. customcarditem:: + :header: Compile Time Caching in ``torch.compile`` + :card_description: Learn how to configure compile time caching in ``torch.compile`` + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torch_compile_caching_tutorial.html + :tags: Model-Optimization + +.. Reducing Cold Start Compilation Time with Regional Compilation + +.. customcarditem:: + :header: Reducing torch.compile cold start compilation time with regional compilation + :card_description: Learn how to use regional compilation to control cold start compile time + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/regional_compilation.html + :tags: Model-Optimization + .. Intel(R) Extension for PyTorch* .. customcarditem:: @@ -296,6 +375,13 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu .. Distributed Training +.. customcarditem:: + :header: Getting Started with DeviceMesh + :card_description: Learn how to use DeviceMesh + :image: ../_static/img/thumbnails/cropped/profiler.png + :link: ../recipes/distributed_device_mesh.html + :tags: Distributed-Training + .. 
customcarditem:: :header: Shard Optimizer States with ZeroRedundancyOptimizer :card_description: How to use ZeroRedundancyOptimizer to reduce memory consumption. @@ -317,13 +403,36 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/distributed_optim_torchscript.html :tags: Distributed-Training,TorchScript - .. customcarditem:: +.. customcarditem:: :header: Getting Started with Distributed Checkpoint (DCP) :card_description: Learn how to checkpoint distributed models with Distributed Checkpoint package. :image: ../_static/img/thumbnails/cropped/Getting-Started-with-DCP.png - :link: ../recipes/DCP_tutorial.html + :link: ../recipes/distributed_checkpoint_recipe.html :tags: Distributed-Training +.. customcarditem:: + :header: Asynchronous Checkpointing (DCP) + :card_description: Learn how to checkpoint distributed models asynchronously with the Distributed Checkpoint package. + :image: ../_static/img/thumbnails/cropped/Getting-Started-with-DCP.png + :link: ../recipes/distributed_async_checkpoint_recipe.html + :tags: Distributed-Training + +.. customcarditem:: + :header: Getting Started with CommDebugMode + :card_description: Learn how to use CommDebugMode for DTensors + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/distributed_comm_debug_mode.html + :tags: Distributed-Training + +.. TorchServe + +.. customcarditem:: + :header: Deploying a PyTorch Stable Diffusion model as a Vertex AI Endpoint + :card_description: Learn how to deploy a model in Vertex AI with TorchServe + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/torchserve_vertexai_tutorial.html + :tags: Production + .. End of tutorial card section .. raw:: html @@ -342,8 +451,8 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu ..
toctree:: :hidden: - /recipes/recipes/loading_data_recipe /recipes/recipes/defining_a_neural_network + /recipes/torch_logs /recipes/recipes/what_is_state_dict /recipes/recipes/saving_and_loading_models_for_inference /recipes/recipes/saving_and_loading_a_general_checkpoint @@ -358,7 +467,9 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/dynamic_quantization /recipes/recipes/amp_recipe /recipes/recipes/tuning_guide + /recipes/recipes/xeon_run_cpu /recipes/recipes/intel_extension_for_pytorch + /recipes/compiling_optimizer /recipes/torch_compile_backend_ipex /recipes/torchscript_inference /recipes/deployment_with_flask @@ -367,3 +478,5 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/cuda_rpc /recipes/distributed_optim_torchscript /recipes/mobile_interpreter + /recipes/distributed_comm_debug_mode + /recipes/torch_export_challenges_solutions diff --git a/recipes_source/regional_compilation.py b/recipes_source/regional_compilation.py new file mode 100644 index 00000000000..0a665e04b52 --- /dev/null +++ b/recipes_source/regional_compilation.py @@ -0,0 +1,178 @@ +""" +Reducing torch.compile cold start compilation time with regional compilation +============================================================================ + +**Author:** `Animesh Jain `_ + +As deep learning models get larger, the compilation time of these models also +increases. This extended compilation time can result in a large startup time in +inference services or wasted resources in large-scale training. This recipe +shows an example of how to reduce the cold start compilation time by choosing to +compile a repeated region of the model instead of the entire model. + +Prerequisites +---------------- + +* PyTorch 2.5 or later + +Setup +----- +Before we begin, we need to install ``torch`` if it is not already +available. + +.. code-block:: sh + + pip install torch + +..
note:: + This feature is available starting with the 2.5 release. If you are using version 2.4, + you can enable the configuration flag ``torch._dynamo.config.inline_inbuilt_nn_modules=True`` + to prevent recompilations during regional compilation. In version 2.5, this flag is enabled by default. +""" + +from time import perf_counter + +###################################################################### +# Steps +# ----- +# +# In this recipe, we will follow these steps: +# +# 1. Import all necessary libraries. +# 2. Define and initialize a neural network with repeated regions. +# 3. Understand the difference between the full model and the regional compilation. +# 4. Measure the compilation time of the full model and the regional compilation. +# +# First, let's import the necessary libraries: +# +# +# + +import torch +import torch.nn as nn + + +########################################################## +# Next, let's define and initialize a neural network with repeated regions. +# +# Typically, neural networks are composed of repeated layers. For example, a +# large language model is composed of many Transformer blocks. In this recipe, +# we will create a ``Layer`` using the ``nn.Module`` class as a proxy for a repeated region. +# We will then create a ``Model`` which is composed of 64 instances of this +# ``Layer`` class. +# +class Layer(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(10, 10) + self.relu1 = torch.nn.ReLU() + self.linear2 = torch.nn.Linear(10, 10) + self.relu2 = torch.nn.ReLU() + + def forward(self, x): + a = self.linear1(x) + a = self.relu1(a) + a = torch.sigmoid(a) + b = self.linear2(a) + b = self.relu2(b) + return b + + +class Model(torch.nn.Module): + def __init__(self, apply_regional_compilation): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + # Apply compile only to the repeated layers.
+ if apply_regional_compilation: + self.layers = torch.nn.ModuleList( + [torch.compile(Layer()) for _ in range(64)] + ) + else: + self.layers = torch.nn.ModuleList([Layer() for _ in range(64)]) + + def forward(self, x): + # In regional compilation, the self.linear is outside of the scope of `torch.compile`. + x = self.linear(x) + for layer in self.layers: + x = layer(x) + return x + + +#################################################### +# Next, let's review the difference between the full model and the regional compilation. +# +# In full model compilation, the entire model is compiled as a whole. This is the common approach +# most users take with ``torch.compile``. In this example, we apply ``torch.compile`` to +# the ``Model`` object. This will effectively inline the 64 layers, producing a +# large graph to compile. You can look at the full graph by running this recipe +# with ``TORCH_LOGS=graph_code``. +# +# + +model = Model(apply_regional_compilation=False).cuda() +full_compiled_model = torch.compile(model) + + +################################################### +# The regional compilation, on the other hand, compiles a region of the model. +# By strategically choosing to compile a repeated region of the model, we can compile a +# much smaller graph and then reuse the compiled graph for all the regions. +# In the example, ``torch.compile`` is applied only to the ``layers`` and not the full model. +# + +regional_compiled_model = Model(apply_regional_compilation=True).cuda() + +##################################################### +# Applying compilation to a repeated region, instead of full model, leads to +# large savings in compile time. Here, we will just compile a layer instance and +# then reuse it 64 times in the ``Model`` object. +# +# Note that with repeated regions, some part of the model might not be compiled. +# For example, the ``self.linear`` in the ``Model`` is outside of the scope of +# regional compilation. 
+# +# Also, note that there is a tradeoff between performance speedup and compile +# time. Full model compilation involves a larger graph and, +# theoretically, offers more scope for optimizations. However, for practical +# purposes and depending on the model, we have observed many cases with minimal +# speedup differences between the full model and regional compilation. + + +################################################### +# Next, let's measure the compilation time of the full model and the regional compilation. +# +# ``torch.compile`` is a JIT compiler, which means that it compiles on the first invocation. +# In the code below, we measure the total time spent in the first invocation. While this method is not +# precise, it provides a good estimate since the majority of the time is spent in +# compilation. + + +def measure_latency(fn, input): + # Reset the compiler caches to ensure no reuse between different runs + torch.compiler.reset() + with torch._inductor.utils.fresh_inductor_cache(): + start = perf_counter() + fn(input) + torch.cuda.synchronize() + end = perf_counter() + return end - start + + +input = torch.randn(10, 10, device="cuda") +full_model_compilation_latency = measure_latency(full_compiled_model, input) +print(f"Full model compilation time = {full_model_compilation_latency:.2f} seconds") + +regional_compilation_latency = measure_latency(regional_compiled_model, input) +print(f"Regional compilation time = {regional_compilation_latency:.2f} seconds") + +assert regional_compilation_latency < full_model_compilation_latency + +############################################################################ +# Conclusion +# ----------- +# +# This recipe shows how to control the cold start compilation time if your model +# has repeated regions. This approach requires user modifications to apply ``torch.compile`` to +# the repeated regions instead of the more commonly used full model compilation.
We +# are continually working on reducing cold start compilation time. +# diff --git a/recipes_source/script_optimized.rst b/recipes_source/script_optimized.rst index f4384b1a3ae..ed64419ff41 100644 --- a/recipes_source/script_optimized.rst +++ b/recipes_source/script_optimized.rst @@ -1,218 +1,11 @@ Script and Optimize for Mobile Recipe ===================================== -This recipe demonstrates how to convert a PyTorch model to TorchScript which can run in a high-performance C++ environment such as iOS and Android, and how to optimize the converted TorchScript model for mobile deployment. +This tutorial has been deprecated. There is a new tutorial on this topic. -Introduction ------------- +Redirecting in 3 seconds... -After a PyTorch model is trained and optionally but preferably quantized (see `Quantization Recipe `_ for more details), one essential step before the model can be used in iOS and Android apps is to convert the Python-dependent model to TorchScript, which can then further be optimized for mobile apps. Conversion to TorchScript can be as simple as a single call, or as complicated as changing the original model in many different places. +.. raw:: html -Pre-requisites --------------- + -PyTorch 1.6.0 or 1.7.0 - -Conversion to TorchScript -------------------------- - -There are two basic ways to convert a PyTorch model to TorchScript, using `trace` and `script`. Mixing `trace` and `script` may also be needed in some cases - see `here `_ for more information. - -Use the `trace` Method -^^^^^^^^^^^^^^^^^^^^^^ - -To use the `trace` method on a model, an example or dummy input for the model needs to be specified, the actual input size needs to be the same as the example input size, and the model definition cannot have control flow such as `if` or `for`. 
The reason for these constraints is that running `trace` on a model with an example input simply calls the model's `forward` method with the input and all operations executed in the model layers are recorded, creating the trace of the model. - -:: - - import torch - - dummy_input = torch.rand(1, 3, 224, 224) - torchscript_model = torch.jit.trace(model_quantized, dummy_input) - - -Use the `script` Method -^^^^^^^^^^^^^^^^^^^^^^^ - -For the example above, calling `script` below makes no difference: - -:: - - torchscript_model = torch.jit.script(model_quantized) - -But if a model has some flow control, then `trace` won't correctly record all the possible traces. Take some code snippet of an example model definition from `here `_ for example: - -:: - - import torch - - class MyDecisionGate(torch.nn.Module): - def forward(self, x): - if x.sum() > 0: - return x - else: - return -x - - x = torch.rand(3, 4) - traced_cell = torch.jit.trace(MyDecisionGate(), x) - print(traced_cell.code) - -The code above will output: - -:: - - TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can''t record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! - - if x.sum() > 0: - def forward(self, - x: Tensor) -> Tensor: - return x - - -Note that "the trace might not generalize to other inputs" warning above means that if the model has any kind of data-dependent control flow, `trace` is not the right answer. 
But if we replace the last two lines of the Python code snippet above (before the code output) with: - -:: - - scripted_cell = torch.jit.script(MyDecisionGate()) - print(scripted_cell.code) - -The scripted model as shown by the `print` result below will be covering all possible inputs, thus generalizing to other inputs: - -:: - - def forward(self, - x: Tensor) -> Tensor: - _0 = bool(torch.gt(torch.sum(x, dtype=None), 0)) - if _0: - _1 = x - else: - _1 = torch.neg(x) - return _1 - - -This is another example of using `trace` and `script` - it converts the model trained in the PyTorch tutorial `NLP FROM SCRATCH: TRANSLATION WITH A SEQUENCE TO SEQUENCE NETWORK AND ATTENTION `_: - -:: - - encoder = EncoderRNN(input_lang.n_words, hidden_size) - decoder = AttnDecoderRNN(hidden_size, output_lang.n_words) - - # method 1: using trace with example inputs - - encoder_input=torch.tensor([1]) - encoder_hidden=torch.zeros(1, 1, hidden_size) - - decoder_input1=torch.tensor([[0]]) - decoder_input2=torch.zeros(1, 1, hidden_size) - decoder_input3=torch.zeros(MAX_LENGTH, hidden_size) - - traced_encoder = torch.jit.trace(encoder, (encoder_input, encoder_hidden)) - traced_decoder = torch.jit.trace(decoder, (decoder_input1, decoder_input2, decoder_input3)) - - # method 2: using script - - scripted_encoder = torch.jit.script(encoder) - scripted_decoder = torch.jit.script(decoder) - -So is it true that one can simply always use the `script` call and the model is converted to TorchScript? The answer is no, because TorchScript is actually a subset of Python and to make `script` work, the PyTorch model definition must only use the language features of that TorchScript subset of Python. `TorchScript Language Reference `_ covers all the details of what is supported in TorchScript. Below we will describe some of the common errors when using the `script` method. 
- - -Fix Common Errors When Using the `script` Method ----------------------------------------------------- - -If you apply the `script` method to a non-trivial model, chances are you may encounter several types of errors. Check out `this tutorial `_ for a complete example of converting a chatbot model to TorchScript. But follow the steps below to fix common errors when you run the `script` method: - -1. RuntimeError `attribute lookup is not defined on python value of type` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For this error, pass the value of the model as a parameter in the constructor. This is because when calling `script` on a model that accepts another model as a parameter, the model passed is actually of type `TracedModule` or `ScriptModule`, not of type `Module`, making the the model attribute not defined when scripting. - -For example, the `LuongAttnDecoderRNN` module in the tutorial above has an attribute `n_layers`, and the `GreedySearchDecoder` module refers to the `n_layers` attribute of a `decoder` instance of the `LuongAttnDecoderRNN` module, so in order to make `script` work, the `GreedySearchDecoder` module's constructor needs to be changed from: - -:: - - def __init__(self, encoder, decoder): - -to: - -:: - - def __init__(self, encoder, decoder, decoder_n_layers): - ... - self._decoder_n_layers = decoder_n_layers - - -and the `GreedySearchDecoder`'s `forward` method needs to refer `self._decoder_n_layers` instead of `decoder.n_layers`. - -2. RuntimeError `python value of type '...' cannot be used as a value.` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The complete error message for this one continues with `Perhaps it is a closed over global variable? 
If so, please consider passing it in as an argument or use a local variable instead.`, store global variables' values as attributes in the model constructor (there's no need to add them to a special list called `__constants__`). The reason is that global values can be used conveniently in normal model training and inference, but the global values are not accessible during the scripting. - -For example, `device` and `SOS_token` are global variables, and to make `script` work, they need to be added to the `GreedySearchDecoder`'s constructor: - -:: - - self._device = device - self._SOS_token = SOS_token - -and referred to as `self._device` and `self._SOS_token` instead of `device` and `SOS_token` in the `GreedySearchDecoder`'s `forward` method. - -3. RuntimeError `all inputs of range must be '...', found Tensor (inferred) in argument` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The error message continues with: `add type definitions for each of the module's forward method arguments. Because all parameters to a TorchScript function are of the `torch.Tensor` type by default, you need to specifically declare the type for each parameter that is not of type 'Tensor'. For a complete list of TorchScript-supported types, see `here `_. - -For example, the `GreedySearchDecoder`'s `forward` method signature needs to be changed from: - -:: - - def forward(self, input_seq, input_length, max_length): - -to: - -:: - - def forward(self, input_seq, input_length, max_length : int): - -After using the `trace` or `script` method above, and fixing possible errors, you should have a TorchScript model ready to be optimized for mobile. 
- - -Optimize a TorchScript Model --------------------------------------- - -Simply run the following code snippet to optimize a TorchScript model generated with the `trace` and/or `script` method: - -:: - - from torch.utils.mobile_optimizer import optimize_for_mobile - optimized_torchscript_model = optimize_for_mobile(torchscript_model) - -The optimized model can then be saved and deployed in mobile apps: - -:: - - optimized_torchscript_model.save("optimized_torchscript_model.pth") - -By default, for the CPU backend, `optimize_for_mobile` performs the following types of optimizations: - -* `Conv2D and BatchNorm fusion` which folds Conv2d-BatchNorm2d into Conv2d; - -* `Insert and fold prepacked ops` which rewrites the model graph to replace 2D convolutions and linear ops with their prepacked counterparts. - -* `ReLU and hardtanh fusion` which rewrites graph by finding ReLU/hardtanh ops and fuses them together. - -* `Dropout removal` which removes dropout nodes from this module when training is false. - -* `Conv packed params hoisting` which moves convolution packed params to the root module, so that the convolution structs can be deleted. This decreases model size without impacting numerics. - -For the Vulkan backend,`optimize_for_mobile` performs the following type of optimization: - -* `Automatic GPU transfer` which rewrites the graph so that moving input and output data to and from the GPU becomes part of the model. - -Optimization types can be disabled by passing an optimization blocklist as an argument to `optimize_for_mobile`. - -Learn More ------------------ -1. The official `TorchScript Language Reference `_. -2. The `torch.utils.mobile_optimizer` `API documentation `_. 
diff --git a/recipes_source/torch_compile_backend_ipex.rst b/recipes_source/torch_compile_backend_ipex.rst index 8d38a689b88..58a53b525a0 100644 --- a/recipes_source/torch_compile_backend_ipex.rst +++ b/recipes_source/torch_compile_backend_ipex.rst @@ -1,7 +1,7 @@ -Intel® Extension for PyTorch* Backend -===================================== +Intel® Extension for PyTorch* Backend on Intel® CPUs +==================================================== -To work better with `torch.compile`, Intel® Extension for PyTorch* implements a backend ``ipex``. +To work better with `torch.compile` on Intel® CPUs, Intel® Extension for PyTorch* implements a backend ``ipex``. It targets to improve hardware resource usage efficiency on Intel platforms for better performance. The `ipex` backend is implemented with further customizations designed in Intel® Extension for PyTorch* for the model compilation. diff --git a/recipes_source/torch_compile_caching_tutorial.rst b/recipes_source/torch_compile_caching_tutorial.rst new file mode 100644 index 00000000000..3c024828f9f --- /dev/null +++ b/recipes_source/torch_compile_caching_tutorial.rst @@ -0,0 +1,76 @@ +Compile Time Caching in ``torch.compile`` +========================================================= +**Authors:** `Oguz Ulgen `_ and `Sam Larsen `_ + +Introduction +------------------ + +PyTorch Inductor implements several caches to reduce compilation latency. +This recipe demonstrates how you can configure various parts of the caching in ``torch.compile``. + +Prerequisites +------------------- + +Before starting this recipe, make sure that you have the following: + +* Basic understanding of ``torch.compile``. See: + + * `torch.compiler API documentation `__ + * `Introduction to torch.compile `__ + +* PyTorch 2.4 or later + +Inductor Cache Settings +---------------------------- + +Most of these caches are in-memory, only used within the same process, and are transparent to the user. 
The exceptions are the caches that store compiled FX graphs (FXGraphCache, AOTAutogradCache). These caches allow Inductor to avoid recompilation across process boundaries when it encounters the same graph with the same Tensor input shapes (and the same configuration). The default implementation stores compiled artifacts in the system temp directory. An optional feature also supports sharing those artifacts within a cluster by storing them in a Redis database. + +There are a few settings relevant to caching and to FX graph caching in particular. +The settings are accessible via environment variables listed below or can be hard-coded in Inductor’s config file. + +TORCHINDUCTOR_FX_GRAPH_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting enables the local FX graph cache feature, i.e., by storing artifacts in the host’s temp directory. ``1`` enables, and any other value disables it. By default, the disk location is per username, but users can enable sharing across usernames by specifying ``TORCHINDUCTOR_CACHE_DIR`` (below). + +TORCHINDUCTOR_AUTOGRAD_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting extends FXGraphCache to store cached results at the AOTAutograd level, instead of at the Inductor level. ``1`` enables, and any other value disables it. +By default, the disk location is per username, but users can enable sharing across usernames by specifying ``TORCHINDUCTOR_CACHE_DIR`` (below). +``TORCHINDUCTOR_AUTOGRAD_CACHE`` requires ``TORCHINDUCTOR_FX_GRAPH_CACHE`` to work. The same cache dir stores cache entries for AOTAutogradCache (under ``{TORCHINDUCTOR_CACHE_DIR}/aotautograd``) and FXGraphCache (under ``{TORCHINDUCTOR_CACHE_DIR}/fxgraph``). + +TORCHINDUCTOR_CACHE_DIR +~~~~~~~~~~~~~~~~~~~~~~~~ +This setting specifies the location of all on-disk caches. By default, the location is in the system temp directory under ``torchinductor_<username>``, for example, ``/tmp/torchinductor_myusername``.
+ +Note that if ``TRITON_CACHE_DIR`` is not set in the environment, Inductor sets the Triton cache directory to this same temp location, under the Triton subdirectory. + +TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting enables the remote FX graph cache feature. The current implementation uses Redis. ``1`` enables caching, and any other value disables it. The following environment variables configure the host and port of the Redis server: + +``TORCHINDUCTOR_REDIS_HOST`` (defaults to ``localhost``) +``TORCHINDUCTOR_REDIS_PORT`` (defaults to ``6379``) + +Note that if Inductor locates a remote cache entry, it stores the compiled artifact in the local on-disk cache; that local artifact would be served on subsequent runs on the same machine. + +TORCHINDUCTOR_AUTOGRAD_REMOTE_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Like ``TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE``, this setting enables the remote AOTAutogradCache feature. The current implementation uses Redis. ``1`` enables caching, and any other value disables it. The following environment variables configure the host and port of the Redis server: +``TORCHINDUCTOR_REDIS_HOST`` (defaults to ``localhost``) +``TORCHINDUCTOR_REDIS_PORT`` (defaults to ``6379``) + +``TORCHINDUCTOR_AUTOGRAD_REMOTE_CACHE`` requires ``TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE`` to be enabled in order to work. The same Redis server can store both AOTAutograd and FXGraph cache results. + +TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This setting enables a remote cache for Inductor’s autotuner. As with the remote FX graph cache, the current implementation uses Redis. ``1`` enables caching, and any other value disables it. The same host / port environment variables listed above apply to this cache. + +TORCHINDUCTOR_FORCE_DISABLE_CACHES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Set this value to ``1`` to disable all Inductor caching.
This setting is useful for tasks like experimenting with cold-start compile times or forcing recompilation for debugging purposes. + +Conclusion +------------- +In this recipe, we have learned that PyTorch Inductor's caching mechanisms significantly reduce compilation latency by utilizing both local and remote caches, which operate seamlessly in the background without requiring user intervention. +Additionally, we explored the various settings and environment variables that allow users to configure and optimize these caching features according to their specific needs. + diff --git a/recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py b/recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py new file mode 100644 index 00000000000..7d183af6fd1 --- /dev/null +++ b/recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +""" +Using User-Defined Triton Kernels with ``torch.compile`` +========================================================= +**Author:** `Oguz Ulgen `_ +""" + +###################################################################### +# User-defined Triton kernels can be used to optimize specific parts of your +# model's computation. These kernels are written in Triton's language, which is designed +# to make it easier to achieve peak hardware performance. By using user-defined Triton +# kernels with ``torch.compile``, you can integrate these optimized computations into +# your PyTorch model, potentially achieving significant performance improvements. +# +# This recipe demonstrates how you can use user-defined Triton kernels with ``torch.compile``. +# +# Prerequisites +# ------------------- +# +# Before starting this recipe, make sure that you have the following: +# +# * Basic understanding of ``torch.compile`` and Triton.
See: +# +# * `torch.compiler API documentation `__ +# * `Introduction to torch.compile `__ +# * `Triton language documentation `__ +# +# * PyTorch 2.3 or later +# * A GPU that supports Triton +# + +import torch +from torch.utils._triton import has_triton + +###################################################################### +# Basic Usage +# -------------------- +# +# In this example, we will use a simple vector addition kernel from the Triton documentation +# with ``torch.compile``. +# For reference, see `Triton documentation `__. +# + +if not has_triton(): + print("Skipping because triton is not supported on this device.") +else: + import triton + from triton import language as tl + + @triton.jit + def add_kernel( + in_ptr0, + in_ptr1, + out_ptr, + n_elements, + BLOCK_SIZE: "tl.constexpr", + ): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(in_ptr0 + offsets, mask=mask) + y = tl.load(in_ptr1 + offsets, mask=mask) + output = x + y + tl.store(out_ptr + offsets, output, mask=mask) + + @torch.compile(fullgraph=True) + def add_fn(x, y): + output = torch.zeros_like(x) + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) + add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=4) + return output + + x = torch.randn(4, device="cuda") + y = torch.randn(4, device="cuda") + out = add_fn(x, y) + print(f"Vector addition of\nX:\t{x}\nY:\t{y}\nis equal to\n{out}") + +###################################################################### +# Advanced Usage +# ------------------------------------------------------------------- +# +# Triton's autotune feature is a powerful tool that automatically optimizes the configuration +# parameters of your Triton kernels. It explores a range of possible configurations and +# selects the one that delivers the best performance for your specific use case. 
+# +# When used with ``torch.compile``, ``triton.autotune`` can help ensure that your PyTorch +# model is running as efficiently as possible. Here is an example of using ``torch.compile`` +# and ``triton.autotune``. +# +# .. note:: +# +# ``torch.compile`` only supports the ``configs`` and ``key`` arguments to ``triton.autotune``. + +if not has_triton(): + print("Skipping because triton is not supported on this device.") +else: + import triton + from triton import language as tl + + @triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE": 4}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_SIZE": 4}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_SIZE": 2}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_SIZE": 2}, num_stages=4, num_warps=4), + ], + key=[], + ) + @triton.jit + def add_kernel_autotuned( + in_ptr0, + in_ptr1, + out_ptr, + n_elements, + BLOCK_SIZE: "tl.constexpr", + ): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(in_ptr0 + offsets, mask=mask) + y = tl.load(in_ptr1 + offsets, mask=mask) + output = x + y + tl.store(out_ptr + offsets, output, mask=mask) + + @torch.compile(fullgraph=True) + def add_fn(x, y): + output = torch.zeros_like(x) + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) + add_kernel_autotuned[grid](x, y, output, n_elements) + return output + + x = torch.randn(4, device="cuda") + y = torch.randn(4, device="cuda") + out = add_fn(x, y) + print(f"Vector addition of\nX:\t{x}\nY:\t{y}\nis equal to\n{out}") + +###################################################################### +# Composability and Limitations +# -------------------------------------------------------------------- +# +# As of PyTorch 2.3, the support for user-defined Triton kernels in ``torch.compile`` +# includes dynamic shapes, ``torch.autograd.Function``, JIT inductor, and AOT inductor. 
+# You can use these features together to build complex, high-performance models. +# +# However, there are certain limitations to be aware of: +# +# * **Tensor Subclasses:** Currently, there is no support for +# tensor subclasses and other advanced features. +# * **Triton Features:** While ``triton.heuristics`` can be used either standalone or +# before ``triton.autotune``, it cannot be used after ``triton.autotune``. This +# implies that if ``triton.heuristics`` and ``triton.autotune`` are to be used +# together, ``triton.heuristics`` must be used first. +# +# Conclusion +# ----------- +# In this recipe, we explored how to utilize user-defined Triton kernels +# with ``torch.compile``. We delved into the basic usage of a simple +# vector addition kernel and advanced usage involving Triton's autotune +# feature. We also discussed the composability of user-defined Triton +# kernels with other PyTorch features and highlighted some current limitations. +# +# See Also +# --------- +# +# * `Compiling the Optimizers `__ +# * `Implementing High-Performance Transformers with Scaled Dot Product Attention `__ diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py new file mode 100644 index 00000000000..312491b660f --- /dev/null +++ b/recipes_source/torch_export_aoti_python.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- + +""" +.. meta:: + :description: An end-to-end example of how to use AOTInductor for Python runtime. + :keywords: torch.export, AOTInductor, torch._inductor.aot_compile, torch._export.aot_load + +``torch.export`` AOTInductor Tutorial for Python runtime (Beta) +=============================================================== +**Author:** Ankith Gunapal, Bin Bao, Angela Yi +""" + +###################################################################### +# +# .. warning:: +# +# ``torch._inductor.aot_compile`` and ``torch._export.aot_load`` are in Beta status and are subject to backwards compatibility +# breaking changes. 
This tutorial provides an example of how to use these APIs for model deployment using Python runtime. +# +# It has been shown `previously `__ how AOTInductor can be used +# to do Ahead-of-Time compilation of PyTorch exported models by creating +# a shared library that can be run in a non-Python environment. +# +# +# In this tutorial, you will learn an end-to-end example of how to use AOTInductor for Python runtime. +# We will look at how to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a +# shared library. Additionally, we will examine how to execute the shared library in Python runtime using :func:`torch._export.aot_load`. +# You will learn about the speed up seen in the first inference time using AOTInductor, especially when using +# ``max-autotune`` mode which can take some time to execute. +# +# **Contents** +# +# .. contents:: +# :local: + +###################################################################### +# Prerequisites +# ------------- +# * PyTorch 2.4 or later +# * Basic understanding of ``torch.export`` and AOTInductor +# * Complete the `AOTInductor: Ahead-Of-Time Compilation for Torch.Export-ed Models `_ tutorial + +###################################################################### +# What you will learn +# ---------------------- +# * How to use AOTInductor for python runtime. +# * How to use :func:`torch._inductor.aot_compile` along with :func:`torch.export.export` to generate a shared library +# * How to run a shared library in Python runtime using :func:`torch._export.aot_load`. +# * When do you use AOTInductor for python runtime + +###################################################################### +# Model Compilation +# ----------------- +# +# We will use the TorchVision pretrained `ResNet18` model and TorchInductor on the +# exported PyTorch program using :func:`torch._inductor.aot_compile`. +# +# .. 
note:: +# +# This API also supports :func:`torch.compile` options like ``mode`` +# This means that if used on a CUDA enabled device, you can, for example, set ``"max_autotune": True`` +# which leverages Triton based matrix multiplications & convolutions, and enables CUDA graphs by default. +# +# We also specify ``dynamic_shapes`` for the batch dimension. In this example, ``min=2`` is not a bug and is +# explained in `The 0/1 Specialization Problem `__ + + +import os +import torch +from torchvision.models import ResNet18_Weights, resnet18 + +model = resnet18(weights=ResNet18_Weights.DEFAULT) +model.eval() + +with torch.inference_mode(): + + # Specify the generated shared library path + aot_compile_options = { + "aot_inductor.output_path": os.path.join(os.getcwd(), "resnet18_pt2.so"), + } + if torch.cuda.is_available(): + device = "cuda" + aot_compile_options.update({"max_autotune": True}) + else: + device = "cpu" + + model = model.to(device=device) + example_inputs = (torch.randn(2, 3, 224, 224, device=device),) + + # min=2 is not a bug and is explained in the 0/1 Specialization Problem + batch_dim = torch.export.Dim("batch", min=2, max=32) + exported_program = torch.export.export( + model, + example_inputs, + # Specify the first dimension of the input x as dynamic + dynamic_shapes={"x": {0: batch_dim}}, + ) + so_path = torch._inductor.aot_compile( + exported_program.module(), + example_inputs, + # Specify the generated shared library path + options=aot_compile_options + ) + + +###################################################################### +# Model Inference in Python +# ------------------------- +# +# Typically, the shared object generated above is used in a non-Python environment. In PyTorch 2.3, +# we added a new API called :func:`torch._export.aot_load` to load the shared library in the Python runtime. +# The API follows a structure similar to the :func:`torch.jit.load` API . 
You need to specify the path +# of the shared library and the device where it should be loaded. +# +# .. note:: +# In the example below, we specify ``batch_size=1`` for inference and it still functions correctly even though we specified ``min=2`` in +# :func:`torch.export.export`. + + +import os +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +model_so_path = os.path.join(os.getcwd(), "resnet18_pt2.so") + +model = torch._export.aot_load(model_so_path, device) +example_inputs = (torch.randn(1, 3, 224, 224, device=device),) + +with torch.inference_mode(): + output = model(example_inputs) + +###################################################################### +# When to use AOTInductor for Python Runtime +# ------------------------------------------ +# +# One of the requirements for using AOTInductor is that the model shouldn't have any graph breaks. +# Once this requirement is met, the primary use case for using AOTInductor Python Runtime is for +# model deployment using Python. +# There are mainly two reasons why you would use AOTInductor Python Runtime: +# +# - ``torch._inductor.aot_compile`` generates a shared library. This is useful for model +# versioning for deployments and tracking model performance over time. +# - With :func:`torch.compile` being a JIT compiler, there is a warmup +# cost associated with the first compilation. Your deployment needs to account for the +# compilation time taken for the first inference. With AOTInductor, the compilation is +# done offline using ``torch.export.export`` & ``torch._inductor.aot_compile``. The deployment +# would only load the shared library using ``torch._export.aot_load`` and run inference. 
+# +# +# The section below shows the speedup achieved with AOTInductor for first inference +# +# We define a utility function ``timed`` to measure the time taken for inference +# + +import time +def timed(fn): + # Returns the result of running `fn()` and the time it took for `fn()` to run, + # in milliseconds. We use CUDA events and synchronization for accurate + # measurement on CUDA enabled devices. + if torch.cuda.is_available(): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + else: + start = time.time() + + result = fn() + if torch.cuda.is_available(): + end.record() + torch.cuda.synchronize() + else: + end = time.time() + + # Measure time taken to execute the function in milliseconds + if torch.cuda.is_available(): + duration = start.elapsed_time(end) + else: + duration = (end - start) * 1000 + + return result, duration + + +###################################################################### +# Let's measure the time for first inference using AOTInductor + +torch._dynamo.reset() + +model = torch._export.aot_load(model_so_path, device) +example_inputs = (torch.randn(1, 3, 224, 224, device=device),) + +with torch.inference_mode(): + _, time_taken = timed(lambda: model(example_inputs)) + print(f"Time taken for first inference for AOTInductor is {time_taken:.2f} ms") + + +###################################################################### +# Let's measure the time for first inference using ``torch.compile`` + +torch._dynamo.reset() + +model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device) +model.eval() + +model = torch.compile(model) +example_inputs = torch.randn(1, 3, 224, 224, device=device) + +with torch.inference_mode(): + _, time_taken = timed(lambda: model(example_inputs)) + print(f"Time taken for first inference for torch.compile is {time_taken:.2f} ms") + +###################################################################### +# We see that there is a drastic speedup in first inference 
time using AOTInductor compared +# to ``torch.compile`` + +###################################################################### +# Conclusion +# ---------- +# +# In this recipe, we have learned how to effectively use the AOTInductor for Python runtime by +# compiling and loading a pretrained ``ResNet18`` model using the ``torch._inductor.aot_compile`` +# and ``torch._export.aot_load`` APIs. This process demonstrates the practical application of +# generating a shared library and running it within a Python environment, even with dynamic shape +# considerations and device-specific optimizations. We also looked at the advantage of using +# AOTInductor in model deployments, with regards to speed up in first inference time. diff --git a/recipes_source/torch_export_challenges_solutions.rst b/recipes_source/torch_export_challenges_solutions.rst new file mode 100644 index 00000000000..1f8b1ae45a4 --- /dev/null +++ b/recipes_source/torch_export_challenges_solutions.rst @@ -0,0 +1,331 @@ +Demonstration of torch.export flow, common challenges and the solutions to address them +======================================================================================= +**Authors:** `Ankith Gunapal `__, `Jordi Ramon `__, `Marcos Carranza `__ + +In the `Introduction to torch.export Tutorial `__ , we learned how to use `torch.export `__. +This tutorial expands on the previous one and explores the process of exporting popular models with code, as well as addresses common challenges that may arise with ``torch.export``. + +In this tutorial, you will learn how to export models for these use cases: + +* Video classifier (`MViT `__) +* Automatic Speech Recognition (`OpenAI Whisper-Tiny `__) +* Image Captioning (`BLIP `__) +* Promptable Image Segmentation (`SAM2 `__) + +Each of the four models were chosen to demonstrate unique features of `torch.export`, as well as some practical considerations +and issues faced in the implementation. 
+ +Prerequisites +------------- + +* PyTorch 2.4 or later +* Basic understanding of ``torch.export`` and PyTorch Eager inference. + + +Key requirement for ``torch.export``: No graph break +---------------------------------------------------- + +`torch.compile `__ speeds up PyTorch code by using JIT to compile PyTorch code into optimized kernels. It optimizes the given model +using ``TorchDynamo`` and creates an optimized graph , which is then lowered into the hardware using the backend specified in the API. +When TorchDynamo encounters unsupported Python features, it breaks the computation graph, lets the default Python interpreter +handle the unsupported code, and then resumes capturing the graph. This break in the computation graph is called a `graph break `__. + +One of the key differences between ``torch.export`` and ``torch.compile`` is that ``torch.export`` doesn’t support graph breaks +which means that the entire model or part of the model that you are exporting needs to be a single graph. This is because handling graph breaks +involves interpreting the unsupported operation with default Python evaluation, which is incompatible with what ``torch.export`` is +designed for. You can read details about the differences between the various PyTorch frameworks in this `link `__ + +You can identify graph breaks in your program by using the following command: + +.. code:: sh + + TORCH_LOGS="graph_breaks" python .py + +You will need to modify your program to get rid of graph breaks. Once resolved, you are ready to export the model. +PyTorch runs `nightly benchmarks `__ for `torch.compile` on popular HuggingFace and TIMM models. +Most of these models have no graph breaks. + +The models in this recipe have no graph breaks, but fail with `torch.export`. + +Video Classification +-------------------- + +MViT is a class of models based on `MultiScale Vision Transformers `__. This model has been trained for video classification using the `Kinetics-400 Dataset `__. 
+This model with a relevant dataset can be used for action recognition in the context of gaming. + + +The code below exports MViT by tracing with ``batch_size=2`` and then checks if the ExportedProgram can run with ``batch_size=4``. + +.. code:: python + + import numpy as np + import torch + from torchvision.models.video import MViT_V1_B_Weights, mvit_v1_b + import traceback as tb + + model = mvit_v1_b(weights=MViT_V1_B_Weights.DEFAULT) + + # Create a batch of 2 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(2,16, 224, 224, 3) + # Transpose to get [batch_size, 3, num_frames, height, width]. + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + + # Export the model. + exported_program = torch.export.export( + model, + (input_frames,), + ) + + # Create a batch of 4 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(4,16, 224, 224, 3) + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + try: + exported_program.module()(input_frames) + except Exception: + tb.print_exc() + + +Error: Static batch size +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: sh + + raise RuntimeError( + RuntimeError: Expected input at *args[0].shape[0] to be equal to 2, but got 4 + + +By default, the exporting flow will trace the program assuming that all input shapes are static, so if you run the program with +input shapes that are different than the ones you used while tracing, you will run into an error. + +Solution +~~~~~~~~ + +To address the error, we specify the first dimension of the input (``batch_size``) to be dynamic, specifying the expected range of ``batch_size``. +In the corrected example shown below, we specify that the expected ``batch_size`` can range from 1 to 16. +One detail to notice is that ``min=2`` is not a bug and is explained in `The 0/1 Specialization Problem `__. A detailed description of dynamic shapes +for ``torch.export`` can be found in the export tutorial. 
The code shown below demonstrates how to export MViT with dynamic batch sizes: + +.. code:: python + + import numpy as np + import torch + from torchvision.models.video import MViT_V1_B_Weights, mvit_v1_b + import traceback as tb + + + model = mvit_v1_b(weights=MViT_V1_B_Weights.DEFAULT) + + # Create a batch of 2 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(2,16, 224, 224, 3) + + # Transpose to get [batch_size, 3, num_frames, height, width]. + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + + # Export the model. + batch_dim = torch.export.Dim("batch", min=2, max=16) + exported_program = torch.export.export( + model, + (input_frames,), + # Specify the first dimension of the input x as dynamic + dynamic_shapes={"x": {0: batch_dim}}, + ) + + # Create a batch of 4 videos, each with 16 frames of shape 224x224x3. + input_frames = torch.randn(4,16, 224, 224, 3) + input_frames = np.transpose(input_frames, (0, 4, 1, 2, 3)) + try: + exported_program.module()(input_frames) + except Exception: + tb.print_exc() + + +Automatic Speech Recognition +---------------------------- + +**Automatic Speech Recognition** (ASR) is the use of machine learning to transcribe spoken language into text. +`Whisper `__ is a Transformer based encoder-decoder model from OpenAI, which was trained on 680k hours of labelled data for ASR and speech translation. +The code below tries to export the ``whisper-tiny`` model for ASR. + + +.. 
code:: python + + import torch + from transformers import WhisperProcessor, WhisperForConditionalGeneration + from datasets import load_dataset + + # load model + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + + # dummy inputs for exporting the model + input_features = torch.randn(1,80, 3000) + attention_mask = torch.ones(1, 3000) + decoder_input_ids = torch.tensor([[1, 1, 1 , 1]]) * model.config.decoder_start_token_id + + model.eval() + + exported_program: torch.export.ExportedProgram= torch.export.export(model, args=(input_features, attention_mask, decoder_input_ids,)) + + + +Error: strict tracing with TorchDynamo +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: console + + torch._dynamo.exc.InternalTorchDynamoError: AttributeError: 'DynamicCache' object has no attribute 'key_cache' + + +By default ``torch.export`` traces your code using `TorchDynamo `__, a byte-code analysis engine, which symbolically analyzes your code and builds a graph. +This analysis provides a stronger guarantee about safety but not all Python code is supported. When we export the ``whisper-tiny`` model using the +default strict mode, it typically returns an error in Dynamo due to an unsupported feature. To understand why this errors in Dynamo, you can refer to this `GitHub issue `__. + +Solution +~~~~~~~~ + +To address the above error , ``torch.export`` supports the ``non_strict`` mode where the program is traced using the Python interpreter, which works similar to +PyTorch eager execution. The only difference is that all ``Tensor`` objects will be replaced by ``ProxyTensors``, which will record all their operations into +a graph. By using ``strict=False``, we are able to export the program. + +.. 
code:: python + + import torch + from transformers import WhisperProcessor, WhisperForConditionalGeneration + from datasets import load_dataset + + # load model + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + + # dummy inputs for exporting the model + input_features = torch.randn(1,80, 3000) + attention_mask = torch.ones(1, 3000) + decoder_input_ids = torch.tensor([[1, 1, 1 , 1]]) * model.config.decoder_start_token_id + + model.eval() + + exported_program: torch.export.ExportedProgram= torch.export.export(model, args=(input_features, attention_mask, decoder_input_ids,), strict=False) + +Image Captioning +---------------- + +**Image Captioning** is the task of defining the contents of an image in words. In the context of gaming, Image Captioning can be used to enhance the +gameplay experience by dynamically generating text descriptions of the various game objects in the scene, thereby providing the gamer with additional +details. `BLIP `__ is a popular model for Image Captioning `released by SalesForce Research `__. The code below tries to export BLIP with ``batch_size=1``. + + +.. code:: python + + import torch + from models.blip import blip_decoder + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + image_size = 384 + image = torch.randn(1, 3,384,384).to(device) + caption_input = "" + + model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth' + model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base') + model.eval() + model = model.to(device) + + exported_program: torch.export.ExportedProgram= torch.export.export(model, args=(image,caption_input,), strict=False) + + + +Error: Cannot mutate tensors with frozen storage +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While exporting a model, it might fail because the model implementation might contain certain Python operations which are not yet supported by ``torch.export``. 
+Some of these failures may have a workaround. BLIP is an example where the original model errors, which can be resolved by making a small change in the code. +``torch.export`` lists the common cases of supported and unsupported operations in `ExportDB `__ and shows how you can modify your code to make it export compatible. + +.. code:: console + + File "/BLIP/models/blip.py", line 112, in forward + text.input_ids[:,0] = self.tokenizer.bos_token_id + File "/anaconda3/envs/export/lib/python3.10/site-packages/torch/_subclasses/functional_tensor.py", line 545, in __torch_dispatch__ + outs_unwrapped = func._op_dk( + RuntimeError: cannot mutate tensors with frozen storage + + + +Solution +~~~~~~~~ + +Clone the `tensor `__ where export fails. + +.. code:: python + + text.input_ids = text.input_ids.clone() # clone the tensor + text.input_ids[:,0] = self.tokenizer.bos_token_id + +.. note:: + This constraint has been relaxed in PyTorch 2.7 nightlies. This should work out-of-the-box in PyTorch 2.7. + +Promptable Image Segmentation +----------------------------- + +**Image segmentation** is a computer vision technique that divides a digital image into distinct groups of pixels, or segments, based on their characteristics. +`Segment Anything Model (SAM) `__ introduced promptable image segmentation, which predicts object masks given prompts that indicate the desired object. `SAM 2 `__ is +the first unified model for segmenting objects across images and videos. The `SAM2ImagePredictor `__ class provides an easy interface to the model for prompting +the model. The model can take as input both point and box prompts, as well as masks from the previous iteration of prediction. Since SAM2 provides strong +zero-shot performance for object tracking, it can be used for tracking game objects in a scene. + + +The tensor operations in the predict method of `SAM2ImagePredictor `__ are happening in the `_predict `__ method. So, we try to export like this. + +.. 
code:: python + + ep = torch.export.export( + self._predict, + args=(unnorm_coords, labels, unnorm_box, mask_input, multimask_output), + kwargs={"return_logits": return_logits}, + strict=False, + ) + + +Error: Model is not of type ``torch.nn.Module`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``torch.export`` expects the module to be of type ``torch.nn.Module``. However, the object we are trying to export is a method of a class. Hence it errors. + +.. code:: console + + Traceback (most recent call last): + File "/sam2/image_predict.py", line 20, in <module> + masks, scores, _ = predictor.predict( + File "/sam2/sam2/sam2_image_predictor.py", line 312, in predict + ep = torch.export.export( + File "python3.10/site-packages/torch/export/__init__.py", line 359, in export + raise ValueError( + ValueError: Expected `mod` to be an instance of `torch.nn.Module`, got <class 'method'>. + + +Solution +~~~~~~~~ + +We write a helper class, which inherits from ``torch.nn.Module`` and calls the ``_predict`` method in the ``forward`` method of the class. The complete code can be found `here `__. + +.. code:: python + + class ExportHelper(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(_, *args, **kwargs): + return self._predict(*args, **kwargs) + + model_to_export = ExportHelper() + ep = torch.export.export( + model_to_export, + args=(unnorm_coords, labels, unnorm_box, mask_input, multimask_output), + kwargs={"return_logits": return_logits}, + strict=False, + ) + +Conclusion +---------- + +In this tutorial, we have learned how to use ``torch.export`` to export models for popular use cases by addressing challenges through correct configuration and simple code modifications. +Once you are able to export a model, you can lower the ``ExportedProgram`` into your hardware using `AOTInductor `__ in case of servers and `ExecuTorch `__ in case of edge devices. +To learn more about ``AOTInductor`` (AOTI), please refer to the `AOTI tutorial `__. 
+To learn more about ``ExecuTorch`` , please refer to the `ExecuTorch tutorial `__. diff --git a/recipes_source/torch_logs.py b/recipes_source/torch_logs.py new file mode 100644 index 00000000000..b5c3f0bd8ac --- /dev/null +++ b/recipes_source/torch_logs.py @@ -0,0 +1,96 @@ +""" +(beta) Using TORCH_LOGS python API with torch.compile +========================================================================================== +**Author:** `Michael Lazos `_ +""" + +import logging + +###################################################################### +# +# This tutorial introduces the ``TORCH_LOGS`` environment variable, as well as the Python API, and +# demonstrates how to apply it to observe the phases of ``torch.compile``. +# +# .. note:: +# +# This tutorial requires PyTorch 2.2.0 or later. +# +# + + +###################################################################### +# Setup +# ~~~~~~~~~~~~~~~~~~~~~ +# In this example, we'll set up a simple Python function which performs an elementwise +# add and observe the compilation process with ``TORCH_LOGS`` Python API. +# +# .. note:: +# +# There is also an environment variable ``TORCH_LOGS``, which can be used to +# change logging settings at the command line. The equivalent environment +# variable setting is shown for each example. 
+ +import torch + +# exit cleanly if we are on a device that doesn't support torch.compile +if torch.cuda.get_device_capability() < (7, 0): + print("Skipping because torch.compile is not supported on this device.") +else: + @torch.compile() + def fn(x, y): + z = x + y + return z + 2 + + + inputs = (torch.ones(2, 2, device="cuda"), torch.zeros(2, 2, device="cuda")) + + +# print separator and reset dynamo +# between each example + def separator(name): + print(f"==================={name}=========================") + torch._dynamo.reset() + + + separator("Dynamo Tracing") +# View dynamo tracing +# TORCH_LOGS="+dynamo" + torch._logging.set_logs(dynamo=logging.DEBUG) + fn(*inputs) + + separator("Traced Graph") +# View traced graph +# TORCH_LOGS="graph" + torch._logging.set_logs(graph=True) + fn(*inputs) + + separator("Fusion Decisions") +# View fusion decisions +# TORCH_LOGS="fusion" + torch._logging.set_logs(fusion=True) + fn(*inputs) + + separator("Output Code") +# View output code generated by inductor +# TORCH_LOGS="output_code" + torch._logging.set_logs(output_code=True) + fn(*inputs) + + separator("") + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# +# In this tutorial we introduced the TORCH_LOGS environment variable and python API +# by experimenting with a small number of the available logging options. +# To view descriptions of all available options, run any python script +# which imports torch and set TORCH_LOGS to "help". +# +# Alternatively, you can view the `torch._logging documentation`_ to see +# descriptions of all available logging options. +# +# For more information on torch.compile, see the `torch.compile tutorial`_. +# +# .. _torch._logging documentation: https://pytorch.org/docs/main/logging.html +# .. 
_torch.compile tutorial: https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html diff --git a/recipes_source/torchscript_inference.rst b/recipes_source/torchscript_inference.rst index 54068e70723..8c78413edd1 100644 --- a/recipes_source/torchscript_inference.rst +++ b/recipes_source/torchscript_inference.rst @@ -1,6 +1,8 @@ TorchScript for Deployment ========================== +.. warning:: TorchScript is no longer in active development. + In this recipe, you will learn: - What TorchScript is diff --git a/recipes_source/torchserve_vertexai_tutorial.rst b/recipes_source/torchserve_vertexai_tutorial.rst new file mode 100644 index 00000000000..9c748e7b8c1 --- /dev/null +++ b/recipes_source/torchserve_vertexai_tutorial.rst @@ -0,0 +1,144 @@ +Deploying a PyTorch Stable Diffusion model as a Vertex AI Endpoint +================================================================== + +Deploying large models, like Stable Diffusion, can be challenging and time-consuming. + +In this recipe, we will show how you can streamline the deployment of a PyTorch Stable Diffusion +model by leveraging Vertex AI. + +PyTorch is the framework used by Stability AI on Stable +Diffusion v1.5. Vertex AI is a fully-managed machine learning platform with tools and +infrastructure designed to help ML practitioners accelerate and scale ML in production with +the benefit of open-source frameworks like PyTorch. + +In four steps you can deploy a PyTorch Stable Diffusion model (v1.5). + +Deploying your Stable Diffusion model on a Vertex AI Endpoint can be done in four steps: + +* Create a custom TorchServe handler. + +* Upload model artifacts to Google Cloud Storage (GCS). + +* Create a Vertex AI model with the model artifacts and a prebuilt PyTorch container image. + +* Deploy the Vertex AI model onto an endpoint. + +Let’s have a look at each step in more detail. You can follow and implement the steps using the +`Notebook example `__. 
+ +NOTE: Please keep in mind that this recipe requires a billable Vertex AI project, as explained in more detail in the notebook example. + +Create a custom TorchServe handler +---------------------------------- + +TorchServe is an easy and flexible tool for serving PyTorch models. The model deployed to Vertex AI +uses TorchServe to handle requests and return responses from the model. +You must create a custom TorchServe handler to include in the model artifacts uploaded to Vertex AI. Include the handler file in the +directory with the other model artifacts, like this: `model_artifacts/handler.py`. + +After creating the handler file, you must package the handler as a model archive (MAR) file. +The output file must be named `model.mar`. + + +.. code:: shell + + !torch-model-archiver \ + -f \ + --model-name \ + --version 1.0 \ + --handler model_artifacts/handler.py \ + --export-path model_artifacts + +Upload model artifacts to Google Cloud Storage (GCS) +---------------------------------------------------- + +In this step we are uploading +`model artifacts `__ +to GCS, like the model file or handler. The advantage of storing your artifacts on GCS is that you can +track the artifacts in a central bucket. + + +.. code:: shell + + BUCKET_NAME = "your-bucket-name-unique" # @param {type:"string"} + BUCKET_URI = f"gs://{BUCKET_NAME}/" + + # Will copy the artifacts into the bucket + !gsutil cp -r model_artifacts $BUCKET_URI + +Create a Vertex AI model with the model artifacts and a prebuilt PyTorch container image +---------------------------------------------------------------------------------------- + +Once you've uploaded the model artifacts into a GCS bucket, you can upload your PyTorch model to +`Vertex AI Model Registry `__. +From the Vertex AI Model Registry, you have an overview of your models +so you can better organize, track, and train new versions. For this you can use the +`Vertex AI SDK `__ +and this +`pre-built PyTorch container `__. + + +.. 
code:: shell + + from google.cloud import aiplatform as vertexai + PYTORCH_PREDICTION_IMAGE_URI = ( + "us-docker.pkg.dev/vertex-ai/prediction/pytorch-gpu.1-12:latest" + ) + MODEL_DISPLAY_NAME = "stable_diffusion_1_5-unique" + MODEL_DESCRIPTION = "stable_diffusion_1_5 container" + + vertexai.init(project='your_project', location='us-central1', staging_bucket=BUCKET_NAME) + + model = aiplatform.Model.upload( + display_name=MODEL_DISPLAY_NAME, + description=MODEL_DESCRIPTION, + serving_container_image_uri=PYTORCH_PREDICTION_IMAGE_URI, + artifact_uri=BUCKET_URI, + ) + +Deploy the Vertex AI model onto an endpoint +------------------------------------------- + +Once the model has been uploaded to Vertex AI Model Registry you can then take it and deploy +it to a Vertex AI Endpoint. For this you can use the Console or the Vertex AI SDK. In this +example you will deploy the model on an NVIDIA Tesla P100 GPU and an n1-standard-8 machine. You can +specify your machine type. + + +.. code:: shell + + endpoint = aiplatform.Endpoint.create(display_name=ENDPOINT_DISPLAY_NAME) + + model.deploy( + endpoint=endpoint, + deployed_model_display_name=MODEL_DISPLAY_NAME, + machine_type="n1-standard-8", + accelerator_type="NVIDIA_TESLA_P100", + accelerator_count=1, + traffic_percentage=100, + deploy_request_timeout=1200, + sync=True, + ) + +If you follow this +`notebook `__ +you can also get online predictions using the Vertex AI SDK as shown in the following snippet. + + +.. code:: shell + + instances = [{"prompt": "An examplePup dog with a baseball jersey."}] + response = endpoint.predict(instances=instances) + + with open("img.jpg", "wb") as g: + g.write(base64.b64decode(response.predictions[0])) + + display.Image("img.jpg") + +Create a Vertex AI model with the model artifacts and a prebuilt PyTorch container image + +More resources +-------------- + +This tutorial was created using the vendor documentation. 
To refer to the original documentation on the vendor site, please see +`torchserve example `__. diff --git a/recipes_source/xeon_run_cpu.rst b/recipes_source/xeon_run_cpu.rst new file mode 100644 index 00000000000..6426bc57819 --- /dev/null +++ b/recipes_source/xeon_run_cpu.rst @@ -0,0 +1,364 @@ +Optimizing CPU Performance on Intel® Xeon® with run_cpu Script +====================================================================== + +There are several configuration options that can impact the performance of PyTorch inference when executed on Intel® Xeon® Scalable Processors. +To get peak performance, the ``torch.backends.xeon.run_cpu`` script is provided that optimizes the configuration of thread and memory management. +For thread management, the script configures thread affinity and the preload of Intel® OMP library. +For memory management, it configures NUMA binding and preloads optimized memory allocation libraries, such as TCMalloc and JeMalloc. +In addition, the script provides tunable parameters for compute resource allocation in both single instance and multiple instance scenarios, +helping the users try out an optimal coordination of resource utilization for the specific workloads. + +What You Will Learn +------------------- + +* How to utilize tools like ``numactl``, ``taskset``, Intel® OpenMP Runtime Library and optimized memory + allocators such as ``TCMalloc`` and ``JeMalloc`` for enhanced performance. +* How to configure CPU resources and memory management to maximize PyTorch inference performance on Intel® Xeon® processors. + +Introduction of the Optimizations +--------------------------------- + +Applying NUMA Access Control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is beneficial that an increasing number of CPU cores are being provided to users within a single socket, as this offers greater computational resources. +However, this also leads to competition for memory access, which can cause programs to stall due to busy memory. 
+To address this problem, Non-Uniform Memory Access (NUMA) was introduced. +Unlike Uniform Memory Access (UMA), where all memories are equally accessible to all cores, +NUMA organizes memory into multiple groups. A certain amount of memory is directly attached to one socket's integrated memory controller and becomes the local memory of that socket. +Local memory access is much faster than remote memory access. + +Users can get CPU information with the ``lscpu`` command on Linux to learn how many cores and sockets there are on the machine. +Additionally, this command provides NUMA information, such as the distribution of CPU cores. +Below is an example of executing ``lscpu`` on a machine equipped with an Intel® Xeon® CPU Max 9480: + +.. code-block:: console + + $ lscpu + ... + CPU(s): 224 + On-line CPU(s) list: 0-223 + Vendor ID: GenuineIntel + Model name: Intel (R) Xeon (R) CPU Max 9480 + CPU family: 6 + Model: 143 + Thread(s) per core: 2 + Core(s) per socket: 56 + Socket(s): 2 + ... + NUMA: + NUMA node(s): 2 + NUMA node0 CPU(s): 0-55,112-167 + NUMA node1 CPU(s): 56-111,168-223 + ... + +* Two sockets were detected, each containing 56 physical cores. With Hyper-Threading enabled, each core can handle 2 threads, resulting in 56 additional logical cores per socket. Therefore, the machine has a total of 224 CPU cores in service. +* Typically, physical cores are indexed before logical cores. In this scenario, cores 0-55 are the physical cores on the first NUMA node, and cores 56-111 are the physical cores on the second NUMA node. +* Logical cores are indexed subsequently: cores 112-167 correspond to the logical cores on the first NUMA node, and cores 168-223 to those on the second NUMA node. + +Typically, PyTorch programs with compute-intensive workloads should avoid using logical cores to get good performance. + +Linux provides a tool called ``numactl`` that allows user control of NUMA policy for processes or shared memory. 
+It runs processes with a specific NUMA scheduling or memory placement policy. +As described above, cores share high-speed cache in one socket, thus it is a good idea to avoid cross-socket computations. +From a memory access perspective, binding memory access locally is much faster than accessing remote memories. +The ``numactl`` command should already be installed in recent Linux distributions. In case it is missing, you can install it manually with the installation command, like on Ubuntu: + +.. code-block:: console + + $ apt-get install numactl + +On CentOS you can run the following command: + +.. code-block:: console + + $ yum install numactl + +The ``taskset`` command in Linux is another powerful utility that allows you to set or retrieve the CPU affinity of a running process. +``taskset`` is pre-installed in most Linux distributions; in case it is not, on Ubuntu you can install it with the command: + +.. code-block:: console + + $ apt-get install util-linux + +On CentOS you can run the following command: + +.. code-block:: console + + $ yum install util-linux + +Using Intel® OpenMP Runtime Library +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +OpenMP is an implementation of multithreading, a method of parallelizing where a primary thread (a series of instructions executed consecutively) forks a specified number of sub-threads and the system divides a task among them. The threads then run concurrently, with the runtime environment allocating threads to different processors. +Users can control OpenMP behaviors with some environment variable settings to fit their workloads; the settings are read and executed by OMP libraries. By default, PyTorch uses GNU OpenMP Library (GNU libgomp) for parallel computation. On Intel® platforms, Intel® OpenMP Runtime Library (libiomp) provides OpenMP API specification support. It usually brings more performance benefits compared to libgomp. + +The Intel® OpenMP Runtime Library can be installed using one of these commands: + +.. 
code-block:: console + + $ pip install intel-openmp + +or + +.. code-block:: console + + $ conda install mkl + +Choosing an Optimized Memory Allocator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The memory allocator plays an important role from a performance perspective as well. More efficient memory usage reduces the overhead of unnecessary memory allocations or destructions, and thus results in faster execution. From practical experience, for deep learning workloads, ``TCMalloc`` or ``JeMalloc`` can achieve better performance than the default malloc operations by reusing memory as much as possible. + +You can install ``TCMalloc`` by running the following command on Ubuntu: + +.. code-block:: console + + $ apt-get install google-perftools + +On CentOS, you can install it by running: + +.. code-block:: console + + $ yum install gperftools + +In a conda environment, it can also be installed by running: + +.. code-block:: console + + $ conda install conda-forge::gperftools + +On Ubuntu ``JeMalloc`` can be installed by this command: + +.. code-block:: console + + $ apt-get install libjemalloc2 + +On CentOS it can be installed by running: + +.. code-block:: console + + $ yum install jemalloc + +In a conda environment, it can also be installed by running: + +.. code-block:: console + + $ conda install conda-forge::jemalloc + +Quick Start Example Commands +---------------------------- + +1. To run single-instance inference with 1 thread on 1 CPU core (only Core #0 would be used): + +.. code-block:: console + + $ python -m torch.backends.xeon.run_cpu --ninstances 1 --ncores-per-instance 1 [program_args] + +2. To run single-instance inference on a single CPU node (NUMA socket): + +.. code-block:: console + + $ python -m torch.backends.xeon.run_cpu --node-id 0 [program_args] + +3. To run multi-instance inference, 8 instances with 14 cores per instance on a 112-core CPU: + +.. 
code-block:: console + + $ python -m torch.backends.xeon.run_cpu --ninstances 8 --ncores-per-instance 14 [program_args] + +4. To run inference in throughput mode, in which all the cores in each CPU node set up an instance: + +.. code-block:: console + + $ python -m torch.backends.xeon.run_cpu --throughput-mode [program_args] + +.. note:: + + The term "instance" here does not refer to a cloud instance. This script is executed as a single process that invokes multiple "instances", each formed from multiple threads. An "instance" is a group of threads in this context. + +Using ``torch.backends.xeon.run_cpu`` +------------------------------------- + +The argument list and usage guidance can be shown with the following command: + +.. code-block:: console + + $ python -m torch.backends.xeon.run_cpu -h + usage: run_cpu.py [-h] [--multi-instance] [-m] [--no-python] [--enable-tcmalloc] [--enable-jemalloc] [--use-default-allocator] [--disable-iomp] [--ncores-per-instance] [--ninstances] [--skip-cross-node-cores] [--rank] [--latency-mode] [--throughput-mode] [--node-id] [--use-logical-core] [--disable-numactl] [--disable-taskset] [--core-list] [--log-path] [--log-file-prefix] [program_args] + +The command above has the following positional arguments: + +.. list-table:: + :widths: 25 50 + :header-rows: 1 + + * - knob + - help + * - ``program`` + - The full path of the program/script to be launched. + * - ``program_args`` + - The input arguments for the program/script to be launched. + +Explanation of the options +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The generic option settings (knobs) include the following: + +.. list-table:: + :widths: 25 10 15 50 + :header-rows: 1 + + * - knob + - type + - default value + - help + * - ``-h``, ``--help`` + - + - + - To show the help message and exit. + * - ``-m``, ``--module`` + - + - + - To change each process to interpret the launch script as a python module, executing with the same behavior as "python -m". 
+ * - ``--no-python`` + - bool + - False + - To avoid prepending the program with "python" - just execute it directly. Useful when the script is not a Python script. + * - ``--log-path`` + - str + - ``''`` + - To specify the log file directory. Default path is ``''``, which disables logging to files. + * - ``--log-file-prefix`` + - str + - "run" + - Prefix of the log file name. + +Knobs for applying or disabling optimizations are: + +.. list-table:: + :widths: 25 10 15 50 + :header-rows: 1 + + * - knob + - type + - default value + - help + * - ``--enable-tcmalloc`` + - bool + - False + - To enable ``TCMalloc`` memory allocator. + * - ``--enable-jemalloc`` + - bool + - False + - To enable ``JeMalloc`` memory allocator. + * - ``--use-default-allocator`` + - bool + - False + - To use default memory allocator. Neither ``TCMalloc`` nor ``JeMalloc`` would be used. + * - ``--disable-iomp`` + - bool + - False + - By default, Intel® OpenMP lib will be used if installed. Setting this flag would disable the usage of Intel® OpenMP. + +.. note:: + + Memory allocators influence performance. If the user does not specify a desired memory allocator, the ``run_cpu`` script will search if any of them is installed in the order of TCMalloc > JeMalloc > PyTorch default memory allocator, and takes the first matched one. + +Knobs for controlling instance number and compute resource allocation are: + +.. list-table:: + :widths: 25 10 15 50 + :header-rows: 1 + + * - knob + - type + - default value + - help + * - ``--ninstances`` + - int + - 0 + - Number of instances. + * - ``--ncores-per-instance`` + - int + - 0 + - Number of cores used by each instance. + * - ``--node-id`` + - int + - -1 + - The node ID to be used for multi-instance, by default all nodes will be used. + * - ``--core-list`` + - str + - ``''`` + - To specify the core list as ``'core_id, core_id, ....'`` or core range as ``'core_id-core_id'``. By default all the cores will be used. 
+ * - ``--use-logical-core`` + - bool + - False + - By default only physical cores are used. Specifying this flag enables logical cores usage. + * - ``--skip-cross-node-cores`` + - bool + - False + - To prevent the workload from being executed on cores across NUMA nodes. + * - ``--rank`` + - int + - -1 + - To specify instance index to assign ncores_per_instance for rank; otherwise ncores_per_instance will be assigned sequentially to the instances. + * - ``--multi-instance`` + - bool + - False + - A quick set to invoke multiple instances of the workload on multi-socket CPU servers. + * - ``--latency-mode`` + - bool + - False + - A quick set to invoke benchmarking with latency mode, in which all physical cores are used and 4 cores per instance. + * - ``--throughput-mode`` + - bool + - False + - A quick set to invoke benchmarking with throughput mode, in which all physical cores are used and 1 numa node per instance. + * - ``--disable-numactl`` + - bool + - False + - By default ``numactl`` command is used to control NUMA access. Setting this flag will disable it. + * - ``--disable-taskset`` + - bool + - False + - To disable the usage of ``taskset`` command. + +.. note:: + + Environment variables that will be set by this script include the following: + + .. list-table:: + :widths: 25 50 + :header-rows: 1 + + * - Environment Variable + - Value + * - LD_PRELOAD + - Depending on knobs you set, /libiomp5.so, /libjemalloc.so, /libtcmalloc.so might be appended to LD_PRELOAD. + * - KMP_AFFINITY + - If libiomp5.so is preloaded, KMP_AFFINITY could be set to ``"granularity=fine,compact,1,0"``. + * - KMP_BLOCKTIME + - If libiomp5.so is preloaded, KMP_BLOCKTIME is set to "1". + * - OMP_NUM_THREADS + - Value of ``ncores_per_instance`` + * - MALLOC_CONF + - If libjemalloc.so is preloaded, MALLOC_CONF will be set to ``"oversize_threshold:1,background_thread:true,metadata_thp:auto"``. + + Please note that the script respects environment variables set preliminarily. 
For example, if you have set the environment variables mentioned above before running the script, the values of the variables will not be overwritten by the script. + +Conclusion +---------- + +In this tutorial, we explored a variety of advanced configurations and tools designed to optimize PyTorch inference performance on Intel® Xeon® Scalable Processors. +By leveraging the ``torch.backends.xeon.run_cpu`` script, we demonstrated how to fine-tune thread and memory management to achieve peak performance. +We covered essential concepts such as NUMA access control, optimized memory allocators like ``TCMalloc`` and ``JeMalloc``, and the use of Intel® OpenMP for efficient multithreading. + +Additionally, we provided practical command-line examples to guide you through setting up single and multiple instance scenarios, ensuring optimal resource utilization tailored to specific workloads. +By understanding and applying these techniques, users can significantly enhance the efficiency and speed of their PyTorch applications on Intel® Xeon® platforms. + +See also: + +* `PyTorch Performance Tuning Guide `__ +* `PyTorch Multiprocessing Best Practices `__ +* Grokking PyTorch Intel CPU performance: `Part 1 `__ `Part 2 `__ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d323e7a1409..00000000000 --- a/requirements.txt +++ /dev/null @@ -1,61 +0,0 @@ -# --extra-index-url https://download.pytorch.org/whl/cu117/index.html # Use this to run/publish tutorials against the latest binaries during the RC stage. Comment out after the release. Each release verify the correct cuda version. 
-# Refer to ./jenkins/build.sh for tutorial build instructions - -sphinx==5.0.0 -sphinx-gallery==0.11.1 -sphinx_design -docutils==0.16 -sphinx-copybutton -tqdm -numpy -matplotlib -librosa -torch -torchvision -torchtext -torchdata -networkx -PyHamcrest -bs4 -awscliv2==2.1.1 -flask -spacy==3.4.1 -ray[tune]==2.4.0 -tensorboard -jinja2==3.0.3 -pytorch-lightning -torchx -torchrl -ax-platform -nbformat>=4.2.0 -datasets -transformers -torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable -onnx -onnxscript -onnxruntime - -importlib-metadata==6.8.0 - -# PyTorch Theme --e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme - -ipython - -sphinxcontrib.katex -# to run examples -boto3 -pandas -requests -scikit-image -scipy==1.11.1 -numba==0.57.1 -pillow==10.0.1 -wget -gym-super-mario-bros==7.4.0 -pyopengl -gymnasium[mujoco]==0.27.0 -timm -iopath -pygame==2.1.2 - diff --git a/requirements.txt b/requirements.txt new file mode 120000 index 00000000000..72b541c1ebf --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +.ci/docker/requirements.txt \ No newline at end of file diff --git a/tutorial_submission_policy.md b/tutorial_submission_policy.md new file mode 100644 index 00000000000..c5c3a800876 --- /dev/null +++ b/tutorial_submission_policy.md @@ -0,0 +1,107 @@ +# PyTorch Tutorial Submission Policy + +This policy outlines the criteria and process for submitting new +tutorials to the PyTorch community. +Our goal is to ensure that all tutorials are of high quality, +relevant, and up-to-date, supporting both the growth of the PyTorch +users and the evolution of the PyTorch framework itself. By following +these guidelines, contributors can help us maintain a robust and +informative educational environment. 
+ +## Acceptance Criteria For New Tutorials + +We accept new tutorials that adhere to one of the following use cases: + +* **Demonstrate New PyTorch Features:** Tutorials that support new features + for upcoming PyTorch releases are typically authored by the engineers who + are developing these features. These tutorials are crucial for showcasing + the latest advancements in PyTorch. We typically do not require more than + one tutorial per feature. + +* **Tutorials showcasing PyTorch usage with other tools and libraries:** We + accept community-contributed tutorials that illustrate innovative uses of + PyTorch alongside other open-source projects, models, and tools. Please + ensure that your tutorial remains neutral and does not promote or endorse + proprietary technologies over others. + +The first use case does not require going through the submission +process outlined below. If your tutorial falls under the second category, +please read and follow the instructions in the +**Submission Process For Community-Contributed Tutorials** section. + +## Submission Process For Community-Contributed Tutorials + +To maintain the quality and relevance of tutorials, we request that +community-contributed tutorials undergo a review process. If you are +interested in contributing a tutorial, please follow these steps: + +1. **Create an issue:** + * Open an issue in the pytorch/tutorials repository proposing the + new tutorial. Clearly explain the importance of the tutorial and + confirm that there is no existing tutorial covering the same or + similar topic. A tutorial should not disproportionately endorse + one technology over another. Please consult with Core Maintainers + to ensure your content adheres to these guidelines. + Use the provided [ISSUE_TEMPLATE](https://github.com/pytorch/tutorials/blob/main/.github/ISSUE_TEMPLATE/feature-request.yml) for the new tutorial request - select **Feature request** when submitting an issue. 
+ + * If there is an existing tutorial on the topic that you would + like to significantly refactor, you can submit a PR. In the + description of the PR, explain why the changes are needed and + how they improve the tutorial. + + * These issues will be triaged by PyTorch maintainers on a case-by-case basis. + * Link any supporting materials including discussions in other repositories. + +1. **Await Approval:** + * Wait for a response from the PyTorch Tutorials maintainers. A PyTorch + tutorial maintainer will review your proposal and + determine whether a tutorial on the proposed topic is desirable. + A comment and an **approved** label will be added to your issue + by a maintainer. The review process for new tutorial PRs submitted + without the corresponding issue may take longer. + +1. **Adhere to writing and styling guidelines:** + * Once approved, follow the guidelines outlined in [CONTRIBUTING.md](https://github.com/pytorch/tutorials/blob/main/CONTRIBUTING.md) + and use the provided [template](https://github.com/pytorch/tutorials/blob/main/beginner_source/template_tutorial.py) for creating your tutorial. + * Link the issue in which you received approval for your tutorial + in the PR. + * We accept tutorials in both ``.rst`` (ReStructuredText) and ``.py`` + (Python) formats. However, unless your tutorial involves using + multiple GPUs, parallel/distributed training, or requires extended + execution time (25 minutes or more), we prefer submissions + in Python file format. + +## Maintaining Tutorials + +When you submit a new tutorial, we encourage you to keep it in sync +with the latest PyTorch updates and features. Additionally, we may +contact you to review any PRs, issues, and other related matters to +ensure the tutorial remains a valuable resource. + +Please note the following: + +* If a tutorial breaks against the main branch, it will + be excluded from the build and an issue will be filed against it, + with the author/maintainer notified. 
If the issue is not resolved + within 90 days, the tutorial might be deleted from the repository. + +* We recommend that each tutorial is reviewed at least once a year to + ensure its relevance. + +## Deleting Stale Tutorials + +A tutorial might be considered stale when it no longer aligns with +the latest PyTorch updates, features, or best practices: + +* The tutorial is no longer functional due to changes in PyTorch or + its dependencies +* The tutorial has been superseded by a newer, more comprehensive, or + more accurate tutorial +* The tutorial does not run successfully in continuous integration (CI), indicating + potential compatibility or dependency issues. + +If a tutorial is deemed stale, we will attempt to contact the code owner, +or someone from the tutorial maintainers might attempt to update it. +However, if despite those attempts we fail to fix it, the tutorial +might be removed from the repository.