@@ -19,6 +19,7 @@
       gpu-arch-version: "12.4"
       timeout: 60
       script: |
+        set -xeou pipefail
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
@@ -39,9 +40,10 @@
         echo "::endgroup::"

         echo "::group::Run inference"
-        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
         export MODEL_NAME=stories15M
-        export MODEL_DIR=/tmp
+

         for DTYPE in bfloat16 float16 float32; do
           ###################################################################
@@ -83,3 +85,66 @@
         echo "tests complete"
         echo "******************************************"
         echo "::endgroup::"
+
+
+  test-sdpa-backends-export:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        set -xeou pipefail
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install requirements"
+        # Install torchchat requirements for CUDA
+        ./install/install_requirements.sh cuda
+        pip3 list
+        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
+        mkdir -p checkpoints/stories15M
+        pushd checkpoints/stories15M
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
+        export MODEL_NAME=stories15M
+
+        ./torchchat/utils/scripts/build_native.sh aoti
+
+        for DEVICE in cpu cuda; do
+          # Depending on how parameter passing works, aoti_run may only support bfloat16, similar to runner-cuda-dtype.yml
+          # (although the runner environment should not have an opinion about which dtype we use in the artifact, and we might suitably abstract that away).
+          for DTYPE in bfloat16 float16 float32; do
+            for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do
+              echo "***************************************************************"
+              echo "*** $DEVICE $DTYPE $SDPA"
+              ###################################################################
+              # Export DSO and run with Python
+              python torchchat.py export --output-dso dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              python torchchat.py generate --dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} --temperature 0 --prompt "Once upon a time"
+              ###################################################################
+              # Export AOTI and run with aoti_run
+              python torchchat.py export --output-aoti /tmp/model.pt2 --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time"
+              ###################################################################
+            done
+          done
+        done
+
+        echo "tests complete"
+        echo "******************************************"
+        echo "::endgroup::"
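For reference, the four backend names in the new `SDPA` loop line up one-to-one with PyTorch's `torch.nn.attention.SDPBackend` enum. Below is a minimal sketch of what pinning SDPA to a single backend looks like in plain PyTorch, assuming torchchat's `--attention-backend` flag resolves to the same mechanism; the mapping dict, tensor shapes, and dtype here are illustrative, not taken from torchchat:

```python
# Minimal sketch: force scaled_dot_product_attention onto one backend at a
# time, mirroring the workflow's math/flash/efficient/cudnn matrix.
# Assumes a CUDA device and a recent PyTorch that exposes CUDNN_ATTENTION.
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Illustrative mapping from the workflow's backend strings to the enum.
BACKENDS = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
    "cudnn_attention": SDPBackend.CUDNN_ATTENTION,
}

q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
for name, backend in BACKENDS.items():
    # sdpa_kernel restricts dispatch to the listed backend(s); the op raises
    # a RuntimeError if that backend cannot handle this dtype/shape/device.
    with sdpa_kernel(backend):
        out = F.scaled_dot_product_attention(q, k, v)
    print(f"{name}: ok, output shape {tuple(out.shape)}")
```

The workflow then exercises each backend choice through both consumption paths: the Python-loaded DSO (`generate --dso-path`) and the standalone `aoti_run` binary.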