         echo "tests complete"
         echo "******************************************"
         echo "::endgroup::"
+
+
+  test-sdpa-backends-export:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install requirements"
+        ./install/install_requirements.sh cuda
+        pip3 list
+        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
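+        # stories15M is a ~15M-parameter tinyllamas checkpoint: small enough
+        # to keep this job fast while still exercising the full export path.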
+        mkdir -p checkpoints/stories15M
+        pushd checkpoints/stories15M
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
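+        # Exercise every device x dtype x SDPA-backend combination below,
+        # for both the Python runner and the aoti_run binary.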
+        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_NAME=stories15M
+        # tokenizer.model was downloaded into checkpoints/stories15M above
+        export MODEL_DIR=checkpoints/stories15M
+
+        for DEVICE in cpu cuda; do
+          # Depending on how parameter passing works, aoti_run may only support bfloat16,
+          # similar to runner-cuda-dtype.yml (although the runner environment should not
+          # have an opinion about what we use in the artifact, and we might suitably abstract that).
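+          # Note: cudnn_attention is CUDA-only; if the cpu pass fails on it,
+          # the backend list may need to be filtered per device.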
+          for DTYPE in bfloat16 float16 float32; do
+            for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do
+              ###################################################################
+              # Export DSO and run with Python
+              python torchchat.py export --output-dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              python torchchat.py generate --dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} --temperature 0 --prompt "Once upon a time"
+              ###################################################################
+              # Export AOTI package and run with aoti_run
+              python torchchat.py export --output-aoti-package-path /tmp/model.pt2 --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time"
+              ###################################################################
+            done
+          done
+        done
+
+        echo "tests complete"
+        echo "******************************************"
+        echo "::endgroup::"
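
For reference, the backend names exercised above correspond to `torch.nn.attention.SDPBackend` values. Below is a minimal sketch of pinning one backend with PyTorch's `sdpa_kernel` context manager; the `run_sdpa` helper and its mapping table are illustrative, not torchchat's actual `--attention-backend` plumbing:

```python
# Minimal sketch: forcing a specific scaled-dot-product-attention backend.
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

# Assumed mapping from the CLI strings used in the workflow to SDPBackend values.
BACKENDS = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
    "cudnn_attention": SDPBackend.CUDNN_ATTENTION,
}

def run_sdpa(q, k, v, backend: str):
    # Restrict SDPA dispatch to the requested backend; dispatch fails with a
    # RuntimeError if the backend cannot handle the inputs (e.g. cudnn_attention on CPU).
    with sdpa_kernel(BACKENDS[backend]):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)

q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
out = run_sdpa(q, k, v, "flash_attention")
```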