pytorch · Jack-Khuu · Feb 18, 2025 · Jan 31, 2025 · Jan 31, 2025 · Feb 6, 2025
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -291,6 +291,16 @@ jobs:
         bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
         echo "::endgroup::"
 
+        echo "::group::Run inference with quantize file"
+        for DEVICE in cpu; do # cuda 
+          # cuda is skipped for now: it fails with `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'`
+          # TODO: re-enable cuda after following up with torchao in a separate PR
+          echo "saving snapshot for device ${DEVICE} and dtype bfloat16, and reloading as snapshot"
+          python3 torchchat.py export --device ${DEVICE} --output-snap model.tc --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          python3 torchchat.py generate --device ${DEVICE} --snap model.tc --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+        done
+        echo "::endgroup::"
+
   test-gpu-aoti-float32:
     permissions:
       id-token: write
@@ -335,6 +345,11 @@ jobs:
         fi
         echo "::endgroup::"
 
+        # echo "::group::Run inference with quantize file"
+        # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+        # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+        # echo "::endgroup::"
+
   test-gpu-aoti-float16:
     permissions:
       id-token: write
@@ -376,10 +391,15 @@ jobs:
         echo "::group::Run inference with quantize file"
         if [ $(uname -s) == Darwin ]; then
           python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
-             python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
+          python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
         fi
         echo "::endgroup::"
 
+        # echo "::group::Run inference with quantize file"
+        # python3 torchchat.py export --output-snap model.tc --dtype float16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+        # python3 torchchat.py generate --snap model.tc --dtype float16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+        # echo "::endgroup::"
+
   test-gpu-eval-sanity-check:
     permissions:
       id-token: write
@@ -495,10 +515,11 @@ jobs:
           python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
 
           echo "******************************************"
-          echo "*** --quantize torchchat/quant_config/mobile.json ***"
+          echo "*** can't test --quantize torchchat/quant_config/mobile.json ***"
+          echo "*** testing --quantize torchchat/quant_config/mobile-32.json ***"
           echo "******************************************"
-          # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
-          # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          python3 torchchat.py export --quantize torchchat/quant_config/mobile-32.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
 
 
           echo "******************************************"