
Commit 28a36e2

Merge branch 'main' of https://github.com/pytorch/torchchat into xpu_device

2 parents: ce5774d + 2766a95

File tree: 16 files changed, +231 -91 lines changed


.ci/scripts/run-docs

Lines changed: 7 additions & 3 deletions
@@ -8,13 +8,16 @@ fi

 # Pre-initialize variables
 filepath=""
-parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN"
+# cuda supports padding, so no need to replace quantization for now.
+# otherwise add: 'cuda.json:cuda-32.json' to replace rules
+parameters="--replace llama3:stories15M,-l3:-l2,mobile.json:mobile-32.json --suppress huggingface-cli,HF_TOKEN"
 script_name="./run-${1}.sh" # Dynamically initialize script name

 # Use a case statement to handle the $1 argument
 case "$1" in
   "readme")
     filepath="README.md"
+    parameters="--replace llama3.1:stories15M,-l3:-l2,mobile.json:mobile-32.json --suppress huggingface-cli,HF_TOKEN"
     ;;
   "quantization")
     filepath="docs/quantization.md"
@@ -38,7 +41,7 @@ case "$1" in
     ;;
   "distributed")
     filepath="docs/distributed.md"
-    parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication
+    parameters="--replace llama3.1:stories110M,-l3:-l2 --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication
     ;;
   "local")
     filepath="docs/local-model.md"
@@ -63,5 +66,6 @@ echo "::group::Run $1"
 echo "*******************************************"
 cat "$script_name"
 echo "*******************************************"
-bash -x "$script_name"
+set -x
+. "$script_name"
 echo "::endgroup::"

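The `--replace` parameter built above is a comma-separated list of `old:new` substitutions (swap `llama3` for the small `stories15M` model, `mobile.json` for `mobile-32.json`), applied to the commands extracted from the docs before they run. The actual parsing lives in torchchat's doc-to-script tooling; a rough, hypothetical shell equivalent, for intuition only:

    # Hypothetical sketch of applying "old:new,old:new" rules to a script.
    apply_replace_rules() {
      local rules="$1" file="$2" rule pairs
      IFS=',' read -ra pairs <<< "$rules"
      for rule in "${pairs[@]}"; do
        # split each pair on the first ':' into old and new text
        sed "s|${rule%%:*}|${rule#*:}|g" "$file" > "$file.tmp" && mv "$file.tmp" "$file"
      done
    }
    apply_replace_rules "llama3:stories15M,-l3:-l2" ./run-readme.sh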
.github/workflows/pull.yml

Lines changed: 83 additions & 20 deletions
@@ -291,6 +291,16 @@ jobs:
           bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
           echo "::endgroup::"

+          echo "::group::Run inference with quantize file"
+          for DEVICE in cpu; do # cuda
+            # cuda - fails because `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'`
+            # follow up with torchao as a separate PR
+            echo "saving snapshot for device ${DEVICE} and dtype bfloat16, and reloading as snapshot"
+            python3 torchchat.py export --device ${DEVICE} --output-snap model.tc --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+            python3 torchchat.py generate --device ${DEVICE} --snap model.tc --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          done
+          echo "::endgroup::"
+
   test-gpu-aoti-float32:
     permissions:
       id-token: write
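
The new step above exercises the export snapshot path. Reading the flags at face value (an inference from the flag names, not spelled out in this diff): `--output-snap` writes the quantized model to `model.tc` once at export time, and `--snap` reloads that artifact at generate time, so quantization is not repeated. An annotated restatement of the cpu leg:

    python3 torchchat.py export --device cpu --output-snap model.tc \
      --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json \
      --checkpoint "./checkpoints/${REPO_NAME}/model.pth"   # quantize and save once
    python3 torchchat.py generate --device cpu --snap model.tc \
      --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"   # reload snapshot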
@@ -335,6 +345,11 @@ jobs:
           fi
           echo "::endgroup::"

+          # echo "::group::Run inference with quantize file"
+          # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # echo "::endgroup::"
+
   test-gpu-aoti-float16:
     permissions:
       id-token: write
@@ -376,10 +391,15 @@ jobs:
           echo "::group::Run inference with quantize file"
           if [ $(uname -s) == Darwin ]; then
             python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
-            python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
+            python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
           fi
           echo "::endgroup::"

+          # echo "::group::Run inference with quantize file"
+          # python3 torchchat.py export --output-snap model.tc --dtype float16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # python3 torchchat.py generate --snap model.tc --dtype float16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+          # echo "::endgroup::"
+
   test-gpu-eval-sanity-check:
     permissions:
       id-token: write
@@ -495,12 +515,12 @@ jobs:
           python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

           echo "******************************************"
-          echo "*** --quantize torchchat/quant_config/mobile.json ***"
+          echo "*** [TEST DISABLED] Can't test --quantize torchchat/quant_config/mobile.json ***"
+          echo "*** Testing --quantize torchchat/quant_config/mobile-32.json instead ***"
           echo "******************************************"
-          # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          # python torchchat.py export --quantize torchchat/quant_config/mobile-32.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
           # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

-
           echo "******************************************"
           echo "******* Emb: channel-wise quantized ******"
           echo "******************************************"
@@ -514,16 +534,16 @@ jobs:
           python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

           echo "******************************************"
-          echo "**** Emb 4bit: channel-wise quantized ****"
+          echo "**** [TEST DISABLED] Emb 4bit: channel-wise quantized ****"
           echo "******************************************"
-          python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
-          python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

           echo "******************************************"
-          echo "****** Emb 4bit: group-wise quantized ****"
+          echo "****** [TEST DISABLED] Emb 4bit: group-wise quantized ****"
           echo "******************************************"
-          python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
-          python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+          # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

           echo "******************************************"
           echo "******* INT8 channel-wise quantized ******"
@@ -1055,7 +1075,59 @@ jobs:
           ./runner/build_android.sh
           echo "Tests complete."

-  test-torchao-experimental:
+  test-torchao-aoti-experimental:
+    strategy:
+      matrix:
+        runner: [macos-14-xlarge]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.10.11
+      - name: Setup Xcode
+        if: runner.os == 'macOS'
+        uses: maxim-lobanov/setup-xcode@v1
+        with:
+          xcode-version: '15.3'
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install torchchat
+        run: |
+          echo "Intalling pip3 packages"
+          ./install/install_requirements.sh
+          pip3 list
+          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+      - name: Install torchao-ops
+        id: install-torchao-ops
+        run: |
+          bash torchchat/utils/scripts/build_torchao_ops.sh
+      - name: Install runner AOTI
+        id: install-runner-aoti
+        run: |
+          bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
+      - name: Run inference
+        run: |
+          python torchchat.py download stories110M
+          wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          export PRMT="Once upon a time in a land far away"
+          echo "Export and run AOTI (C++ runner)"
+          python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
+          ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
+          echo "Generate AOTI"
+          python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
+          echo "Tests complete."
+
+  test-torchao-et-experimental:
     strategy:
       matrix:
         runner: [macos-14-xlarge]
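
The C++ runner invocation in the new job follows llama2.c-style runner conventions for its flags (inferred from the surrounding commands, not documented in this diff): `-z` names the tokenizer file, `-t` the sampling temperature (0 gives deterministic greedy decoding, which CI wants), and `-i` the input prompt.

    # -z tokenizer path, -t temperature (0 = greedy), -i input prompt
    ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"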
@@ -1100,10 +1172,6 @@ jobs:
         run: |
           echo "Installing runner"
           bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
-      - name: Install runner AOTI
-        id: install-runner-aoti
-        run: |
-          bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
       - name: Run inference
         run: |
           python torchchat.py download stories110M
@@ -1116,11 +1184,6 @@ jobs:
           echo "Export and run ET (C++ runner)"
           python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
-          echo "Export and run AOTI (C++ runner)"
-          python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
-          ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
-          echo "Generate AOTI"
-          python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
           echo "Tests complete."

   test-torchao-experimental-mps:
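
Both forms of `--quantize` appear in this workflow: a path to a checked-in config such as `torchchat/quant_config/cuda.json`, and an inline JSON string as in the torchao jobs. A sketch of the inline torchao config written out as a file instead; the file name is made up here, and the JSON body is copied verbatim from the job above:

    # Hypothetical quant-experimental.json (contents copied from the job above):
    #   {"embedding:wx": {"bitwidth": 2, "groupsize": 32},
    #    "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}
    python torchchat.py export stories110M --dtype float32 \
      --output-aoti-package-path ./model.pt2 --quantize ./quant-experimental.json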

.github/workflows/run-readme-pr-linuxaarch64.yml

Lines changed: 25 additions & 6 deletions
@@ -23,7 +23,10 @@ jobs:
           uname -a
           echo "::endgroup::"

-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme
+          which pip || true
+          which pip3 || true
+          which conda || true
+          # TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme

           echo "::group::Completion"
           echo "tests complete"
@@ -44,8 +47,12 @@ jobs:
           echo "::group::Print machine info"
           uname -a
           echo "::endgroup::"
-
-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization
+
+          which pip || true
+          which pip3 || true
+          which conda || true
+
+          # TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization

   test-gguf-cpu:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
@@ -62,7 +69,11 @@ jobs:
           uname -a
           echo "::endgroup::"

-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf
+          which pip || true
+          which pip3 || true
+          which conda || true
+
+          # TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf

           echo "::group::Completion"
           echo "tests complete"
@@ -84,7 +95,11 @@ jobs:
           uname -a
           echo "::endgroup::"

-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced
+          which pip || true
+          which pip3 || true
+          which conda || true
+
+          # TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced

           echo "::group::Completion"
           echo "tests complete"
@@ -106,7 +121,11 @@ jobs:
           uname -a
           echo "::endgroup::"

-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation
+          which pip || true
+          which pip3 || true
+          which conda || true
+
+          # TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation

           echo "::group::Completion"
           echo "tests complete"

.github/workflows/run-readme-pr-macos.yml

Lines changed: 18 additions & 8 deletions
@@ -33,8 +33,13 @@ jobs:
           sysctl machdep.cpu.core_count
           echo "::endgroup::"

+          which pip || true
+          which pip3 || true
+          which conda || true
+
           echo "using workaround for #1416 and #1315 by setting torchchat device explicitly"
-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme
+          export TORCHCHAT_DEVICE=cpu
+          # . .ci/scripts/run-docs readme

           echo "::group::Completion"
           echo "tests complete"
@@ -70,8 +75,9 @@ jobs:
           echo "::endgroup::"

           echo "using workaround for #1416 and #1315 by setting torchchat device explicitly"
-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization
-
+          export TORCHCHAT_DEVICE=cpu
+          # . .ci/scripts/run-docs quantization
+
           echo "::group::Completion"
           echo "tests complete"
           echo "*******************************************"
@@ -106,7 +112,8 @@ jobs:
           echo "::endgroup::"

           echo "using workaround for #1416 and #1315 by setting torchchat device explicitly"
-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf
+          export TORCHCHAT_DEVICE=cpu
+          # .ci/scripts/run-docs gguf

           echo "::group::Completion"
           echo "tests complete"
@@ -141,7 +148,8 @@ jobs:
           echo "::endgroup::"

           echo "using workaround for #1416 and #1315 by setting torchchat device explicitly"
-          TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced
+          export TORCHCHAT_DEVICE=cpu
+          # . .ci/scripts/run-docs advanced

           echo "::group::Completion"
           echo "tests complete"
@@ -175,7 +183,7 @@ jobs:
           sysctl machdep.cpu.core_count
           echo "::endgroup::"

-          .ci/scripts/run-docs evaluation
+          # .ci/scripts/run-docs evaluation

           echo "::group::Completion"
           echo "tests complete"
@@ -209,7 +217,8 @@ jobs:
           sysctl machdep.cpu.core_count
           echo "::endgroup::"

-          .ci/scripts/run-docs multimodal
+          # metadata does not install properly on macos
+          # .ci/scripts/run-docs multimodal

           echo "::group::Completion"
           echo "tests complete"
@@ -243,7 +252,8 @@ jobs:
           sysctl machdep.cpu.core_count
           echo "::endgroup::"

-          .ci/scripts/run-docs native
+          echo ".ci/scripts/run-docs native DISABLED"
+          # .ci/scripts/run-docs native

           echo "::group::Completion"
           echo "tests complete"
