
Commit 95304b8
Merge branch 'main' into new-intx-quantizer
2 parents 0abe175 + fd04123

File tree: 16 files changed, +315 −190 lines

.ci/scripts/check_gibberish

Lines changed: 12 additions & 0 deletions

@@ -24,6 +24,18 @@ else
   fi
 fi
 
+#######################################################################
+#
+# check whether aspell spell check evailable
+
+if command -v aspell &> /dev/null; then
+  echo "Checking $TMPFILE for gibberish"
+else
+  echo "Aspell is not installed or not in PATH."
+  echo "Gibberish unchecked in $TMPFILE"
+  exit 0
+fi
+
 #######################################################################
 #
 # run spell check on the extracted sequence
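For context, the guard added above only gates the spell-check pass that follows; a minimal sketch of what that downstream step might look like (the flags and dictionary here are assumptions, not taken from this commit) is:

    # Hypothetical sketch: pipe the extracted text through aspell and treat
    # unrecognized words as a signal of gibberish output.
    UNKNOWN_WORDS=$(aspell list --lang=en < "$TMPFILE")
    if [ -n "$UNKNOWN_WORDS" ]; then
      echo "Possible gibberish in $TMPFILE:"
      echo "$UNKNOWN_WORDS"
    fi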

.ci/scripts/run-docs

Lines changed: 62 additions & 140 deletions

@@ -1,145 +1,67 @@
-# /bin/bash -x
+#!/bin/bash -x
 
-if [ "X$1" == "X" ]; then
+# Check if an argument was provided
+if [ -z "$1" ]; then
   echo "Must specify document to run"
   exit 1
 fi
 
-if [ "$1" == "readme" ]; then
-  echo "::group::Create script to run README"
-  python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-readme.sh
-  echo "::endgroup::"
-
-  echo "::group::Run README"
-  echo "*******************************************"
-  cat ./run-readme.sh
-  echo "*******************************************"
-  bash -x ./run-readme.sh
-  echo "::endgroup::"
-
-  exit 0
-fi
-
-if [ "$1" == "quantization" ]; then
-  echo "::group::Create script to run quantization"
-  python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-quantization.sh
-  echo "::endgroup::"
-
-  echo "::group::Run quantization"
-  echo "*******************************************"
-  cat ./run-quantization.sh
-  echo "*******************************************"
-  bash -x ./run-quantization.sh
-  echo "::endgroup::"
-
-  exit 0
-fi
-
-if [ "$1" == "gguf" ]; then
-  echo "::group::Create script to run gguf"
-  python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-gguf.sh
-  echo "::endgroup::"
-
-  echo "::group::Run gguf"
-  echo "*******************************************"
-  cat ./run-gguf.sh
-  echo "*******************************************"
-  bash -x ./run-gguf.sh
-  echo "::endgroup::"
-fi
-
-
-if [ "$1" == "advanced" ]; then
-  echo "::group::Create script to run advanced"
-  python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-advanced.sh
-  echo "::endgroup::"
-
-  echo "::group::Run advanced"
-  echo "*******************************************"
-  cat ./run-advanced.sh
-  echo "*******************************************"
-  bash -x ./run-advanced.sh
-  echo "::endgroup::"
-fi
-
-if [ "$1" == "evaluation" ]; then
-  echo "::group::Create script to run evaluation"
-  python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-evaluation.sh
-  echo "::endgroup::"
-
-  echo "::group::Run evaluation"
-  echo "*******************************************"
-  cat ./run-evaluation.sh
-  echo "*******************************************"
-  bash -x ./run-evaluation.sh
-fi
-
-if [ "$1" == "multimodal" ]; then
-
-  # Expecting that this might fail this test as-is, because
-  # it's the first on-pr test depending on github secrets for access with HF token access
-
-  echo "::group::Create script to run multimodal"
-  python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-multimodal.sh
-  echo "::endgroup::"
-
-  echo "::group::Run multimodal"
-  echo "*******************************************"
-  cat ./run-multimodal.sh
-  echo "*******************************************"
-  bash -x ./run-multimodal.sh
-  echo "::endgroup::"
-fi
-
-if [ "$1" == "native" ]; then
-
-  echo "::group::Create script to run native-execution"
-  python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-native.sh
-  echo "::endgroup::"
-
-  echo "::group::Run native-execution"
-  echo "*******************************************"
-  cat ./run-native.sh
-  echo "*******************************************"
-  bash -x ./run-native.sh
-  echo "::endgroup::"
-fi
-
-if [ "$1" == "distributed" ]; then
-
-  echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
-  # for good measure, if something happened to updown processor,
-  # and it did not error out, fail with an exit 1
-  echo "exit 1" >> ./run-distributed.sh
-  echo "::endgroup::"
-
-  echo "::group::Run distributed"
-  echo "*******************************************"
-  cat ./run-distributed.sh
-  echo "*******************************************"
-  bash -x ./run-distributed.sh
-  echo "::endgroup::"
-fi
+# Pre-initialize variables
+filepath=""
+parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN"
+script_name="./run-${1}.sh" # Dynamically initialize script name
+
+# Use a case statement to handle the $1 argument
+case "$1" in
+  "readme")
+    filepath="README.md"
+    ;;
+  "quantization")
+    filepath="docs/quantization.md"
+    ;;
+  "gguf")
+    filepath="docs/GGUF.md"
+    ;;
+  "advanced")
+    filepath="docs/ADVANCED-USERS.md"
+    ;;
+  "evaluation")
+    filepath="torchchat/utils/docs/evaluation.md"
+    ;;
+  "multimodal")
+    filepath="docs/multimodal.md"
+    parameters="" # Clear parameters
+    ;;
+  "native")
+    filepath="docs/native-execution.md"
+    parameters="" # Clear parameters
+    ;;
+  "distributed")
+    filepath="docs/distributed.md"
+    parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication
+    ;;
+  "local")
+    filepath="docs/local-model.md"
+    parameters="" # Clear parameters
+    ;;
+
+  *)
+    echo "Unknown option: $1"
+    exit 1
+    ;;
+esac
+
+# Generate the script
+echo "::group::Create script to run $1"
+python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name"
+# if something happened to updown processor, and it did not error out, fail with an exit 1
+echo "exit 1" >> "$script_name"
+echo "::endgroup::"
+
+# Run the script
+echo "::group::Run $1"
+echo "*******************************************"
+cat "$script_name"
+echo "*******************************************"
+bash -x "$script_name"
+echo "::endgroup::"

.github/workflows/more-tests.yml

Lines changed: 67 additions & 2 deletions

@@ -19,6 +19,7 @@ jobs:
       gpu-arch-version: "12.4"
       timeout: 60
       script: |
+        set -xeou pipefail
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"
@@ -39,9 +40,10 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Run inference"
-        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
         export MODEL_NAME=stories15M
-        export MODEL_DIR=/tmp
+
 
         for DTYPE in bfloat16 float16 float32; do
           ###################################################################
@@ -83,3 +85,66 @@ jobs:
         echo "tests complete"
         echo "******************************************"
         echo "::endgroup::"
+
+
+  test-sdpa-backends-export:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        set -xeou pipefail
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
+        # Install requirements
+        ./install/install_requirements.sh cuda
+        pip3 list
+        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
+        mkdir -p checkpoints/stories15M
+        pushd checkpoints/stories15M
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
+        export MODEL_NAME=stories15M
+
+        ./torchchat/utils/scripts/build_native.sh aoti
+
+        for DEVICE in cpu cuda; do
+          # depending on how the parameter passing works, may only be able to do bfloat16 for aoti_run, similar to runner-cuda-dtype.yml
+          # (although the runner environment should not have an opinion what we us in the artifact, and we might suitably abstract that)
+          for DTYPE in bfloat16 float16 float32; do
+            for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do
+              echo "***************************************************************"
+              echo "*** $DEVICE $DTYPE $SDPA"
+              ###################################################################
+              # Export DSO and run with Python
+              python torchchat.py export --output-dso dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              python torchchat.py generate --dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} --temperature 0 --prompt "Once upon a time"
+              ###################################################################
+              # Export AOTI and run with aoti_run
+              python torchchat.py export --output-aoti /tmp/model.pt2 --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time"
+              ###################################################################
+            done
+          done
+        done
+
+        echo "tests complete"
+        echo "******************************************"
+        echo "::endgroup::"

.github/workflows/run-readme-pr-mps.yml

Lines changed: 3 additions & 2 deletions

@@ -15,8 +15,8 @@ jobs:
        conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
        conda activate test-readme-mps-macos
        set -x
-       # NS: Remove previous installation of torch first
-       # as this script does not isntall anything into conda env but rather as system dep
+       # NS: Remove previous installation of torch first
+       # as this script does not install anything into conda env but rather as system dep
        pip3 uninstall -y torch || true
        set -eou pipefail
 
@@ -37,6 +37,7 @@ jobs:
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m1-14
+     timeout: 60
    script: |
      set -x
      conda create -y -n test-quantization-mps-macos python=3.10.11

README.md

Lines changed: 8 additions & 11 deletions

@@ -3,7 +3,11 @@
 torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android.
 
 > [!IMPORTANT]
-> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!!
+> Update
+>
+> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)!
+>
+> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**!
 >
 > To try it out, finish the [Installation](#Installation) section below, then hop
 > over to our [multimodal guide](docs/multimodal.md) to learn more.
@@ -75,6 +79,7 @@ aliases.
 | [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) || Alias to `granite3-8b`.|
 | [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) || Alias to `granite3.1-2b` and `granite3.1`.|
 | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) || Alias to `granite3.1-8b`.|
+| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) || Alias to `deepseek-r1:8b`.|
 
 
 ## Installation
@@ -413,7 +418,7 @@ torchchat/utils/scripts/build_native.sh et
 
 Execute using the runner
 ```bash
-cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
+cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time"
 ```
 
 </details>
@@ -442,15 +447,7 @@ The following assumes you've completed the steps for [Setting up ExecuTorch](#se
 ```bash
 open et-build/src/executorch/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj
 ```
-
-> Note: If you're running into any issues related to package dependencies, close Xcode, clean some of the caches and/or the build products, and open the Xcode project again:
-> ```bash
-> rm -rf \
->   ~/Library/org.swift.swiftpm \
->   ~/Library/Caches/org.swift.swiftpm \
->   ~/Library/Caches/com.apple.dt.Xcode \
->   ~/Library/Developer/Xcode/DerivedData
-> ```
+
 2. Click the Play button to launch the app in the Simulator.
 
 3. To run on a device, ensure you have it set up for development and a provisioning profile with the `increased-memory-limit` entitlement. Update the app's bundle identifier to match your provisioning profile with the required capability.
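Given the `deepseek-r1:8b` alias added to the model table above, the new model should be usable like any other torchchat alias. A hedged sketch (assuming the alias resolves end to end and Hugging Face access is configured for the gated download):

    # Fetch the DeepSeek R1 Distill Llama 8B weights, then generate with them.
    python3 torchchat.py download deepseek-r1:8b
    python3 torchchat.py generate deepseek-r1:8b --prompt "Once upon a time"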

docs/quantization.md

Lines changed: 2 additions & 2 deletions

@@ -186,7 +186,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --aoti-package-path lla
 If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner:
 
 ```
-OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
+OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3
 ```
 
 #### ExecuTorch
@@ -197,7 +197,7 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e
 Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command.
 
 ```
-./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
+./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time,"
 ```
 
 ## Experimental TorchAO MPS lowbit kernels
