Commit 99c33e8

Merge branch 'main' into refactor/distributed_inference_without_abstraction

2 parents 924a096 + 4fdbe10
File tree

10 files changed: +124 −28 lines changed

.ci/scripts/run-docs

Lines changed: 18 additions & 4 deletions
@@ -75,9 +75,6 @@ if [ "$1" == "advanced" ]; then
 fi

 if [ "$1" == "evaluation" ]; then
-
-  exit 0
-
   echo "::group::Create script to run evaluation"
   python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh
   # for good measure, if something happened to updown processor,

@@ -95,7 +92,7 @@ fi
 if [ "$1" == "multimodal" ]; then

   # Expecting that this might fail this test as-is, because
-  # it's the first on-pr test depending on githib secrets for access with HF token access
+  # it's the first on-pr test depending on github secrets for access with HF token access

   echo "::group::Create script to run multimodal"
   python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh

@@ -111,3 +108,20 @@ if [ "$1" == "multimodal" ]; then
   bash -x ./run-multimodal.sh
   echo "::endgroup::"
 fi
+
+if [ "$1" == "native" ]; then
+
+  echo "::group::Create script to run native-execution"
+  python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh
+  # for good measure, if something happened to updown processor,
+  # and it did not error out, fail with an exit 1
+  echo "exit 1" >> ./run-native.sh
+  echo "::endgroup::"
+
+  echo "::group::Run native-execution"
+  echo "*******************************************"
+  cat ./run-native.sh
+  echo "*******************************************"
+  bash -x ./run-native.sh
+  echo "::endgroup::"
+fi
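The new `native` branch mirrors the `evaluation` and `multimodal` branches above: generate a runnable script from the markdown docs with updown.py, append a sentinel, then execute it. A minimal sketch for exercising it locally, assuming a torchchat checkout as the working directory:

  # sketch: run the new branch of the script locally
  bash .ci/scripts/run-docs native

The appended `echo "exit 1" >> ./run-native.sh` is the safety net: if the updown processor emitted a truncated script without erroring out, the generated script ends in exit 1 and the step fails instead of passing vacuously.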

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions
@@ -731,6 +731,7 @@ jobs:

       git clone https://github.com/ggerganov/llama.cpp.git
       pushd llama.cpp
+      git checkout 64ed2091b24b2f9747148fdf49a34ed5938762c3
       make
       popd
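Pinning llama.cpp to an exact commit keeps this job reproducible even if upstream main breaks. Should clone time become a concern, the same pin can be fetched shallowly; a sketch using standard git commands (an alternative pattern, not what this commit does):

  git init llama.cpp && pushd llama.cpp
  git remote add origin https://github.com/ggerganov/llama.cpp.git
  # fetch only the pinned commit instead of the full history
  git fetch --depth 1 origin 64ed2091b24b2f9747148fdf49a34ed5938762c3
  git checkout FETCH_HEAD
  make
  popd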

.github/workflows/run-readme-pr-mps.yml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
       runner: macos-m1-14
+      timeout-minutes: 50
       script: |
         conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
         conda activate test-readme-mps-macos

.github/workflows/run-readme-pr.yml

Lines changed: 43 additions & 0 deletions
@@ -287,3 +287,46 @@ jobs:
         echo "::endgroup::"

         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal
+
+  test-native-any:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs native
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
+
+  test-native-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native
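Both new jobs assume an objcopy new enough to understand --set-section-alignment, hence the devtoolset-10 install. A quick sanity check that the right binary is first on PATH, sketched under the assumption of the same RHEL/CentOS devtoolset layout as the CI runner:

  export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
  which objcopy    # should resolve inside devtoolset-10
  objcopy --help | grep -q -- '--set-section-alignment' && echo "objcopy OK" || echo "objcopy too old"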

README.md

Lines changed: 12 additions & 10 deletions
@@ -45,16 +45,16 @@ aliases.

 | Model | Mobile Friendly | Notes |
 |------------------|---|---------------------|
-|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)||Tuned for `chat` . Alias to `llama3.2-3b`.|
+|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)||Tuned for `chat`. Alias to `llama3.2-3b`.|
 |[meta-llama/Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)||Best for `generate`. Alias to `llama3.2-3b-base`.|
-|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)||Tuned for classification . Alias to `llama3-1b-guard`.|
-|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)||Tuned for `chat` . Alias to `llama3.2-1b`.|
+|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)||Tuned for classification. Alias to `llama3-1b-guard`.|
+|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)||Tuned for `chat`. Alias to `llama3.2-1b`.|
 |[meta-llama/Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)||Best for `generate`. Alias to `llama3.2-1b-base`.|
-|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat` . Alias to `llama3.2-11B`.|
-|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate` . Alias to `llama3.2-11B-base`.|
-|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)||Tuned for `chat` . Alias to `llama3.1`.|
+|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat`. Alias to `llama3.2-11B`.|
+|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate`. Alias to `llama3.2-11B-base`.|
+|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)||Tuned for `chat`. Alias to `llama3.1`.|
 |[meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)||Best for `generate`. Alias to `llama3.1-base`.|
-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)||Tuned for `chat` . Alias to `llama3`.|
+|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)||Tuned for `chat`. Alias to `llama3`.|
 |[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)||Best for `generate`. Alias to `llama3-base`.|
 |[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)||Tuned for `chat`. Alias to `llama2`.|
 |[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)||Tuned for `chat`. Alias to `llama2-13b-chat`.|

@@ -231,6 +231,8 @@ python3 torchchat.py server llama3.1
 ```
 [skip default]: end

+[shell default]: python3 torchchat.py server llama3.1 & server_pid=$!
+
 In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond.

 > [!NOTE]

@@ -244,8 +246,6 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream

 **Example Input + Output**

-[skip default]: begin
-
 ```
 curl http://127.0.0.1:5000/v1/chat/completions \
   -H "Content-Type: application/json" \

@@ -265,12 +265,14 @@ curl http://127.0.0.1:5000/v1/chat/completions \
   ]
 }'
 ```
+[skip default]: begin
 ```
 {"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"}
 ```

 [skip default]: end

+[shell default]: kill ${server_pid}

 </details>

@@ -664,6 +666,6 @@ awesome libraries and tools you've built around local LLM inference.

 torchchat is released under the [BSD 3 license](LICENSE). (Additional
 code in this distribution is covered by the MIT and Apache Open Source
-licenses.) However you may have other legal obligations that govern
+licenses.) However, you may have other legal obligations that govern
 your use of content, such as the terms of service for third-party
 models.
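The context above notes that setting `stream` to "true" emits the response in chunks. A sketch of that variant of the documented curl call; the request schema is assumed to match the full example elided from this diff, and the message body is illustrative:

  curl http://127.0.0.1:5000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
      "model": "llama3.1",
      "stream": true,
      "messages": [{"role": "user", "content": "Hello!"}]
    }'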

docs/ADVANCED-USERS.md

Lines changed: 6 additions & 6 deletions
@@ -1,22 +1,20 @@
 > [!WARNING]
 > Files in this directory may be outdated, incomplete, scratch notes, or a WIP. torchchat provides no guarantees on these files as references. Please refer to the root README for stable features and documentation.

-# Torchchat is still in pre-release!
-
-
-Torchchat is currently in a pre-release state and under extensive development.
-

 # The Lost Manual: torchchat

 [**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Download**](#download) | [**Chat**](#chat) | [**Generate**](#generate) | [**Eval**](#eval) | [**Export**](#export) | [**Supported Systems**](#supported-systems) | [**Contributing**](#contributing) | [**License**](#license)

+<!--
+
 [shell default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" huggingface-cli login

 [shell default]: ./install/install_requirements.sh

 [shell default]: TORCHCHAT_ROOT=${PWD} ./torchchat/utils/scripts/install_et.sh

+-->

 This is the advanced users' guide, if you're looking to get started
 with LLMs, please refer to the README at the root directory of the

@@ -251,6 +249,8 @@ To improve performance, you can compile the model with `--compile`
 trading off the time to first token processed with time per token. To
 improve performance further, you may also compile the prefill with
 `--compile-prefill`. This will increase further compilation times though.
+For CPU, you can use `--max-autotune` to further improve the performance
+with `--compile` and `compile-prefill`. See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).

 Parallel prefill is not yet supported by exported models, and may be
 supported in a future release.

@@ -463,7 +463,7 @@ significant impact on accuracy.

 ## Native (Stand-Alone) Execution of Exported Models

-Refer to the [README](README.md] for an introduction to native
+Refer to the [README](README.md) for an introduction to native
 execution on servers, desktops, and laptops. Mobile and Edge execution for Android and iOS are
 described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md], respectively.
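The added note describes combining --max-autotune with the compile flags on CPU. An illustrative invocation of that combination (model and prompt are placeholders; flags spelled as in the added text):

  python3 torchchat.py generate stories15M --device cpu --compile --compile-prefill --max-autotune --prompt "Once upon a time"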

docs/model_customization.md

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,9 @@ prefill with `--compile_prefill`.

 To learn more about compilation, check out: https://pytorch.org/get-started/pytorch-2.0/

+For CPU, you can use `--max-autotune` to further improve the performance with `--compile` and `compile-prefill`.
+
+See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).

 ## Model Precision


install/.pins/et-pin.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-72b3bb3194c611f7c4861e6f3b24af5de868af72
+98e4dd524f2cb08414ee015b27616229cabc06ba
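This pin names the ExecuTorch commit that the install scripts check out when building ExecuTorch from source. A sketch of the typical consumption pattern; the variable name and checkout location are illustrative, not verbatim from install_et.sh:

  ET_PIN=$(cat install/.pins/et-pin.txt)
  git -C executorch fetch origin "${ET_PIN}"
  git -C executorch checkout "${ET_PIN}"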

install/install_requirements.sh

Lines changed: 3 additions & 3 deletions
@@ -62,13 +62,13 @@ echo "Using pip executable: $PIP_EXECUTABLE"
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-PYTORCH_NIGHTLY_VERSION=dev20241002
+PYTORCH_NIGHTLY_VERSION=dev20241013

 # Nightly version for torchvision
-VISION_NIGHTLY_VERSION=dev20241002
+VISION_NIGHTLY_VERSION=dev20241013

 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20241010
+TUNE_NIGHTLY_VERSION=dev20241013

 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
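With this change the three nightly pins agree on dev20241013, avoiding a mix of torch, torchvision, and torchtune nightlies built on different days. These variables feed the pip installs later in the script; a sketch of the general pattern, with the package version prefix and index URL assumed rather than taken from this file:

  PYTORCH_NIGHTLY_VERSION=dev20241013
  pip3 install --pre "torch==2.6.0.${PYTORCH_NIGHTLY_VERSION}" \
    --extra-index-url https://download.pytorch.org/whl/nightly/cpu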

torchchat/utils/docs/evaluation.md

Lines changed: 36 additions & 4 deletions
@@ -4,8 +4,13 @@

 # Evaluation Features

+<!--
+
 [shell default]: ./install/install_requirements.sh

+[shell default]: TORCHCHAT_ROOT=${PWD} ./torchchat/utils/scripts/install_et.sh
+
+-->

 Torchchat provides evaluation functionality for your language model on
 a variety of tasks using the

@@ -14,26 +19,53 @@ library.

 ## Usage

-The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, it will default to evaluating on "wikitext".
+The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, the task will default to evaluating on "wikitext".
+
+## Examples

-**Examples**
+### Evaluation example with model in Python

 Running wikitext for 10 iterations
 ```
 python3 torchchat.py eval stories15M --tasks wikitext --limit 10
 ```

-Running an exported model
+Running wikitext with torch.compile for 10 iterations
+```
+python3 torchchat.py eval stories15M --compile --tasks wikitext --limit 10
+```
+
+Running multiple tasks and calling eval.py directly (with torch.compile):
+```
+python3 torchchat.py eval stories15M --compile --tasks wikitext hellaswag
+```
+
+### Evaluation with model exported to PTE with ExecuTorch
+
+Running an exported model with ExecuTorch (as PTE)
 ```
 python3 torchchat.py export stories15M --output-pte-path stories15M.pte
 python3 torchchat.py eval stories15M --pte-path stories15M.pte
 ```

-Running multiple tasks and calling eval.py directly:
+Running multiple tasks and calling eval.py directly (with PTE):
 ```
 python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag
 ```

+### Evaluation with model exported to DSO with AOT Inductor (AOTI)
+
+Running an exported model with AOT Inductor (DSO model)
+```
+python3 torchchat.py export stories15M --dtype fast16 --output-dso-path stories15M.so
+python3 torchchat.py eval stories15M --dtype fast16 --dso-path stories15M.so
+```
+
+Running multiple tasks and calling eval.py directly (with AOTI):
+```
+python3 torchchat.py eval stories15M --dso-path stories15M.so --tasks wikitext hellaswag
+```
+
 For more information and a list of tasks/metrics see [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).

 [end default]: end
