Commit 99c33e8

Merge branch 'main' into refactor/distributed_inference_without_abstraction

2 parents 924a096 + 4fdbe10
File tree

10 files changed: +124 −28 lines changed

.ci/scripts/run-docs

Lines changed: 18 additions & 4 deletions
@@ -75,9 +75,6 @@ if [ "$1" == "advanced" ]; then
 fi

 if [ "$1" == "evaluation" ]; then
-
-  exit 0
-
   echo "::group::Create script to run evaluation"
   python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh
   # for good measure, if something happened to updown processor,

@@ -95,7 +92,7 @@ fi
 if [ "$1" == "multimodal" ]; then

   # Expecting that this might fail this test as-is, because
-  # it's the first on-pr test depending on githib secrets for access with HF token access
+  # it's the first on-pr test depending on github secrets for access with HF token access

   echo "::group::Create script to run multimodal"
   python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh

@@ -111,3 +108,20 @@ if [ "$1" == "multimodal" ]; then
   bash -x ./run-multimodal.sh
   echo "::endgroup::"
 fi
+
+if [ "$1" == "native" ]; then
+
+  echo "::group::Create script to run native-execution"
+  python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh
+  # for good measure, if something happened to updown processor,
+  # and it did not error out, fail with an exit 1
+  echo "exit 1" >> ./run-native.sh
+  echo "::endgroup::"
+
+  echo "::group::Run native-execution"
+  echo "*******************************************"
+  cat ./run-native.sh
+  echo "*******************************************"
+  bash -x ./run-native.sh
+  echo "::endgroup::"
+fi
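The new `native` branch mirrors the `evaluation` and `multimodal` branches above: generate a runnable script from the markdown docs with updown.py, append a sentinel, then execute it. A minimal sketch for exercising it locally, assuming a torchchat checkout as the working directory:

  # sketch: run the new branch of the script locally
  bash .ci/scripts/run-docs native

The appended `echo "exit 1" >> ./run-native.sh` is the safety net: if the updown processor emitted a truncated script without erroring out, the generated script ends in exit 1 and the step fails instead of passing vacuously.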

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions
@@ -731,6 +731,7 @@ jobs:

       git clone https://github.com/ggerganov/llama.cpp.git
       pushd llama.cpp
+      git checkout 64ed2091b24b2f9747148fdf49a34ed5938762c3
       make
       popd
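Pinning llama.cpp to an exact commit keeps this job reproducible even if upstream main breaks. Should clone time become a concern, the same pin can be fetched shallowly; a sketch using standard git commands (an alternative pattern, not what this commit does):

  git init llama.cpp && pushd llama.cpp
  git remote add origin https://github.com/ggerganov/llama.cpp.git
  # fetch only the pinned commit instead of the full history
  git fetch --depth 1 origin 64ed2091b24b2f9747148fdf49a34ed5938762c3
  git checkout FETCH_HEAD
  make
  popd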

.github/workflows/run-readme-pr-mps.yml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
       runner: macos-m1-14
+      timeout-minutes: 50
       script: |
         conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
         conda activate test-readme-mps-macos

.github/workflows/run-readme-pr.yml

Lines changed: 43 additions & 0 deletions
@@ -287,3 +287,46 @@ jobs:
         echo "::endgroup::"

         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal
+
+  test-native-any:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs native
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
+
+  test-native-cpu:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+
+        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native
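Both new jobs assume an objcopy new enough to understand --set-section-alignment, hence the devtoolset-10 install. A quick sanity check that the right binary is first on PATH, sketched under the assumption of the same RHEL/CentOS devtoolset layout as the CI runner:

  export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
  which objcopy    # should resolve inside devtoolset-10
  objcopy --help | grep -q -- '--set-section-alignment' && echo "objcopy OK" || echo "objcopy too old"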

README.md

Lines changed: 12 additions & 10 deletions
@@ -45,16 +45,16 @@ aliases.

 | Model | Mobile Friendly | Notes |
 |------------------|---|---------------------|
-|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)||Tuned for `chat` . Alias to `llama3.2-3b`.|
+|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)||Tuned for `chat`. Alias to `llama3.2-3b`.|
 |[meta-llama/Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)||Best for `generate`. Alias to `llama3.2-3b-base`.|
-|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)||Tuned for classification . Alias to `llama3-1b-guard`.|
-|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)||Tuned for `chat` . Alias to `llama3.2-1b`.|
+|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)||Tuned for classification. Alias to `llama3-1b-guard`.|
+|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)||Tuned for `chat`. Alias to `llama3.2-1b`.|
 |[meta-llama/Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)||Best for `generate`. Alias to `llama3.2-1b-base`.|
-|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat` . Alias to `llama3.2-11B`.|
-|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate` . Alias to `llama3.2-11B-base`.|
-|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)||Tuned for `chat` . Alias to `llama3.1`.|
+|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat`. Alias to `llama3.2-11B`.|
+|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate`. Alias to `llama3.2-11B-base`.|
+|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)||Tuned for `chat`. Alias to `llama3.1`.|
 |[meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)||Best for `generate`. Alias to `llama3.1-base`.|
-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)||Tuned for `chat` . Alias to `llama3`.|
+|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)||Tuned for `chat`. Alias to `llama3`.|
 |[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)||Best for `generate`. Alias to `llama3-base`.|
 |[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)||Tuned for `chat`. Alias to `llama2`.|
 |[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)||Tuned for `chat`. Alias to `llama2-13b-chat`.|

@@ -231,6 +231,8 @@ python3 torchchat.py server llama3.1
 ```
 [skip default]: end

+[shell default]: python3 torchchat.py server llama3.1 & server_pid=$!
+
 In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond.

 > [!NOTE]

@@ -244,8 +246,6 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream

 **Example Input + Output**

-[skip default]: begin
-
 ```
 curl http://127.0.0.1:5000/v1/chat/completions \
   -H "Content-Type: application/json" \

@@ -265,12 +265,14 @@ curl http://127.0.0.1:5000/v1/chat/completions \
   ]
 }'
 ```
+[skip default]: begin
 ```
 {"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"}
 ```

 [skip default]: end

+[shell default]: kill ${server_pid}

 </details>

@@ -664,6 +666,6 @@ awesome libraries and tools you've built around local LLM inference.

 torchchat is released under the [BSD 3 license](LICENSE). (Additional
 code in this distribution is covered by the MIT and Apache Open Source
-licenses.) However you may have other legal obligations that govern
+licenses.) However, you may have other legal obligations that govern
 your use of content, such as the terms of service for third-party
 models.
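The context above notes that setting `stream` to "true" emits the response in chunks. A sketch of that variant of the documented curl call; the request schema is assumed to match the full example elided from this diff, and the message body is illustrative:

  curl http://127.0.0.1:5000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
      "model": "llama3.1",
      "stream": true,
      "messages": [{"role": "user", "content": "Hello!"}]
    }'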

docs/ADVANCED-USERS.md

Lines changed: 6 additions & 6 deletions
@@ -1,22 +1,20 @@
 > [!WARNING]
 > Files in this directory may be outdated, incomplete, scratch notes, or a WIP. torchchat provides no guarantees on these files as references. Please refer to the root README for stable features and documentation.

-# Torchchat is still in pre-release!
-
-
-Torchchat is currently in a pre-release state and under extensive development.
-

 # The Lost Manual: torchchat

 [**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Download**](#download) | [**Chat**](#chat) | [**Generate**](#generate) | [**Eval**](#eval) | [**Export**](#export) | [**Supported Systems**](#supported-systems) | [**Contributing**](#contributing) | [**License**](#license)

+<!--
+
 [shell default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" huggingface-cli login

 [shell default]: ./install/install_requirements.sh

 [shell default]: TORCHCHAT_ROOT=${PWD} ./torchchat/utils/scripts/install_et.sh

+-->

 This is the advanced users' guide, if you're looking to get started
 with LLMs, please refer to the README at the root directory of the

@@ -251,6 +249,8 @@ To improve performance, you can compile the model with `--compile`
 trading off the time to first token processed with time per token. To
 improve performance further, you may also compile the prefill with
 `--compile-prefill`. This will increase further compilation times though.
+For CPU, you can use `--max-autotune` to further improve the performance
+with `--compile` and `compile-prefill`. See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).

 Parallel prefill is not yet supported by exported models, and may be
 supported in a future release.

@@ -463,7 +463,7 @@ significant impact on accuracy.

 ## Native (Stand-Alone) Execution of Exported Models

-Refer to the [README](README.md] for an introduction to native
+Refer to the [README](README.md) for an introduction to native
 execution on servers, desktops, and laptops. Mobile and Edge execution for Android and iOS are
 described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md], respectively.
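The added note describes combining --max-autotune with the compile flags on CPU. An illustrative invocation of that combination (model and prompt are placeholders; flags spelled as in the added text):

  python3 torchchat.py generate stories15M --device cpu --compile --compile-prefill --max-autotune --prompt "Once upon a time"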

docs/model_customization.md

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,9 @@ prefill with `--compile_prefill`.

 To learn more about compilation, check out: https://pytorch.org/get-started/pytorch-2.0/

+For CPU, you can use `--max-autotune` to further improve the performance with `--compile` and `compile-prefill`.
+
+See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).

 ## Model Precision


install/.pins/et-pin.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-72b3bb3194c611f7c4861e6f3b24af5de868af72
+98e4dd524f2cb08414ee015b27616229cabc06ba
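This pin names the ExecuTorch commit that the install scripts check out when building ExecuTorch from source. A sketch of the typical consumption pattern; the variable name and checkout location are illustrative, not verbatim from install_et.sh:

  ET_PIN=$(cat install/.pins/et-pin.txt)
  git -C executorch fetch origin "${ET_PIN}"
  git -C executorch checkout "${ET_PIN}"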

install/install_requirements.sh

Lines changed: 3 additions & 3 deletions
@@ -62,13 +62,13 @@ echo "Using pip executable: $PIP_EXECUTABLE"
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-PYTORCH_NIGHTLY_VERSION=dev20241002
+PYTORCH_NIGHTLY_VERSION=dev20241013

 # Nightly version for torchvision
-VISION_NIGHTLY_VERSION=dev20241002
+VISION_NIGHTLY_VERSION=dev20241013

 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20241010
+TUNE_NIGHTLY_VERSION=dev20241013

 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
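With this change the three nightly pins agree on dev20241013, avoiding a mix of torch, torchvision, and torchtune nightlies built on different days. These variables feed the pip installs later in the script; a sketch of the general pattern, with the package version prefix and index URL assumed rather than taken from this file:

  PYTORCH_NIGHTLY_VERSION=dev20241013
  pip3 install --pre "torch==2.6.0.${PYTORCH_NIGHTLY_VERSION}" \
    --extra-index-url https://download.pytorch.org/whl/nightly/cpu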

torchchat/utils/docs/evaluation.md

Lines changed: 36 additions & 4 deletions
@@ -4,8 +4,13 @@

 # Evaluation Features

+<!--
+
 [shell default]: ./install/install_requirements.sh

+[shell default]: TORCHCHAT_ROOT=${PWD} ./torchchat/utils/scripts/install_et.sh
+
+-->

 Torchchat provides evaluation functionality for your language model on
 a variety of tasks using the

@@ -14,26 +19,53 @@ library.

 ## Usage

-The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, it will default to evaluating on "wikitext".
+The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, the task will default to evaluating on "wikitext".
+
+## Examples

-**Examples**
+### Evaluation example with model in Python

 Running wikitext for 10 iterations
 ```
 python3 torchchat.py eval stories15M --tasks wikitext --limit 10
 ```

-Running an exported model
+Running wikitext with torch.compile for 10 iterations
+```
+python3 torchchat.py eval stories15M --compile --tasks wikitext --limit 10
+```
+
+Running multiple tasks and calling eval.py directly (with torch.compile):
+```
+python3 torchchat.py eval stories15M --compile --tasks wikitext hellaswag
+```
+
+### Evaluation with model exported to PTE with ExecuTorch
+
+Running an exported model with ExecuTorch (as PTE)
 ```
 python3 torchchat.py export stories15M --output-pte-path stories15M.pte
 python3 torchchat.py eval stories15M --pte-path stories15M.pte
 ```

-Running multiple tasks and calling eval.py directly:
+Running multiple tasks and calling eval.py directly (with PTE):
 ```
 python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag
 ```

+### Evaluation with model exported to DSO with AOT Inductor (AOTI)
+
+Running an exported model with AOT Inductor (DSO model)
+```
+python3 torchchat.py export stories15M --dtype fast16 --output-dso-path stories15M.so
+python3 torchchat.py eval stories15M --dtype fast16 --dso-path stories15M.so
+```
+
+Running multiple tasks and calling eval.py directly (with AOTI):
+```
+python3 torchchat.py eval stories15M --dso-path stories15M.so --tasks wikitext hellaswag
+```
+
 For more information and a list of tasks/metrics see [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).

 [end default]: end
