
Commit abf0679
Merge branch 'main' into refactor/dist_run
2 parents 80f8138 + 7fe2c86

13 files changed (+201 lines, -151 lines)

.ci/scripts/run-docs

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ fi
 
 if [ "$1" == "readme" ]; then
   echo "::group::Create script to run README"
-  python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
+  python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1
   echo "exit 1" >> ./run-readme.sh

.github/workflows/pull.yml

Lines changed: 6 additions & 27 deletions
@@ -1092,32 +1092,11 @@ jobs:
         id: install-torchao-ops
         run: |
           bash torchchat/utils/scripts/build_torchao_ops.sh
-      - name: Set git shas
-        id: setup-hash
-        run: |
-          export TORCHCHAT_ROOT=${PWD}
-          echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
-      - name: Load or install ET
-        id: install-et
-        uses: actions/cache@v4
-        with:
-          path: |
-            ./et-build
-            ./torchchat/utils/scripts/install_et.sh
-          key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
-      - if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
-        continue-on-error: true
+      - name: Install ET
         run: |
           echo "Installing ExecuTorch"
+          export TORCHCHAT_ROOT=${PWD}
           bash torchchat/utils/scripts/install_et.sh
-      - name: Install ExecuTorch python
-        run: |
-          echo "Install ExecuTorch python"
-          export TORCHCHAT_ROOT=$PWD
-          export ET_BUILD_DIR="et-build"
-          ENABLE_ET_PYBIND="${1:-true}"
-          source "torchchat/utils/scripts/install_utils.sh"
-          install_executorch_python_libs $ENABLE_ET_PYBIND
       - name: Install runner
         run: |
           echo "Installing runner"
@@ -1132,14 +1111,14 @@ jobs:
           wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
           export PRMT="Once upon a time in a land far away"
           echo "Generate eager"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           echo "Generate compile"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile
           echo "Export and run ET (C++ runner)"
-          python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Export and run AOTI (C++ runner)"
-          python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Generate AOTI"
           python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"

README.md

Lines changed: 8 additions & 7 deletions
@@ -171,7 +171,7 @@ python3 torchchat.py download llama3.1
 <summary>Additional Model Inventory Management Commands</summary>
 
 ### Where
-This subcommand shows location of a particular model.
+This subcommand shows the location of a particular model.
 ```bash
 python3 torchchat.py where llama3.1
 ```
@@ -216,7 +216,6 @@ This mode generates text based on an input prompt.
 python3 torchchat.py generate llama3.1 --prompt "write me a story about a boy and his bear"
 ```
 
-[skip default]: end
 
 ### Server
 This mode exposes a REST API for interacting with a model.
@@ -286,14 +285,16 @@ First, follow the steps in the Server section above to start a local server. The
 streamlit run torchchat/usages/browser.py
 ```
 
+[skip default]: end
+
 Use the "Max Response Tokens" slider to limit the maximum number of tokens generated by the model for each response. Click the "Reset Chat" button to remove the message history and start a fresh chat.
 
 
 ## Desktop/Server Execution
 
 ### AOTI (AOT Inductor)
 [AOTI](https://pytorch.org/blog/pytorch2-2/) compiles models before execution for faster inference. The process creates a [DSO](https://en.wikipedia.org/wiki/Shared_library) model (represented by a file with extension `.so`)
-that is then loaded for inference. This can be done with both Python and C++ enviroments.
+that is then loaded for inference. This can be done with both Python and C++ environments.
 
 The following example exports and executes the Llama3.1 8B Instruct
 model. The first command compiles and performs the actual export.
@@ -308,9 +309,9 @@ python3 torchchat.py export llama3.1 --output-dso-path exportedModels/llama3.1.s
 For more details on quantization and what settings to use for your use
 case visit our [customization guide](docs/model_customization.md).
 
-### Run in a Python Enviroment
+### Run in a Python Environment
 
-To run in a python enviroment, use the generate subcommand like before, but include the dso file.
+To run in a python environment, use the generate subcommand like before, but include the dso file.
 
 ```
 python3 torchchat.py generate llama3.1 --dso-path exportedModels/llama3.1.so --prompt "Hello my name is"
@@ -377,7 +378,7 @@ While ExecuTorch does not focus on desktop inference, it is capable
 of doing so. This is handy for testing out PTE
 models without sending them to a physical device.
 
-Specifically there are 2 ways of doing so: Pure Python and via a Runner
+Specifically, there are 2 ways of doing so: Pure Python and via a Runner
 
 <details>
 <summary>Deploying via Python</summary>
@@ -501,7 +502,7 @@ The following assumes you've completed the steps for [Setting up ExecuTorch](#se
 and use [this script](https://github.com/pytorch/executorch/blob/main/build/build_android_llm_demo.sh) to build the AAR library.
 
 <p align="center">
-<img src="https://pytorch.org/executorch/main/_static/img/android_llama_app.png" width="600" alt="Android app running a LlaMA model">
+<img src="https://pytorch.org/executorch/main/_static/img/chat.png" width="600" alt="Android app running a LlaMA model">
 </p>
 

assets/view.jpg

93.3 KB
(binary image file; preview not shown)

docs/quantization.md

Lines changed: 15 additions & 8 deletions
@@ -121,22 +121,29 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n
 ## Experimental TorchAO lowbit kernels
 
 ### Use
-The quantization scheme a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
+
+#### linear:a8wxdq
+The quantization scheme linear:a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
 It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
 The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
 Roughly speaking, {bitwidth: 4, groupsize: 32, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
 
-You should expect high performance on ARM CPU if bitwidth is 1, 2, 3, 4, or 5 and groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
+You should expect high performance on ARM CPU if groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
+
+#### embedding:wx
+The quantization scheme embedding:wx quantizes embeddings in a groupwise manner with the specified bitwidth and groupsize. It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7) and groupsize. Unlike linear:a8wxdq, embedding:wx always quantizes with scales and zeros.
+
+You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
 
 ### Setup
-To use a8wxdq, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
+To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
 
 From the torchchat root directory, run
 ```
 sh torchchat/utils/scripts/build_torchao_ops.sh
 ```
 
-This should take about 10 seconds to complete. Once finished, you can use a8wxdq in torchchat.
+This should take about 10 seconds to complete.
 
 Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners.
 
@@ -156,17 +163,17 @@ Below we show how to use the new kernels. Except for ExecuTorch, you can specif
 
 #### Eager mode
 ```
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
 ```
 
 #### torch.compile
 ```
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
 ```
 
 #### AOTI
 ```
-OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso llama3_1.so
+OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-dso llama3_1.so
 OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time," --num-samples 5
 ```
 
@@ -178,7 +185,7 @@ OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cac
 
 #### ExecuTorch
 ```
-python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte llama3_1.pte
+python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-pte llama3_1.pte
 ```
 
 Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command.
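As a minimal sketch of how the scheme arguments described above fit together, the same `--quantize` JSON used in these commands can be assembled and passed from Python; this assumes torchchat.py sits in the working directory and the torchao kernels were built per the Setup step.

```python
# Minimal sketch: build the --quantize config documented above and hand it to
# torchchat via its CLI. Assumes the torchao experimental kernels are built.
import json
import subprocess

quantize_config = {
    "embedding:wx": {"bitwidth": 2, "groupsize": 32},
    "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": False},
}

subprocess.run(
    [
        "python3", "torchchat.py", "generate", "llama3.1",
        "--device", "cpu",
        "--dtype", "float32",
        "--quantize", json.dumps(quantize_config),
        "--prompt", "Once upon a time,",
        "--num-samples", "5",
    ],
    check=True,  # raise if torchchat exits non-zero
)
```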

install/.pins/torchao-pin.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-49b1fb61c8b8eceda755579a2fd92c756d822de2
+c8f1174a06dcc0102849c8348ca6573bde8847a9

torchchat/cli/builder.py

Lines changed: 12 additions & 15 deletions
@@ -84,19 +84,16 @@ def __post_init__(self):
         if self.dso_path and self.pte_path:
             raise RuntimeError("specify either DSO path or PTE path, but not both")
 
-        if self.checkpoint_path and (self.dso_path or self.pte_path):
-            print(
-                "Warning: checkpoint path ignored because an exported DSO or PTE path specified"
-            )
-        if self.checkpoint_dir and (self.dso_path or self.pte_path):
-            print(
-                "Warning: checkpoint dir ignored because an exported DSO or PTE path specified"
-            )
-        if self.gguf_path and (self.dso_path or self.pte_path):
-            print(
-                "Warning: GGUF path ignored because an exported DSO or PTE path specified"
-            )
-        if not (self.dso_path) and not (self.pte_path):
+        if self.dso_path or self.pte_path:
+            ignored_params = [
+                (self.checkpoint_path, "checkpoint path"),
+                (self.checkpoint_dir, "checkpoint dir"),
+                (self.gguf_path, "GGUF path"),
+            ]
+            for param, param_msg in ignored_params:
+                if param:
+                    print(f"Warning: {param_msg} ignored because an exported DSO or PTE path was specified")
+        else:
             self.prefill_possible = True
 
     @classmethod
@@ -458,7 +455,7 @@ def _maybe_init_distributed(
     return world_mesh, parallel_dims
 
 
-def _maybe_parellelize_model(
+def _maybe_parallelize_model(
     model: nn.Module,
     builder_args: BuilderArgs,
     world_mesh: DeviceMesh,
@@ -498,7 +495,7 @@ def _load_model(builder_args: BuilderArgs) -> Model:
         # model = _init_model_on_meta_device(builder_args)
     else:
         model = _load_model_default(builder_args)
-        # model = _maybe_parellelize_model(model, builder_args, world_mesh, parallel_dims)
+        model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims)
 
     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()

torchchat/cli/download.py

Lines changed: 11 additions & 4 deletions
@@ -10,7 +10,10 @@
 from pathlib import Path
 from typing import Optional
 
-from torchchat.cli.convert_hf_checkpoint import convert_hf_checkpoint, convert_hf_checkpoint_to_tune
+from torchchat.cli.convert_hf_checkpoint import (
+    convert_hf_checkpoint,
+    convert_hf_checkpoint_to_tune,
+)
 from torchchat.model_config.model_config import (
     load_model_configs,
     ModelConfig,
@@ -57,7 +60,6 @@ def _download_hf_snapshot(
         snapshot_download(
             model_config.distribution_path,
             local_dir=artifact_dir,
-            local_dir_use_symlinks=False,
             token=hf_token,
             ignore_patterns=ignore_patterns,
         )
@@ -77,9 +79,14 @@ def _download_hf_snapshot(
         raise e
 
     # Convert the Multimodal Llama model to the torchtune format.
-    if model_config.name in {"meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.2-11B-Vision"}:
+    if model_config.name in {
+        "meta-llama/Llama-3.2-11B-Vision-Instruct",
+        "meta-llama/Llama-3.2-11B-Vision",
+    }:
         print(f"Converting {model_config.name} to torchtune format...", file=sys.stderr)
-        convert_hf_checkpoint_to_tune( model_dir=artifact_dir, model_name=model_config.name)
+        convert_hf_checkpoint_to_tune(
+            model_dir=artifact_dir, model_name=model_config.name
+        )
 
     else:
         # Convert the model to the torchchat format.

torchchat/generate.py

Lines changed: 9 additions & 2 deletions
@@ -180,8 +180,15 @@ def from_args(cls, args):
 
         # Validate that all image prompts exist before expensive model load
         if image_prompts := getattr(args, "image_prompts", None):
-            if not all(os.path.exists(image_prompt) for image_prompt in image_prompts):
-                raise RuntimeError(f"Image prompt {image_prompt} does not exist")
+            non_existent_image_prompts = [
+                image_prompt
+                for image_prompt in image_prompts
+                if (not os.path.exists(image_prompt))
+            ]
+            if len(non_existent_image_prompts):
+                raise RuntimeError(
+                    f"Image prompt {non_existent_image_prompts} does not exist"
+                )
 
         return cls(
             prompt=getattr(args, "prompt", ""),

torchchat/utils/build_utils.py

Lines changed: 20 additions & 25 deletions
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+from enum import Enum
 import logging
 import os
 from pathlib import Path
@@ -78,36 +79,33 @@ def set_backend(dso, pte):
     active_builder_args_pte = pte
 
 
-def use_aoti_backend() -> bool:
+class _Backend(Enum):
+    AOTI = 0,
+    EXECUTORCH = 1
+
+
+def _active_backend() -> _Backend:
     global active_builder_args_dso
     global active_builder_args_pte
 
     # eager == aoti, which is when backend has not been explicitly set
    if (not active_builder_args_dso) and not (active_builder_args_pte):
-        return True
+        return _Backend.AOTI
 
    if active_builder_args_pte and active_builder_args_dso:
        raise RuntimeError(
            "code generation needs to choose different implementations for DSO and PTE path. Please only use one export option, and call export twice if necessary!"
        )
 
-    return bool(active_builder_args_dso)
+    return _Backend.AOTI if active_builder_args_dso else _Backend.EXECUTORCH
 
 
-def use_et_backend() -> bool:
-    global active_builder_args_dso
-    global active_builder_args_pte
-
-    # eager == aoti, which is when backend has not been explicitly set
-    if not (active_builder_args_pte or active_builder_args_dso):
-        return False
+def use_aoti_backend() -> bool:
+    return _active_backend() == _Backend.AOTI
 
-    if active_builder_args_pte and active_builder_args_dso:
-        raise RuntimeError(
-            "code generation needs to choose different implementations for DSO and PTE path. Please only use one export option, and call export twice if necessary!"
-        )
 
-    return bool(active_builder_args_pte)
+def use_et_backend() -> bool:
+    return _active_backend() == _Backend.EXECUTORCH
 
 
 ##########################################################################
@@ -142,9 +140,9 @@ def name_to_dtype(name, device):
             return torch.float16
         return torch.bfloat16
 
-    if name in name_to_dtype_dict:
+    try:
         return name_to_dtype_dict[name]
-    else:
+    except KeyError:
         raise RuntimeError(f"unsupported dtype name {name} specified")
 
 
@@ -212,10 +210,7 @@ def canonical_path(path):
 
 
 def state_dict_device(d, device="cpu") -> Dict:
-    for key, weight in d.items():
-        d[key] = weight.to(device=device)
-
-    return d
+    return {key : weight.to(device=device) for (key, weight) in d.items()}
 
 
 #########################################################################
@@ -259,9 +254,9 @@ def get_device(device) -> str:
     return torch.device(device)
 
 
-def is_cuda_or_cpu_device(device) -> bool:
-    return device == "" or str(device) == "cpu" or ("cuda" in str(device))
-
-
 def is_cpu_device(device) -> bool:
     return device == "" or str(device) == "cpu"
+
+
+def is_cuda_or_cpu_device(device) -> bool:
+    return is_cpu_device(device) or ("cuda" in str(device))
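The refactor above routes both backend queries through a single `_active_backend()` helper backed by a private enum, so the default-to-AOTI behavior and the mutual-exclusion check live in one place. A minimal usage sketch, assuming `set_backend` keeps the `(dso, pte)` signature shown in the hunk header and that the module-level flags start out unset:

```python
# Minimal sketch exercising the refactored backend helpers from build_utils.py.
# Assumes set_backend(dso, pte) and the query functions shown in the diff above.
from torchchat.utils.build_utils import set_backend, use_aoti_backend, use_et_backend

# With neither export target set, eager execution is treated as the AOTI backend.
assert use_aoti_backend() and not use_et_backend()

# Exporting to a DSO keeps the AOTI backend active...
set_backend(dso=True, pte=False)
assert use_aoti_backend()

# ...while exporting to a PTE file switches to the ExecuTorch backend.
set_backend(dso=False, pte=True)
assert use_et_backend()
```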
