import torch.distributed as dist
import torch.multiprocessing as mp
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe
+from torch._C import _SDPBackend as SDPBackend

from PIL import Image

@@ -531,6 +532,7 @@ def decode_n_tokens(
        callback=lambda _: _,
        eos_token_id: int = 2,
        eot_id: Optional[int] = None,
+        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
        **sampling_kwargs,
    ):
        new_tokens, new_probs = [], []
@@ -539,7 +541,7 @@ def decode_n_tokens(
            num_new_tokens - 1
        ):  # -1 to save space to run an EoS if dont generate it naturally
            # Actually better for Inductor to codegen attention here
-            with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
+            with torch.nn.attention.sdpa_kernel([attention_backend]):

                out_token = cur_token.clone()
                next_token, next_prob = self.decode_one_token(
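For context: torch.nn.attention.sdpa_kernel() restricts which scaled_dot_product_attention implementation may run inside the block, so threading attention_backend through here replaces the hard-coded MATH fallback with a caller-selected backend. A minimal standalone sketch of the mechanism (not part of this patch; assumes PyTorch 2.3+, where sdpa_kernel and SDPBackend are public in torch.nn.attention):

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

q = k = v = torch.randn(1, 8, 128, 64)

# Only the listed backends are eligible inside the context manager; MATH is the
# portable fallback, FLASH_ATTENTION / EFFICIENT_ATTENTION need supported hardware.
with sdpa_kernel([SDPBackend.MATH]):
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)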
@@ -683,6 +685,7 @@ def generate(
        sequential_prefill=True,
        callback=lambda x: x,
        max_seq_length: int,
+        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
        seed: Optional[int] = None,
        **sampling_kwargs,
    ) -> torch.Tensor:
@@ -799,6 +802,7 @@ def generate(
                    if self.is_llama3_model
                    else None
                ),
+                attention_backend=attention_backend,
                **sampling_kwargs,
            ):
                generated_tokens.append(generated_token.view(-1))
@@ -1122,7 +1126,7 @@ def chat(
                messages_to_encode.append(
                    {"role": "system", "content": self.system_prompt}
                )
-                messages_to_encode.append({"role": "system", "content": prompt})
+                messages_to_encode.append({"role": "user", "content": prompt})
                encoded = self.chat_formatter.encode_dialog_prompt(
                    messages_to_encode, add_generation_prompt=True,
                )
@@ -1186,6 +1190,7 @@ def callback(x, *, done_generating=False):
                start_pos=start_pos,
                skip_cache_setup=not is_first_sample,
                max_seq_length=max_seq_length,
+                attention_backend=self.builder_args.attention_backend,
            )
            for token_tensor, metrics in generator_func:
                if token_tensor is not None:
@@ -1203,8 +1208,10 @@ def callback(x, *, done_generating=False):
            if hasattr(prof, "export_chrome_trace"):
                if self.builder_args.device == "cpu":
                    print(prof.key_averages().table(sort_by="self_cpu_time_total"))
-                else:
+                elif self.builder_args.device == "cuda":
                    print(prof.key_averages().table(sort_by="self_cuda_time_total"))
+                else:
+                    print(prof.key_averages().table(sort_by="self_xpu_time_total"))
                prof.export_chrome_trace(f"{self.profile}.json")

            if start_pos >= max_seq_length:
@@ -1289,6 +1296,9 @@ def callback(x, *, done_generating=False):
            )
        if torch.cuda.is_available():
            print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
+        if torch.xpu.is_available():
+            print(f"Memory used: {torch.xpu.max_memory_reserved() / 1e9:.02f} GB")
+


class DistributedGenerator(LocalGenerator):
@@ -1615,6 +1625,8 @@ def run_generator(
    )
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
+    if torch.xpu.is_available():
+        torch.xpu.reset_peak_memory_stats()

    for _ in gen.chat(generator_args):
        pass
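The hunks above assume builder_args.attention_backend already holds an SDPBackend value; the CLI plumbing that produces it is outside this section. A hypothetical helper for mapping a string flag to the enum might look like the following (parse_attention_backend and the accepted spellings are assumptions, not code from this PR):

from torch.nn.attention import SDPBackend

# Hypothetical flag-to-enum mapping; the accepted names are assumed, not taken from the PR.
_SDP_BACKENDS = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
}

def parse_attention_backend(name: str) -> SDPBackend:
    try:
        return _SDP_BACKENDS[name.lower()]
    except KeyError:
        raise ValueError(
            f"Unknown attention backend '{name}'; expected one of {sorted(_SDP_BACKENDS)}"
        ) from None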