
Commit e901687

Merge branch 'pytorch:main' into main
2 parents 3cf1114 + de74961 commit e901687

File tree

11 files changed: +68 −33 lines changed


backends/arm/test/ops/test_layer_norm.py

Lines changed: 4 additions & 4 deletions
@@ -157,9 +157,9 @@ def test_layer_norm_tosa_BI(
 
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
     # Skip tests that require transposes.
-    @parameterized.expand(test_data_suite[:-2])
+    @parameterized.expand(test_data_suite)
     @unittest.expectedFailure
-    def test_layer_norm_u55_BI(
+    def test_layer_norm_u55_BI_xfails(
         self,
         test_name: str,
         test_data: torch.Tensor,
@@ -171,7 +171,8 @@ def test_layer_norm_u55_BI(
 
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
     @parameterized.expand(test_data_suite[:-2])
-    def test_layer_norm_u85_BI_fvp(
+    @unittest.expectedFailure
+    def test_layer_norm_u85_BI_xfails(
         self,
         test_name: str,
         test_data: torch.Tensor,
@@ -182,7 +183,6 @@ def test_layer_norm_u85_BI_fvp(
         )
 
     @parameterized.expand(test_data_suite[-2:])
-    @unittest.skip # Flaky
     def test_layer_norm_u85_BI(
         self,
         test_name: str,
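
For context, a minimal sketch of the decorator pattern this diff settles on: every parameterized case of a known-failing test carries @unittest.expectedFailure and an _xfails suffix, so a passing run shows up as an unexpected success. The module, suite contents, and failing body below are hypothetical stand-ins, not code from this commit.

```python
import unittest

from parameterized import parameterized

# Hypothetical stand-in for the real test data suite.
test_data_suite = [
    ("randn", [0.1, -0.3]),
    ("ones", [1.0, 1.0]),
]


class TestLayerNormPattern(unittest.TestCase):
    # Expand every case; each generated test is expected to fail.
    @parameterized.expand(test_data_suite)
    @unittest.expectedFailure
    def test_op_u55_BI_xfails(self, test_name: str, test_data):
        # Placeholder for the known numerical issue on FVP (MLETORCH-521).
        self.assertTrue(False, f"{test_name}: known failure")


if __name__ == "__main__":
    unittest.main()
```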

backends/vulkan/docs/android_demo.md

Lines changed: 1 addition & 0 deletions
@@ -59,6 +59,7 @@ partially lower the Llama model to Vulkan.
 # The files will usually be downloaded to ~/.llama
 python -m examples.models.llama.export_llama \
   --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \
+  --model "llama3_2" \
   -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \
   -p ~/.llama/checkpoints/Llama3.2-1B/params.json \
   --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

devtools/bundled_program/core.py

Lines changed: 20 additions & 12 deletions
@@ -9,7 +9,6 @@
 from typing import Dict, List, Optional, Sequence, Type, Union
 
 import executorch.devtools.bundled_program.schema as bp_schema
-from pyre_extensions import none_throws
 
 import executorch.exir.schema as core_schema
 
@@ -44,10 +43,12 @@ class BundledProgram:
 
     def __init__(
         self,
-        executorch_program: Optional[Union[
-            ExecutorchProgram,
-            ExecutorchProgramManager,
-        ]],
+        executorch_program: Optional[
+            Union[
+                ExecutorchProgram,
+                ExecutorchProgramManager,
+            ]
+        ],
         method_test_suites: Sequence[MethodTestSuite],
         pte_file_path: Optional[str] = None,
     ):
@@ -59,18 +60,24 @@ def __init__(
             pte_file_path: The path to pte file to deserialize program if executorch_program is not provided.
         """
         if not executorch_program and not pte_file_path:
-            raise RuntimeError("Either executorch_program or pte_file_path must be provided")
+            raise RuntimeError(
+                "Either executorch_program or pte_file_path must be provided"
+            )
 
         if executorch_program and pte_file_path:
-            raise RuntimeError("Only one of executorch_program or pte_file_path can be used")
+            raise RuntimeError(
+                "Only one of executorch_program or pte_file_path can be used"
+            )
 
         method_test_suites = sorted(method_test_suites, key=lambda x: x.method_name)
         if executorch_program:
             self._assert_valid_bundle(executorch_program, method_test_suites)
-            self.executorch_program: Optional[Union[
-                ExecutorchProgram,
-                ExecutorchProgramManager,
-            ]] = executorch_program
+            self.executorch_program: Optional[
+                Union[
+                    ExecutorchProgram,
+                    ExecutorchProgramManager,
+                ]
+            ] = executorch_program
         self._pte_file_path: Optional[str] = pte_file_path
 
         self.method_test_suites = method_test_suites
@@ -88,7 +95,8 @@ def serialize_to_schema(self) -> bp_schema.BundledProgram:
         if self.executorch_program:
             program = self._extract_program(self.executorch_program)
         else:
-            with open(none_throws(self._pte_file_path), "rb") as f:
+            assert self._pte_file_path is not None
+            with open(self._pte_file_path, "rb") as f:
                 p_bytes = f.read()
             program = _deserialize_pte_binary(p_bytes)
 
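
With the pyre_extensions dependency dropped, the constructor itself now guards the two construction modes. A rough usage sketch, assuming an ExecuTorch install; the .pte path and test suites are placeholders for real artifacts, not part of this commit.

```python
from executorch.devtools.bundled_program.core import BundledProgram

# Passing neither argument raises immediately (new explicit check).
try:
    BundledProgram(executorch_program=None, method_test_suites=[])
except RuntimeError as err:
    print(err)  # "Either executorch_program or pte_file_path must be provided"

# Deserializing from a .pte file on disk instead of an in-memory program;
# "model.pte" and method_test_suites are hypothetical placeholders.
# bundled = BundledProgram(
#     executorch_program=None,
#     method_test_suites=method_test_suites,
#     pte_file_path="model.pte",
# )
```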

devtools/bundled_program/test/test_bundle_data.py

Lines changed: 11 additions & 4 deletions
@@ -6,9 +6,10 @@
 
 # pyre-strict
 
+import tempfile
 import unittest
 from typing import List
-import tempfile
+
 import executorch.devtools.bundled_program.schema as bp_schema
 
 import torch
@@ -73,7 +74,7 @@ def test_bundled_program(self) -> None:
             bundled_program.serialize_to_schema().program,
             bytes(_serialize_pte_binary(executorch_program.executorch_program)),
         )
-
+
     def test_bundled_program_from_pte(self) -> None:
         executorch_program, method_test_suites = get_common_executorch_program()
 
@@ -82,11 +83,17 @@ def test_bundled_program_from_pte(self) -> None:
         with open(executorch_model_path, "wb") as f:
             f.write(executorch_program.buffer)
 
-        bundled_program = BundledProgram(executorch_program=None, method_test_suites=method_test_suites, pte_file_path=executorch_model_path)
+        bundled_program = BundledProgram(
+            executorch_program=None,
+            method_test_suites=method_test_suites,
+            pte_file_path=executorch_model_path,
+        )
 
         method_test_suites = sorted(method_test_suites, key=lambda t: t.method_name)
 
-        for plan_id in range(len(executorch_program.executorch_program.execution_plan)):
+        for plan_id in range(
+            len(executorch_program.executorch_program.execution_plan)
+        ):
             bundled_plan_test = (
                 bundled_program.serialize_to_schema().method_test_suites[plan_id]
             )

examples/arm/setup.sh

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ tosa_reference_model_rev="c5570b79e90c3a36ab8c4ddb8ee3fbc2cd3f7c38"
 
 # vela
 vela_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u-vela"
-vela_rev="a08fc18780827b5fefc814dd0162ee6317ce0ae7"
+vela_rev="5427dc7e9c1a4c7d554163290faeea75f168772d"
 
 ########
 ### Mandatory user args

examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md

Lines changed: 3 additions & 3 deletions
@@ -56,14 +56,14 @@ In this demo app, we support text-only inference with up-to-date Llama models an
 Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
 ```
 
 ### For Llama 3.2 1B and 3B QAT+LoRA models
 Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
 ```
 
 ### For Llama 3.2 1B and 3B BF16 models
@@ -72,7 +72,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
 * Export Llama model and generate .pte file as below:
 
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
 ```
 
 For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-).

examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md

Lines changed: 3 additions & 3 deletions
@@ -48,14 +48,14 @@ sh examples/models/llama/install_requirements.sh
 Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
 ```
 
 ### For Llama 3.2 1B and 3B QAT+LoRA models
 Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
 ```
 
 ### For Llama 3.2 1B and 3B BF16 models
@@ -64,7 +64,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
 * Export Llama model and generate .pte file as below:
 
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
 ```
 
 For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-).

examples/models/llama/README.md

Lines changed: 3 additions & 0 deletions
@@ -168,6 +168,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth
 LLAMA_PARAMS=path/to/params.json
 
 python -m examples.models.llama.export_llama \
+  --model "llama3_2" \
   --checkpoint "${LLAMA_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   -kv \
@@ -189,6 +190,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth
 LLAMA_PARAMS=path/to/spinquant/params.json
 
 python -m examples.models.llama.export_llama \
+  --model "llama3_2" \
   --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   --use_sdpa_with_kv_cache \
@@ -214,6 +216,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth
 LLAMA_PARAMS=path/to/qlora/params.json
 
 python -m examples.models.llama.export_llama \
+  --model "llama3_2" \
   --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   -qat \

examples/models/llama/llama_transformer.py

Lines changed: 4 additions & 1 deletion
@@ -113,6 +113,7 @@ class ModelArgs:
     )
     rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC.
     use_scaled_rope: bool = False # Use scaled RoPE, introduced in llama3.1.
+    rope_scale_factor: int = 8
     # Additional Model Metadata needed at runtime
     bos_idx: int = 1
     eos_idx: int = 3
@@ -155,7 +156,9 @@ def __init__(self, params: ModelArgs):
             self.precompute_freqs_cis = hf_precompute_freqs_cis
         else:
             self.precompute_freqs_cis = partial(
-                precompute_freqs_cis, use_scaled=self.params.use_scaled_rope
+                precompute_freqs_cis,
+                use_scaled=self.params.use_scaled_rope,
+                scale_factor=self.params.rope_scale_factor,
             )
         freqs_cos, freqs_sin = self.precompute_freqs_cis(
             self.params.head_dim,
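
The new rope_scale_factor field is what precompute_freqs_cis now receives as scale_factor. A rough sketch of the Llama 3.1-style frequency scaling such a parameter typically drives; the helper name and constants below follow the public Llama 3.1 reference recipe and are illustrative, not a copy of this repo's implementation.

```python
import math

import torch


def apply_rope_scaling(freqs: torch.Tensor, scale_factor: int = 8) -> torch.Tensor:
    low_freq_factor = 1
    high_freq_factor = 4
    old_context_len = 8192  # original pretraining context length

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    new_freqs = []
    for freq in freqs.tolist():
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            new_freqs.append(freq)  # high-frequency bands are left untouched
        elif wavelen > low_freq_wavelen:
            new_freqs.append(freq / scale_factor)  # low-frequency bands are scaled down
        else:
            # smooth interpolation between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
    return torch.tensor(new_freqs, dtype=freqs.dtype)


# Example: base RoPE frequencies for head_dim=64, rope_freq_base=10000.0,
# scaled with the larger factor used by the newer models.
dim = 64
freqs = 1.0 / (10000.0 ** (torch.arange(0, dim, 2).float() / dim))
print(apply_rope_scaling(freqs, scale_factor=32)[:4])
```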

examples/models/llama/model.py

Lines changed: 9 additions & 0 deletions
@@ -145,6 +145,15 @@ def __init__(self, **kwargs):
             enable_dynamic_shape=self.enable_dynamic_shape,
             **params,
         )
+
+        if model_args.use_scaled_rope:
+            # Older models don't have use_scaled_rope configuration
+            assert self.args.model not in ["llama2", "stories110m"]
+
+            # Llama3_2 and newer models in ExecuTorch repo should set larger scale factor
+            if self.args.model not in ["llama3", "llama3_1"]:
+                model_args.rope_scale_factor = 32
+
         if kwargs.get("verbose", False):
             print("============= weights ================")
             print("{key} : {weights.numel()} : {weights.size()}")
