
Commit 4810bc2

Update base for Update on "[ET-VK] Replace Uniform buffers with push constants for copy op"
This diff replaces uniform buffers with push constants for the copy op in the Vulkan backend of ExecuTorch. The changes update the GLSL code to use push constants instead of uniform buffers and update the C++ code to pass the sizes to the shader as push constants.

Differential Revision: [D66890851](https://our.internmc.facebook.com/intern/diff/D66890851/)

[ghstack-poisoned]
2 parents 3511b07 + de74961
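As context for the change, the sketch below illustrates the general Vulkan mechanism the commit moves to: small, frequently changing data such as tensor sizes is recorded straight into the command buffer with vkCmdPushConstants instead of being staged in a uniform buffer and bound through a descriptor set; on the GLSL side the usual counterpart is a `layout(push_constant) uniform` block replacing the UBO declaration. This is an illustrative sketch only, not ET-VK code; `SizesPC`, `record_copy_dispatch`, and the command buffer / pipeline layout handles are placeholders assumed for the example.

```cpp
// Minimal sketch (raw Vulkan, not ET-VK): pass tensor sizes to a compute
// shader as push constants rather than through a uniform buffer.
#include <cstdint>
#include <vulkan/vulkan.h>

struct SizesPC {
  int32_t in_sizes[4];   // source tensor sizes (placeholder layout)
  int32_t out_sizes[4];  // destination tensor sizes (placeholder layout)
};

void record_copy_dispatch(
    VkCommandBuffer cmd,
    VkPipelineLayout pipeline_layout,
    const SizesPC& sizes) {
  // With a uniform buffer, this data would be written into a mapped buffer
  // and bound via a descriptor set; push constants skip both steps.
  vkCmdPushConstants(
      cmd,
      pipeline_layout,
      VK_SHADER_STAGE_COMPUTE_BIT,
      /*offset=*/0,
      sizeof(SizesPC),
      &sizes);
  vkCmdDispatch(cmd, 1, 1, 1);  // workgroup counts are placeholders
}
```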

File tree: 13 files changed (+75, -37 lines)

backends/arm/test/ops/test_layer_norm.py

Lines changed: 4 additions & 4 deletions
@@ -157,9 +157,9 @@ def test_layer_norm_tosa_BI(

     # Numerical issues on FVP likely due to mul op, MLETORCH-521
     # Skip tests that require transposes.
-    @parameterized.expand(test_data_suite[:-2])
+    @parameterized.expand(test_data_suite)
     @unittest.expectedFailure
-    def test_layer_norm_u55_BI(
+    def test_layer_norm_u55_BI_xfails(
         self,
         test_name: str,
         test_data: torch.Tensor,
@@ -171,7 +171,8 @@ def test_layer_norm_u55_BI(

     # Numerical issues on FVP likely due to mul op, MLETORCH-521
     @parameterized.expand(test_data_suite[:-2])
-    def test_layer_norm_u85_BI_fvp(
+    @unittest.expectedFailure
+    def test_layer_norm_u85_BI_xfails(
         self,
         test_name: str,
         test_data: torch.Tensor,
@@ -182,7 +183,6 @@ def test_layer_norm_u85_BI_fvp(
         )

     @parameterized.expand(test_data_suite[-2:])
-    @unittest.skip # Flaky
     def test_layer_norm_u85_BI(
         self,
         test_name: str,

backends/vulkan/docs/android_demo.md

Lines changed: 1 addition & 0 deletions
@@ -59,6 +59,7 @@ partially lower the Llama model to Vulkan.
 # The files will usually be downloaded to ~/.llama
 python -m examples.models.llama.export_llama \
   --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \
+  --model "llama3_2" \
   -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \
   -p ~/.llama/checkpoints/Llama3.2-1B/params.json \
   --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

backends/vulkan/runtime/graph/ops/DispatchNode.cpp

Lines changed: 6 additions & 4 deletions
@@ -75,19 +75,21 @@ void DispatchNode::encode(ComputeGraph* graph) {

   bind_params_to_descriptor_set(params_, descriptor_set, idx);

-  uint8_t push_constants_data[128];
+  std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
   uint32_t push_constants_offset = 0;

   for (const auto& push_constant : push_constants_) {
-    push_constants_offset +=
-        push_constant.write(push_constants_data, push_constants_offset, 128);
+    push_constants_offset += push_constant.write(
+        push_constants_data.data(),
+        push_constants_offset,
+        kMaxPushConstantSize);
   }
   context->register_shader_dispatch(
       descriptor_set,
       pipeline_barrier,
       shader_,
       global_workgroup_size_,
-      push_constants_data,
+      push_constants_data.data(),
       push_constants_offset);

   context->report_shader_dispatch_end();
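The loop above packs each push constant entry into a fixed staging array and accumulates the number of bytes written, which is then handed to the dispatch together with the data pointer. A minimal sketch of that packing pattern, with a hypothetical `PushConstantEntry` type standing in for the actual ET-VK `PushConstantDataInfo`, could look like this:

```cpp
// Hypothetical sketch of the packing pattern; the entry type and its write()
// helper are illustrative, not the real ET-VK API.
#include <array>
#include <cstdint>
#include <cstring>
#include <vector>

constexpr uint32_t kMaxPushConstantSize = 128;

struct PushConstantEntry {
  const void* src;
  uint32_t nbytes;

  // Appends this entry's bytes at `offset` and returns how many were written;
  // a real implementation would likely assert instead of silently skipping.
  uint32_t write(uint8_t* dst, uint32_t offset, uint32_t max_size) const {
    if (offset + nbytes > max_size) {
      return 0;
    }
    std::memcpy(dst + offset, src, nbytes);
    return nbytes;
  }
};

uint32_t pack(
    const std::vector<PushConstantEntry>& entries,
    std::array<uint8_t, kMaxPushConstantSize>& out) {
  uint32_t offset = 0;
  for (const auto& entry : entries) {
    offset += entry.write(out.data(), offset, kMaxPushConstantSize);
  }
  return offset;  // total size passed to the shader dispatch with out.data()
}
```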

backends/vulkan/runtime/graph/ops/DispatchNode.h

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ namespace vkcompute {

 class ComputeGraph;

+constexpr uint32_t kMaxPushConstantSize = 128;
 /*
  * Represents a push constant data entry
  * Which is either shared pointer to a tensor's uniform data with an attribute
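The 128-byte value matches the minimum `VkPhysicalDeviceLimits::maxPushConstantsSize` that the Vulkan specification guarantees, so a budget of `kMaxPushConstantSize` bytes is portable across devices. As a rough sketch of how such a budget typically shows up in generic Vulkan pipeline-layout setup (again, not the ET-VK implementation):

```cpp
// Generic Vulkan sketch: declare the push constant range used by a compute
// pipeline layout, capped at the 128-byte budget.
#include <vulkan/vulkan.h>

constexpr uint32_t kMaxPushConstantSize = 128;

VkPushConstantRange make_push_constant_range(uint32_t used_bytes) {
  VkPushConstantRange range{};
  range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
  range.offset = 0;
  // The spec guarantees at least 128 bytes of push constant storage, so
  // clamping to kMaxPushConstantSize keeps the range valid on any device.
  range.size =
      used_bytes <= kMaxPushConstantSize ? used_bytes : kMaxPushConstantSize;
  return range;
}
```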

devtools/bundled_program/core.py

Lines changed: 20 additions & 12 deletions
@@ -9,7 +9,6 @@
 from typing import Dict, List, Optional, Sequence, Type, Union

 import executorch.devtools.bundled_program.schema as bp_schema
-from pyre_extensions import none_throws

 import executorch.exir.schema as core_schema

@@ -44,10 +43,12 @@ class BundledProgram:

     def __init__(
         self,
-        executorch_program: Optional[Union[
-            ExecutorchProgram,
-            ExecutorchProgramManager,
-        ]],
+        executorch_program: Optional[
+            Union[
+                ExecutorchProgram,
+                ExecutorchProgramManager,
+            ]
+        ],
         method_test_suites: Sequence[MethodTestSuite],
         pte_file_path: Optional[str] = None,
     ):
@@ -59,18 +60,24 @@ def __init__(
             pte_file_path: The path to pte file to deserialize program if executorch_program is not provided.
         """
         if not executorch_program and not pte_file_path:
-            raise RuntimeError("Either executorch_program or pte_file_path must be provided")
+            raise RuntimeError(
+                "Either executorch_program or pte_file_path must be provided"
+            )

         if executorch_program and pte_file_path:
-            raise RuntimeError("Only one of executorch_program or pte_file_path can be used")
+            raise RuntimeError(
+                "Only one of executorch_program or pte_file_path can be used"
+            )

         method_test_suites = sorted(method_test_suites, key=lambda x: x.method_name)
         if executorch_program:
             self._assert_valid_bundle(executorch_program, method_test_suites)
-            self.executorch_program: Optional[Union[
-                ExecutorchProgram,
-                ExecutorchProgramManager,
-            ]] = executorch_program
+            self.executorch_program: Optional[
+                Union[
+                    ExecutorchProgram,
+                    ExecutorchProgramManager,
+                ]
+            ] = executorch_program
         self._pte_file_path: Optional[str] = pte_file_path

         self.method_test_suites = method_test_suites
@@ -88,7 +95,8 @@ def serialize_to_schema(self) -> bp_schema.BundledProgram:
         if self.executorch_program:
             program = self._extract_program(self.executorch_program)
         else:
-            with open(none_throws(self._pte_file_path), "rb") as f:
+            assert self._pte_file_path is not None
+            with open(self._pte_file_path, "rb") as f:
                 p_bytes = f.read()
                 program = _deserialize_pte_binary(p_bytes)

devtools/bundled_program/test/test_bundle_data.py

Lines changed: 11 additions & 4 deletions
@@ -6,9 +6,10 @@

 # pyre-strict

+import tempfile
 import unittest
 from typing import List
-import tempfile
+
 import executorch.devtools.bundled_program.schema as bp_schema

 import torch
@@ -73,7 +74,7 @@ def test_bundled_program(self) -> None:
             bundled_program.serialize_to_schema().program,
             bytes(_serialize_pte_binary(executorch_program.executorch_program)),
         )
-
+
     def test_bundled_program_from_pte(self) -> None:
         executorch_program, method_test_suites = get_common_executorch_program()

@@ -82,11 +83,17 @@ def test_bundled_program_from_pte(self) -> None:
         with open(executorch_model_path, "wb") as f:
             f.write(executorch_program.buffer)

-        bundled_program = BundledProgram(executorch_program=None, method_test_suites=method_test_suites, pte_file_path=executorch_model_path)
+        bundled_program = BundledProgram(
+            executorch_program=None,
+            method_test_suites=method_test_suites,
+            pte_file_path=executorch_model_path,
+        )

         method_test_suites = sorted(method_test_suites, key=lambda t: t.method_name)

-        for plan_id in range(len(executorch_program.executorch_program.execution_plan)):
+        for plan_id in range(
+            len(executorch_program.executorch_program.execution_plan)
+        ):
             bundled_plan_test = (
                 bundled_program.serialize_to_schema().method_test_suites[plan_id]
             )

examples/arm/setup.sh

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ tosa_reference_model_rev="c5570b79e90c3a36ab8c4ddb8ee3fbc2cd3f7c38"

 # vela
 vela_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u-vela"
-vela_rev="a08fc18780827b5fefc814dd0162ee6317ce0ae7"
+vela_rev="5427dc7e9c1a4c7d554163290faeea75f168772d"

 ########
 ### Mandatory user args

examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md

Lines changed: 3 additions & 3 deletions
@@ -56,14 +56,14 @@ In this demo app, we support text-only inference with up-to-date Llama models an
 Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
 ```

 ### For Llama 3.2 1B and 3B QAT+LoRA models
 Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
 ```

 ### For Llama 3.2 1B and 3B BF16 models
@@ -72,7 +72,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
 * Export Llama model and generate .pte file as below:

 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
 ```

 For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-).

examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md

Lines changed: 3 additions & 3 deletions
@@ -48,14 +48,14 @@ sh examples/models/llama/install_requirements.sh
 Meta has released prequantized INT4 SpinQuant Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --use_spin_quant native --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_spinquant.pte"
 ```

 ### For Llama 3.2 1B and 3B QAT+LoRA models
 Meta has released prequantized INT4 QAT+LoRA Llama 3.2 models that ExecuTorch supports on the XNNPACK backend.
 * Export Llama model and generate .pte file as below:
 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -qat -lora 16 -kv --use_sdpa_with_kv_cache -X -d fp32 --xnnpack-extended-ops --preq_mode 8da4w_output_8da8w --preq_group_size 32 --max_seq_length 2048 --preq_embedding_quantize 8,0 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name "llama3_2_qat_lora.pte"
 ```

 ### For Llama 3.2 1B and 3B BF16 models
@@ -64,7 +64,7 @@ We have supported BF16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B
 * Export Llama model and generate .pte file as below:

 ```
-python -m examples.models.llama.export_llama --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
+python -m examples.models.llama.export_llama --model "llama3_2" --checkpoint <path-to-your-checkpoint.pth> --params <path-to-your-params.json> -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2_bf16.pte"
 ```

 For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-).

examples/models/llama/README.md

Lines changed: 3 additions & 0 deletions
@@ -168,6 +168,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth
 LLAMA_PARAMS=path/to/params.json

 python -m examples.models.llama.export_llama \
+  --model "llama3_2" \
   --checkpoint "${LLAMA_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   -kv \
@@ -189,6 +190,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth
 LLAMA_PARAMS=path/to/spinquant/params.json

 python -m examples.models.llama.export_llama \
+  --model "llama3_2" \
   --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   --use_sdpa_with_kv_cache \
@@ -214,6 +216,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth
 LLAMA_PARAMS=path/to/qlora/params.json

 python -m examples.models.llama.export_llama \
+  --model "llama3_2" \
   --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   -qat \
