 
 
 """
-Export for Server   
+Export for Server
 """
 
 
@@ -78,7 +78,7 @@ def export_for_server(
7878""" 
7979Export for ExecuTorch 
8080
81- TODO (https://github.com/pytorch/torchchat/issues/1058): Replace   
81+ TODO (https://github.com/pytorch/torchchat/issues/1058): Replace 
8282replace_attention_with_custom_sdpa_attention with ET's implementation 
8383""" 
8484
@@ -94,6 +94,9 @@ def export_for_server(
     from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
         XnnpackDynamicallyQuantizedPartitioner,
     )
+    from executorch.backends.xnnpack.passes.convert_to_linear import (
+        ConvertToLinearPass,
+    )
     from executorch.exir import EdgeProgramManager, to_edge
 
     from executorch.exir.capture._config import (
@@ -194,7 +197,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None):
             return self.wo(output)
 
     def replace_attention_with_custom_sdpa_attention(module: nn.Module):
-        from executorch.examples.models.llama2.custom_ops import (  # noqa
+        from executorch.extension.llm.custom_ops import (  # noqa
             sdpa_with_kv_cache,
         )
 
@@ -274,22 +277,20 @@ def export_for_et(model, device, output_path) -> str:
             _skip_type_promotion=bool(target_precision == torch.float16),
         )
 
-        if target_precision == torch.float16 or target_precision == torch.bfloat16:
-            if state_dict_dtype != torch.float16:
-                print("model.to torch.float16")
-                model = model.to(dtype=torch.float16)
-                state_dict_dtype = torch.float16
-        elif target_precision == torch.float32:
-            if state_dict_dtype != torch.float32:
-                print("model.to torch.float32")
-                model = model.to(dtype=torch.float32)
-        elif target_precision == torch.bfloat16:
-            print("model.to torch.bfloat16")
-            model = model.to(dtype=torch.bfloat16)
-        else:
+        if target_precision not in (torch.float16, torch.float32, torch.bfloat16):
             raise ValueError(f"Unsupported dtype for ET export: {target_precision}")
 
-        replace_attention_with_custom_sdpa_attention(model)
+        if state_dict_dtype != target_precision:
+            print(f"model.to {target_precision}")
+            model = model.to(dtype=target_precision)
+            state_dict_dtype = target_precision
+
+        # Custom SDPA does not work with bfloat16 on CPU currently. (The op doesn't
+        # support anything but float32, and our attempt to use it anyway by converting
+        # to and from float causes other errors.)
+        if target_precision != torch.bfloat16:
+            replace_attention_with_custom_sdpa_attention(model)
+
         with torch.nn.attention.sdpa_kernel(
             [torch.nn.attention.SDPBackend.MATH]
         ), torch.no_grad():
@@ -304,9 +305,9 @@ def export_for_et(model, device, output_path) -> str:
         edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner())
         export_program = edge_manager.to_executorch(
             ExecutorchBackendConfig(
-                extract_constant_segment=True,
                 extract_delegate_segments=True,
                 passes=[
+                    ConvertToLinearPass(),
                     QuantFusionPass(),
                 ],
                 sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
@@ -363,13 +364,17 @@ def main(args):
         except:
             tokenizer = None
 
-        if (
-            output_dso_path is not None
-            and builder_args.max_seq_length is None
-            and not builder_args.dynamic_shapes
-        ):
-            print("Setting max_seq_length to 300 for DSO export.")
-            builder_args.max_seq_length = 300
+        if builder_args.max_seq_length is None:
+            if (
+                output_dso_path is not None
+                and not builder_args.dynamic_shapes
+            ):
+                print("Setting max_seq_length to 300 for DSO export.")
+                builder_args.max_seq_length = 300
+            elif output_pte_path is not None:
+                # The value of 128 was chosen to match the ExecuTorch Llama example setup.
+                print("Setting max_seq_length to 128 for ExecuTorch export.")
+                builder_args.max_seq_length = 128
 
         model = _initialize_model(
             builder_args,