Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 46e3ab7

Browse files
authored
[AOTI] Add a --dynamic-shapes option to export (#1011)
* [AOTI] Change export to use static shapes. Summary: The inputs to model forward have static shapes, so this changes the export call to make sure more Inductor optimizations take effect downstream. This change by itself improves average tokens/sec from 29.60 to 33.43 on A100. Some following PRs will provide further perf gains. * Add a --dynamic-shapes option for export * Actually add --dynamic-shapes to the CLI * Access args.dynamic_shapes correctly
1 parent dea8d60 commit 46e3ab7

File tree

4 files changed

+32
-11
lines changed

4 files changed

+32
-11
lines changed

.ci/scripts/validate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ function eval_model_sanity_check() {
285285
echo "******** INT4 group-wise quantized (AOTI) *******"
286286
echo "*************************************************"
287287
if [ "$DTYPE" != "float16" ]; then
288-
python3 -W ignore export.py --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
288+
python3 -W ignore export.py --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --dynamic-shapes --device "$TARGET_DEVICE" || exit 1
289289
python3 -W ignore eval.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/output_eval_aoti" || exit 1
290290
cat "$MODEL_DIR/output_eval_aoti"
291291
fi;

build/builder.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class BuilderArgs:
4444
use_distributed: bool = False
4545
is_chat_model: bool = False
4646
prefill_possible: bool = False
47+
dynamic_shapes: bool = False
4748

4849
def __post_init__(self):
4950
if self.device is None:
@@ -157,6 +158,7 @@ def from_args(cls, args): # -> BuilderArgs:
157158
setup_caches=(output_dso_path or output_pte_path),
158159
use_distributed=args.distributed,
159160
is_chat_model=is_chat_model,
161+
dynamic_shapes=getattr(args, "dynamic_shapes", False),
160162
)
161163

162164
@classmethod

cli.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ def _add_export_output_path_args(parser) -> None:
185185
default=None,
186186
help="Output to the specified AOT Inductor .dso model file",
187187
)
188+
parser.add_argument(
189+
"--dynamic-shapes",
190+
action="store_true",
191+
help="Call torch.export with dynamic shapes",
192+
)
188193

189194

190195
# Add CLI Args representing user provided exported model files

export.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@
3737

3838

3939
def export_for_server(
40-
model: nn.Module, device: Optional[str] = "cpu", output_path: str = "model.dso"
40+
model: nn.Module,
41+
device: Optional[str] = "cpu",
42+
output_path: str = "model.dso",
43+
dynamic_shapes: bool = False,
4144
) -> str:
4245
"""
4346
Export the model using AOT Compile to get a .dso for server use cases.
@@ -49,16 +52,22 @@ def export_for_server(
4952
Returns:
5053
The path to the exported model.
5154
"""
52-
input = (
53-
torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device),
54-
torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
55-
)
55+
if dynamic_shapes:
56+
input = (
57+
torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device),
58+
torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
59+
)
5660

57-
seq = Dim("seq", min=1, max=model.config.max_seq_length)
58-
# Specify that the first dimension of each input is that batch size
59-
dynamic_shapes = {"idx": {1: seq}, "input_pos": {0: seq}}
61+
seq = Dim("seq", min=1, max=model.config.max_seq_length)
62+
# Specify that the first dimension of each input is that batch size
63+
dynamic_shapes = {"idx": {1: seq}, "input_pos": {0: seq}}
64+
else:
65+
input = (
66+
torch.tensor([[1]], dtype=torch.int, device=device),
67+
torch.tensor([0], dtype=torch.int, device=device),
68+
)
69+
dynamic_shapes = None
6070

61-
model.to(device)
6271
so = torch._export.aot_compile(
6372
model,
6473
args=input,
@@ -143,7 +152,12 @@ def main(args):
143152
if output_dso_path:
144153
output_dso_path = str(os.path.abspath(output_dso_path))
145154
print(f"Exporting model using AOT Inductor to {output_dso_path}")
146-
export_for_server(model_to_dso, builder_args.device, output_dso_path)
155+
export_for_server(
156+
model_to_dso,
157+
builder_args.device,
158+
output_dso_path,
159+
builder_args.dynamic_shapes,
160+
)
147161

148162

149163
if __name__ == "__main__":

0 commit comments

Comments (0)