This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 67f678b

[AOTI] Set sdpa_kernel context when exporting (#1013)
Summary: This improves average tokens/sec from 33.43 to 72.63 on A100 for AOTI. ``` python3 torchchat.py export llama3 --quantize '{"precision": {"dtype":"bfloat16"}, "executor":{"accelerator":"cuda"}}' --output-dso-path /tmp/model16.so && python3 torchchat.py generate llama3 --dso-path /tmp/model16.so --prompt "Once upon a time," --max-new-tokens 256 --device cuda --num-samples 3 ```
1 parent 46e3ab7 commit 67f678b

File tree

1 file changed: +7 −6 lines changed


export.py

Lines changed: 7 additions & 6 deletions
```diff
@@ -68,12 +68,13 @@ def export_for_server(
     )
     dynamic_shapes = None

-    so = torch._export.aot_compile(
-        model,
-        args=input,
-        options={"aot_inductor.output_path": output_path},
-        dynamic_shapes=dynamic_shapes,
-    )
+    with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
+        so = torch._export.aot_compile(
+            model,
+            args=input,
+            options={"aot_inductor.output_path": output_path},
+            dynamic_shapes=dynamic_shapes,
+        )
     print(f"The generated DSO model can be found at: {so}")
     return so
```

0 commit comments