Update on "add module level benchmark for gemma3 model"

Gasoonjia · Gasoonjia · commit 96704b58c8c7 · 2025-10-17T16:12:41.000-07:00
This diff adds a module-level benchmark for the GEMMA3 model. It also introduces mutlmodal_benchmark.cpp, which replaces the original voxtral_runner.cpp, for benchmarking both the gemma3 and voxtral models at the module level. Differential Revision: [D84958564](https://our.internmc.facebook.com/intern/diff/D84958564/) [ghstack-poisoned]
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
@@ -140,6 +140,9 @@ def preprocess(
                 user_input_placeholders.append(node.meta["val"])
 
         options: dict[str, typing.Any] = {
+            # Disable this to support sdpa decomposition
+            # TODO(gasoonjia): remove it after pin bump to latest pytorch
+            "loop_ordering_after_fusion": False,
             # Better model precision
             "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object