Skip to content

Commit 03b1fe0

Browse files
committed
memory efficient 1/2
1 parent 4e1e4cb commit 03b1fe0

File tree

9 files changed

+1179
-234
lines changed

9 files changed

+1179
-234
lines changed

.ci/scripts/test_model_e2e.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ case "$HF_MODEL" in
126126
esac
127127

128128
echo "::group::Setup ExecuTorch Requirements"
129-
./install_requirements.sh
129+
# ./install_requirements.sh
130130
pip list
131131
echo "::endgroup::"
132132

backends/aoti/aoti_backend.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,8 @@ def preprocess(
214214
with open(so_path, "rb") as f:
215215
so_data = f.read()
216216

217+
print("so_path: ", so_path)
218+
217219
# Read weights blob
218220
with open(blob_path, "rb") as f:
219221
blob_data = f.read()
@@ -229,9 +231,9 @@ def preprocess(
229231
method_name + "_weights_blob", blob_data, 1, weights_blob_data_type
230232
)
231233

232-
# Clean up the generated files
233-
os.remove(so_path)
234-
os.remove(blob_path)
234+
# # Clean up the generated files
235+
# os.remove(so_path)
236+
# os.remove(blob_path)
235237

236238
return PreprocessResult(
237239
processed_bytes=b"",

backends/cuda/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ install(
9898
set(_aoti_cuda_shim_sources
9999
runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp
100100
runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
101+
runtime/shims/sdpa.cu
101102
${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp
102103
)
103104

@@ -130,12 +131,12 @@ target_link_options(
130131
aoti_cuda_shims PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
131132
)
132133

133-
# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and
134+
# Link against CUDA::cudart, CUDA::cublas, common AOTI library, cuda_tensor_maker, and
134135
# platform utilities
135136
target_link_libraries(
136137
aoti_cuda_shims
137-
PRIVATE cuda_platform
138-
PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
138+
PRIVATE cuda_platform executorch_core
139+
PUBLIC extension_tensor cuda_tensor_maker CUDA::cudart CUDA::cublas ${CMAKE_DL_LIBS}
139140
)
140141

141142
if(NOT MSVC)

backends/cuda/cuda_backend.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -137,20 +137,20 @@ def get_aoti_compile_options(
137137

138138
return options
139139

140-
@classmethod
141-
def get_extra_aoti_compile_context_manager(cls):
142-
"""
143-
Return SDPA MATH backend context manager for CUDA compilation.
144-
145-
This context manager plays as a fallback solution for any remaining PyTorch SDPA
146-
operations to use the MATH backend (decomposed SDPA) during AOTInductor compilation.
147-
148-
Note:
149-
- If SDPA ops are replaced with Triton kernels by ReplaceEdgeOpWithTritonOpPass,
150-
this context manager will have no effect on those ops (they are no longer
151-
PyTorch SDPA ops).
152-
- If SDPA ops are NOT replaced (e.g., when triton_kernel_mode="OFF"), this
153-
context manager will force them to use the MATH backend, causing them to
154-
be automatically decomposed during compilation.
155-
"""
156-
return torch.nn.attention.sdpa_kernel([SDPBackend.MATH])
140+
# @classmethod
141+
# def get_extra_aoti_compile_context_manager(cls):
142+
# """
143+
# Return SDPA MATH backend context manager for CUDA compilation.
144+
145+
# This context manager plays as a fallback solution for any remaining PyTorch SDPA
146+
# operations to use the MATH backend (decomposed SDPA) during AOTInductor compilation.
147+
148+
# Note:
149+
# - If SDPA ops are replaced with Triton kernels by ReplaceEdgeOpWithTritonOpPass,
150+
# this context manager will have no effect on those ops (they are no longer
151+
# PyTorch SDPA ops).
152+
# - If SDPA ops are NOT replaced (e.g., when triton_kernel_mode="OFF"), this
153+
# context manager will force them to use the MATH backend, causing them to
154+
# be automatically decomposed during compilation.
155+
# """
156+
# return torch.nn.attention.sdpa_kernel([SDPBackend.MATH])

0 commit comments

Comments (0)