Skip to content

Commit 5277276

Browse files
yushangdi
authored and pytorchmergebot committed
Change AOTI_RUNTIME_DEVICE_CHECK to be device-specific (pytorch#157818)
Summary: Change AOTI_RUNTIME_DEVICE_CHECK to the following depending on device: AOTI_RUNTIME_CUDA_CHECK, AOTI_RUNTIME_XPU_CHECK, AOTI_RUNTIME_CPU_CHECK. Currently in the codebase, only `AOTI_RUNTIME_CUDA_CHECK` is used. This shouldn't change anything as of now, but we do this to prepare for simultaneously loading multiple backends (e.g., CPU and CUDA) in AOTI standalone. We don't want people writing `AOTI_RUNTIME_DEVICE_CHECK` for both CPU and CUDA checks. This could cause compilation problems when we statically link both CPU and CUDA models. Test Plan: CI Rollback Plan: Reviewed By: muchulee8 Differential Revision: D77742977 Pull Request resolved: pytorch#157818 Approved by: https://github.com/jingsh
1 parent c547786 commit 5277276

File tree

3 files changed

+14
-14
lines changed

3 files changed

+14
-14
lines changed

torch/csrc/inductor/aoti_runtime/device_utils.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#include <cuda.h>
1515
#include <cuda_runtime_api.h>
1616

17-
#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \
17+
#define AOTI_RUNTIME_CUDA_CHECK(EXPR) \
1818
do { \
1919
const cudaError_t code = EXPR; \
2020
const char* msg = cudaGetErrorString(code); \
@@ -34,7 +34,7 @@ using DeviceStreamType = cudaStream_t;
3434
#include <level_zero/ze_api.h>
3535
#include <sycl/sycl.hpp>
3636
#include <sstream>
37-
#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \
37+
#define AOTI_RUNTIME_XPU_CHECK(EXPR) \
3838
do { \
3939
const ze_result_t status = EXPR; \
4040
if (status != ZE_RESULT_SUCCESS) { \
@@ -52,7 +52,7 @@ using DeviceStreamType = sycl::queue*;
5252

5353
#else
5454

55-
#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \
55+
#define AOTI_RUNTIME_CPU_CHECK(EXPR) \
5656
bool ok = EXPR; \
5757
if (!ok) { \
5858
throw std::runtime_error("CPU runtime error"); \

torch/csrc/inductor/aoti_runtime/model_base.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ using RAIIDataPtr = std::unique_ptr<void, std::function<void(void*)>>;
6363
// NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
6464
RAIIDataPtr RAII_gpuMalloc(size_t num_bytes) {
6565
void* data_ptr = nullptr;
66-
AOTI_RUNTIME_DEVICE_CHECK(cudaMalloc((void**)&data_ptr, num_bytes));
67-
auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(cudaFree(ptr)); };
66+
AOTI_RUNTIME_CUDA_CHECK(cudaMalloc((void**)&data_ptr, num_bytes));
67+
auto deleter = [](void* ptr) { AOTI_RUNTIME_CUDA_CHECK(cudaFree(ptr)); };
6868
return RAIIDataPtr(data_ptr, deleter);
6969
}
7070

@@ -165,10 +165,10 @@ class AOTInductorModelBase {
165165

166166
#ifdef USE_CUDA
167167
if (device_idx_ == -1) {
168-
AOTI_RUNTIME_DEVICE_CHECK(cudaGetDevice(&device_idx_));
168+
AOTI_RUNTIME_CUDA_CHECK(cudaGetDevice(&device_idx_));
169169
} else {
170170
// If device_idx_ is passed in, we need to set the current device to it
171-
AOTI_RUNTIME_DEVICE_CHECK(cudaSetDevice(device_idx_));
171+
AOTI_RUNTIME_CUDA_CHECK(cudaSetDevice(device_idx_));
172172
}
173173
#endif // USE_CUDA
174174
#ifdef USE_XPU
@@ -222,7 +222,7 @@ class AOTInductorModelBase {
222222
#ifdef USE_CUDA
223223
if (!run_finished_) {
224224
cudaEvent_t run_finished = nullptr;
225-
AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished));
225+
AOTI_RUNTIME_CUDA_CHECK(cudaEventCreate(&run_finished));
226226
run_finished_.emplace(run_finished);
227227
}
228228
#elif defined(USE_XPU)
@@ -239,7 +239,7 @@ class AOTInductorModelBase {
239239
model->run_impl(input_handles, output_handles, stream, proxy_executor);
240240

241241
#ifdef USE_CUDA
242-
AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream));
242+
AOTI_RUNTIME_CUDA_CHECK(cudaEventRecord(*run_finished_, stream));
243243
#elif defined(USE_XPU)
244244
run_finished_ = std::make_optional<sycl::event*>(new sycl::event(
245245
static_cast<sycl::queue*>(stream)->ext_oneapi_submit_barrier()));
@@ -273,7 +273,7 @@ class AOTInductorModelBase {
273273
#ifdef USE_CUDA
274274
if (!run_finished_) {
275275
cudaEvent_t run_finished = nullptr;
276-
AOTI_RUNTIME_DEVICE_CHECK(cudaEventCreate(&run_finished));
276+
AOTI_RUNTIME_CUDA_CHECK(cudaEventCreate(&run_finished));
277277
run_finished_.emplace(run_finished);
278278
}
279279
#elif defined(USE_XPU)
@@ -291,7 +291,7 @@ class AOTInductorModelBase {
291291
model->const_run_impl(stream, proxy_executor, initialization);
292292

293293
#ifdef USE_CUDA
294-
AOTI_RUNTIME_DEVICE_CHECK(cudaEventRecord(*run_finished_, stream));
294+
AOTI_RUNTIME_CUDA_CHECK(cudaEventRecord(*run_finished_, stream));
295295
#elif defined(USE_XPU)
296296
// sycl::queue* queue_ptr = nullptr;
297297
// aoti_torch_get_current_sycl_queue((void**)&queue_ptr);
@@ -408,7 +408,7 @@ class AOTInductorModelBase {
408408
->memcpy(internal_ptr, _get_constants_start() + bytes_read, data_size)
409409
.wait();
410410
#elif USE_CUDA
411-
AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy(
411+
AOTI_RUNTIME_CUDA_CHECK(cudaMemcpy(
412412
internal_ptr,
413413
_get_constants_start() + bytes_read,
414414
data_size,
@@ -613,7 +613,7 @@ class AOTInductorModelBase {
613613
throw std::runtime_error{"Model event was not initialized"};
614614
}
615615

616-
AOTI_RUNTIME_DEVICE_CHECK(cudaEventSynchronize(*run_finished_));
616+
AOTI_RUNTIME_CUDA_CHECK(cudaEventSynchronize(*run_finished_));
617617
#endif // USE_CUDA
618618
#ifdef USE_XPU
619619
if (!run_finished_) {

torch/csrc/inductor/aoti_runtime/model_container.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ class AOTInductorModelContainer {
476476
->memcpy(internal_constants_ptr, user_constant_ptr, constant_size)
477477
.wait();
478478
#elif USE_CUDA
479-
AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy(
479+
AOTI_RUNTIME_CUDA_CHECK(cudaMemcpy(
480480
internal_constants_ptr,
481481
user_constant_ptr,
482482
constant_size,

0 commit comments

Comments
 (0)