
Commit b157fc1

Merge branch 'pytorch:main' into main

2 parents 59fdd72 + 914d5ff

File tree: 43 files changed (+2859, -521 lines)

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-732b11313b2006b4d8649500eaf5567ec6ac1e49
+f8aa919593cc51301ade73a2ee5491582521ab80

.github/workflows/add-unanswered-to-project.yml

Lines changed: 19 additions & 10 deletions
@@ -42,16 +42,17 @@ jobs:
 "agunapal", "SamGondelman", "Ninja91", "ivayloen", "DrJessop", "rodrigos01meta", "akrieger", "cmt0", "yiming0416",
 "ethansfng", "ThomasJannaud", "nirvanagth", "marcinkwiatkowski", "3l1", "omerjerk", "nitish2112", "yipjustin",
 "ejnguyen", "andrewor14", "phaiting", "mgiordy", "LeeOHzzZ", "adicatana", "Polyomino", "ezrilow", "navsud",
-"michaelmaitland", "RahulC7", "seyeong-han", "thdusdl1219", "jaejunku", "felixweilbach", "apullin", "trviv", "YifanShenSZ",
-"RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat", "azad-meta", "junpi", "pytorchbot", "pytorchmergebot", "pytorchupdatebot",
-"facebook-github-bot", "app/dependabot", "Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218", "per", "Sebastian-Larsson",
-"SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", "gggekov",
-"fumchin", "wwwind", "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304", "emmakujala",
-"annietllnd", "MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL", "ArmRyan", "xingguo01",
-"tgonzalezorlandoarm", "chizkiyahu", "sarah-blades", "haowhsu-quic", "shewu-quic", "winskuo-quic", "chunit-quic", "DannyYuyang-quic",
-"chuntl", "thchenqti", "jethroqti", "chenweng-quic", "cymbalrush", "DenisVieriu97", "billmguo", "StrycekSimon", "jirioc", "robert-kalmar",
-"skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav", "neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio",
-"ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa", "anzr299", "Jiseong-oh", "alexdean08",
+"michaelmaitland", "RahulC7", "seyeong-han", "thdusdl1219", "jaejunku", "felixweilbach", "apullin", "trviv", "junluan01",
+"YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat", "azad-meta", "junpi", "pytorchbot", "pytorchmergebot",
+"pytorchupdatebot", "facebook-github-bot", "app/dependabot", "Erik-Lundell", "zingo", "AdrianLundell", "oscarandersson8218",
+"per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm",
+"perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind", "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed",
+"agrima1304", "emmakujala", "annietllnd", "MatthiasHertel80", "AlexTawseArm", "jmahbs", "morgolock", "Christoffer-JL",
+"ArmRyan", "xingguo01", "tgonzalezorlandoarm", "chizkiyahu", "sarah-blades", "haowhsu-quic", "shewu-quic", "winskuo-quic",
+"chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", "jethroqti", "chenweng-quic", "cymbalrush", "DenisVieriu97",
+"billmguo", "StrycekSimon", "jirioc", "robert-kalmar", "skywall", "MartinPavella", "roman-janik-nxp", "novak-vaclav",
+"neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", "ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa",
+"anzr299", "Jiseong-oh", "alexdean08",
 // explicitly include the dependabot bot login seen in PRs
 "dependabot[bot]"
 ]);
@@ -61,6 +62,9 @@ jobs:
 "meta", "facebook", "pytorch", "arm", "apple", "qualcomm", "nxp", "mediatek", "cadence", "intel", "samsung"
 ]);
 
+// Labels on PRs to exclude from being added to the project
+const excludedPrLabels = new Set(["fb-exported", "meta-exported"]);
+
 // Simple cache for user -> boolean (member of excluded org)
 const orgsCache = new Map();
 
@@ -93,6 +97,11 @@ jobs:
 return false;
 }
 
+function hasExcludedLabel(item) {
+  if (!item || !item.labels) return false;
+  return item.labels.some(l => l && l.name && excludedPrLabels.has(l.name.toLowerCase()));
+}
+
 async function addItem(contentId, type, number) {
 try {
 await github.graphql(`

.github/workflows/doc-build.yml

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,10 @@ on:
   schedule:
     - cron: '0 0 * * *'
 
+concurrency:
+  group: docs-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   build:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

backends/aoti/slim/c10/cuda/Exception.h

Lines changed: 49 additions & 8 deletions
@@ -8,26 +8,55 @@
 
 #pragma once
 
-#ifdef CUDA_AVAILABLE
-
 #include <cuda.h>
 #include <cuda_runtime.h>
 
 #include <executorch/backends/aoti/slim/c10/macros/Macros.h>
+#include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/assert.h>
 #include <executorch/runtime/platform/log.h>
 
 /// Checks a CUDA expression and aborts on error.
 /// @param EXPR The CUDA expression to check.
-#define ET_CUDA_CHECK(EXPR)                                                 \
-  do {                                                                      \
-    const cudaError_t __err = EXPR;                                         \
-    ET_CHECK_MSG(                                                           \
-        __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \
+#ifndef ET_CUDA_CHECK
+#define ET_CUDA_CHECK(EXPR)                                           \
+  do {                                                                \
+    const cudaError_t __err = EXPR;                                   \
+    if (__err == cudaSuccess) {                                       \
+      break;                                                          \
+    }                                                                 \
+    ET_LOG(                                                           \
+        Error,                                                        \
+        "%s:%d CUDA error: %s",                                       \
+        __FILE__,                                                     \
+        __LINE__,                                                     \
+        cudaGetErrorString(__err));                                   \
+    ET_CHECK_MSG(false, "CUDA error: %s", cudaGetErrorString(__err)); \
   } while (0)
+#endif
+
+/// Checks a CUDA expression and returns Error::Internal on failure.
+/// @param EXPR The CUDA expression to check.
+#ifndef ET_CUDA_CHECK_OR_RETURN_ERROR
+#define ET_CUDA_CHECK_OR_RETURN_ERROR(EXPR)                           \
+  do {                                                                \
+    const cudaError_t __err = EXPR;                                   \
+    if (__err == cudaSuccess) {                                       \
+      break;                                                          \
+    }                                                                 \
+    ET_LOG(                                                           \
+        Error,                                                        \
+        "%s:%d CUDA error: %s",                                       \
+        __FILE__,                                                     \
+        __LINE__,                                                     \
+        cudaGetErrorString(__err));                                   \
+    return ::executorch::runtime::Error::Internal;                    \
+  } while (0)
+#endif
 
 /// Checks a CUDA expression and logs a warning on error (non-fatal).
 /// @param EXPR The CUDA expression to check.
+#ifndef ET_CUDA_LOG_WARN
 #define ET_CUDA_LOG_WARN(EXPR)                                        \
   do {                                                                \
     const cudaError_t __err = EXPR;                                   \
@@ -36,5 +65,17 @@
       ET_LOG(Error, "CUDA warning: %s", cudaGetErrorString(__err));   \
     }                                                                 \
   } while (0)
+#endif
+
+/// Kernel launch check macro (with return) - checks cudaGetLastError after
+/// kernel launch.
+#ifndef ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR
+#define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError())
+#endif
 
-#endif // CUDA_AVAILABLE
+/// Kernel launch check macro (without return) - checks cudaGetLastError after
+/// kernel launch.
+#ifndef ET_CUDA_KERNEL_LAUNCH_CHECK
+#define ET_CUDA_KERNEL_LAUNCH_CHECK() ET_CUDA_CHECK(cudaGetLastError())
+#endif
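
For reference, a minimal sketch of how the new error-returning macro is meant to be used from a function that itself returns an Error; copy_to_device below is a hypothetical caller, not part of this commit:

// Hypothetical caller, for illustration only; not part of this commit.
#include <cuda_runtime.h>

#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>

using executorch::runtime::Error;

Error copy_to_device(void* dst, const void* src, size_t nbytes) {
  // On failure the macro logs "<file>:<line> CUDA error: <message>" and
  // returns Error::Internal from copy_to_device, instead of aborting the
  // process the way ET_CUDA_CHECK does.
  ET_CUDA_CHECK_OR_RETURN_ERROR(
      cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice));
  return Error::Ok;
}

ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() applies the same pattern to cudaGetLastError() immediately after a kernel launch, which is where launch-configuration errors surface.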

backends/aoti/slim/core/Storage.h

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 
 #ifdef CUDA_AVAILABLE
 #include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
-#include <executorch/backends/cuda/runtime/guard.h>
+#include <executorch/backends/aoti/slim/cuda/guard.h>
 #endif
 
 #include <executorch/backends/aoti/slim/c10/core/Device.h>

backends/aoti/slim/core/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ def define_common_targets():
         "//executorch/backends/aoti/slim/util:size_util",
         "//executorch/runtime/platform:platform",
         "//executorch/backends/aoti/slim/c10/cuda:exception",
-        "//executorch/backends/cuda/runtime:guard",
+        "//executorch/backends/aoti/slim/cuda:guard",
     ],
 )
backends/aoti/slim/cuda/TARGETS

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()

backends/aoti/slim/cuda/guard.cpp

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/aoti/slim/cuda/guard.h>
+#include <executorch/runtime/platform/log.h>
+#include <limits>
+#include <unordered_map>
+
+namespace executorch::backends::cuda {
+
+namespace {
+// Thread-local stream storage (private to this file)
+thread_local std::unordered_map<DeviceIndex, cudaStream_t> current_streams_;
+} // namespace
+
+Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) {
+  if (device_index == -1) {
+    // Get current device if not specified
+    // CUDA API returns int, explicit cast to DeviceIndex (int8_t) following
+    // ATen
+    int tmp_device = -1;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&tmp_device));
+    device_index = static_cast<DeviceIndex>(tmp_device);
+  }
+
+  current_streams_[device_index] = stream;
+  return Error::Ok;
+}
+
+Result<cudaStream_t> getCurrentCUDAStream(DeviceIndex device_index) {
+  if (device_index == -1) {
+    // CUDA API returns int, explicit cast to DeviceIndex (int8_t) following
+    // ATen
+    int tmp_device = -1;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&tmp_device));
+    device_index = static_cast<DeviceIndex>(tmp_device);
+  }
+
+  auto it = current_streams_.find(device_index);
+  if (it != current_streams_.end()) {
+    return it->second;
+  }
+
+  cudaStream_t stream;
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&stream));
+  setCurrentCUDAStream(stream, device_index);
+  return stream;
+}
+
+CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept
+    : original_device_index_(other.original_device_index_),
+      current_device_index_(other.current_device_index_) {
+  // Mark the moved-from object as "already restored" so its destructor doesn't
+  // try to restore the device
+  other.original_device_index_ = other.current_device_index_;
+}
+
+CUDAGuard::~CUDAGuard() {
+  if (original_device_index_ != current_device_index_) {
+    // DeviceIndex (int8_t) implicitly widens to int for cudaSetDevice
+    cudaError_t err = cudaSetDevice(original_device_index_);
+    if (err != cudaSuccess) {
+      ET_LOG(
+          Error,
+          "~CUDAGuard: Failed to restore device to %d: %s",
+          static_cast<int>(original_device_index_),
+          cudaGetErrorString(err));
+    }
+  }
+}
+
+Error CUDAGuard::set_index(DeviceIndex device_index) {
+  // CUDA API returns int, explicit cast to DeviceIndex (int8_t) following ATen
+  int tmp_device = -1;
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&tmp_device));
+
+  original_device_index_ = static_cast<DeviceIndex>(tmp_device);
+  current_device_index_ = device_index;
+
+  if (current_device_index_ != original_device_index_) {
+    // DeviceIndex (int8_t) implicitly widens to int for cudaSetDevice
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaSetDevice(current_device_index_));
+  }
+
+  return Error::Ok;
+}
+
+Result<CUDAGuard> CUDAGuard::create(DeviceIndex device_index) {
+  CUDAGuard guard; // Fixed: Removed () to create a variable, not a function
+  ET_CHECK_OK_OR_RETURN_ERROR(guard.set_index(device_index));
+  return guard;
+}
+
+CUDAStreamGuard::CUDAStreamGuard(CUDAStreamGuard&& other) noexcept
+    : device_guard_(std::move(other.device_guard_)),
+      original_stream_(other.original_stream_),
+      current_stream_(other.current_stream_),
+      device_index_(other.device_index_) {
+  // Mark the moved-from object as "already restored" so its destructor doesn't
+  // try to restore the stream
+  other.original_stream_ = other.current_stream_;
+}
+
+CUDAStreamGuard::~CUDAStreamGuard() {
+  // Restore the original stream unless this object was moved-from.
+  // After a move, original_stream_ == current_stream_, which indicates
+  // the moved-from object should not restore.
+  // Note: nullptr is a valid stream value (represents the default stream),
+  // so we must restore even if original_stream_ is nullptr.
+  if (original_stream_ != current_stream_) {
+    Error err = setCurrentCUDAStream(original_stream_, device_index_);
+    if (err != Error::Ok) {
+      ET_LOG(
+          Error,
+          "~CUDAStreamGuard: Failed to restore stream for device %d",
+          static_cast<int>(device_index_));
+    }
+  }
+}
+
+Error CUDAStreamGuard::set_stream(
+    cudaStream_t stream,
+    DeviceIndex device_index) {
+  auto result = getCurrentCUDAStream(device_index);
+  if (!result.ok()) {
+    ET_LOG(
+        Error,
+        "Failed to get current stream for device %d",
+        static_cast<int>(device_index));
+    return result.error();
+  }
+
+  original_stream_ = result.get();
+  current_stream_ = stream;
+  device_index_ = device_index;
+
+  ET_CHECK_OK_OR_RETURN_ERROR(setCurrentCUDAStream(stream, device_index));
+
+  return Error::Ok;
+}
+
+Result<CUDAStreamGuard> CUDAStreamGuard::create(
+    cudaStream_t stream,
+    DeviceIndex device_index) {
+  auto guard_result = CUDAGuard::create(device_index);
+  ET_CHECK_OK_OR_RETURN_ERROR(guard_result.error());
+
+  CUDAStreamGuard stream_guard(std::move(guard_result.get()));
+  ET_CHECK_OK_OR_RETURN_ERROR(stream_guard.set_stream(stream, device_index));
+
+  return stream_guard;
+}
+
+} // namespace executorch::backends::cuda
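
And a minimal sketch of the RAII usage these guards enable, assuming the names (CUDAStreamGuard, DeviceIndex, Error) are available via guard.h as used above; run_with_stream is a hypothetical caller, not part of this commit:

// Hypothetical caller, for illustration only; not part of this commit.
#include <executorch/backends/aoti/slim/cuda/guard.h>

namespace executorch::backends::cuda {

Error run_with_stream(cudaStream_t stream, DeviceIndex device_index) {
  // create() switches to `device_index` and makes `stream` current for it;
  // the destructors restore both when `guard` goes out of scope, including
  // on every early-return path.
  auto guard = CUDAStreamGuard::create(stream, device_index);
  ET_CHECK_OK_OR_RETURN_ERROR(guard.error());
  // ... enqueue kernels and copies on `stream` here ...
  return Error::Ok;
}

} // namespace executorch::backends::cuda

The move constructors matter here because create() returns the guard by value through a Result; marking the moved-from object as "already restored" keeps its destructor from undoing the device and stream state the new owner still relies on.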
