1212
1313#ifdef CUDA_AVAILABLE
1414#include < executorch/backends/aoti/slim/c10/cuda/Exception.h>
15- #include < executorch/backends/aoti/slim/ cuda/Guard .h>
15+ #include < executorch/backends/cuda/runtime/guard .h>
1616#endif
1717
1818#include < executorch/backends/aoti/slim/c10/core/Device.h>
@@ -87,24 +87,53 @@ struct DeviceTraits<c10::DeviceType::CPU> {
8787#ifdef CUDA_AVAILABLE
8888// / CUDA specialization of DeviceTraits.
8989// / Provides CUDA memory allocation and copy operations using
90- // / cudaMalloc/cudaFree.
90+ // / cudaMallocAsync/cudaFreeAsync with proper stream handling.
91+ // /
92+ // / IMPORTANT: Callers are expected to set the correct CUDA device and stream
93+ // / using CUDAStreamGuard before calling these methods. This is consistent
94+ // / with PyTorch's CUDACachingAllocator design pattern where the allocator
95+ // / assumes the caller has already set the correct device context.
9196template <>
9297struct DeviceTraits <c10::DeviceType::CUDA> {
/// Allocates CUDA device memory asynchronously on the current stream.
///
/// Mirrors PyTorch's CUDACachingAllocator contract: the caller is expected
/// to have already selected the correct device and stream (e.g. via
/// CUDAStreamGuard). No device guard is created here.
///
/// @param nbytes Number of bytes to allocate.
/// @param device Target CUDA device; its index is used only to look up the
///               currently active stream for that device.
/// @return Pointer to the newly allocated device memory.
static void* allocate(size_t nbytes, const c10::Device& device) {
  // Fetch the stream currently associated with this device index
  // (set by CUDAStreamGuard, if any).
  auto stream_result =
      executorch::backends::cuda::getCurrentCUDAStream(device.index());
  ET_CHECK_MSG(
      stream_result.ok(),
      "Failed to get current CUDA stream for device %d",
      static_cast<int>(device.index()));

  cudaStream_t stream = stream_result.get();

  // Stream-ordered allocation (CUDA 11.2+ memory-pool allocator), ordered
  // with respect to other work already enqueued on `stream`.
  void* data = nullptr;
  ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
  return data;
}
103125
/// Releases CUDA device memory.
///
/// Prefers a stream-ordered free (cudaFreeAsync) on the stream currently
/// associated with the active device, falling back to a synchronous
/// cudaFree when no stream can be obtained. Failures are logged via
/// ET_CUDA_LOG_WARN rather than aborting.
///
/// @param ptr Pointer to device memory to free.
static void free(void* ptr) {
  // NOTE(review): -1 presumably selects the current device in
  // getCurrentCUDAStream — confirm against that function's declaration.
  auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
  if (!stream_result.ok()) {
    // No usable stream: take the synchronous free path instead.
    ET_CUDA_LOG_WARN(cudaFree(ptr));
    return;
  }
  ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
}
109138
110139 // / Copies memory between CPU and CUDA or CUDA and CUDA.
@@ -120,13 +149,11 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
120149 const c10::Device& dst_device,
121150 const c10::Device& src_device) {
122151 cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
123- c10::Device cuda_device = dst_device;
124152
125153 if (src_device.is_cpu ()) {
126154 direction = cudaMemcpyHostToDevice;
127155 } else if (dst_device.is_cpu ()) {
128156 direction = cudaMemcpyDeviceToHost;
129- cuda_device = src_device;
130157 } else {
131158 ET_CHECK_MSG (
132159 src_device.index () == dst_device.index (),
@@ -135,7 +162,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
135162 static_cast <int >(dst_device.index ()));
136163 }
137164
138- cuda::CUDAGuard guard (cuda_device);
139165 ET_CUDA_CHECK (cudaMemcpy (dst, src, nbytes, direction));
140166 }
141167};