Skip to content

Commit 21df00b

Browse files
committed
Update base for Update on "[slimtensor] Add aoti_torch_new_tensor_handle for SlimTensor"
Add `aoti_torch_new_tensor_handle()` - Creates a new tensor handle that shares storage with the original tensor. Uses SlimTensor's copy constructor which shares the SharedPtr<Storage>, so both tensors reference the same memory. Differential Revision: [D90126245](https://our.internmc.facebook.com/intern/diff/D90126245/) [ghstack-poisoned]
2 parents c720797 + 35eb01a commit 21df00b

File tree

271 files changed

+6498
-3136
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

271 files changed

+6498
-3136
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
de4f3c4978b4d36cc0bb8f87c6877a4a040d7ae7
1+
732b11313b2006b4d8649500eaf5567ec6ac1e49

.ci/scripts/setup-samsung-linux-deps.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ install_devicefarm_cli() {
101101
}
102102

103103
reserve_if_needed() {
104+
# Set default value
105+
export DEVICE_RESERVED=0
106+
104107
if ! command -v devicefarm-cli >/dev/null 2>&1; then
105108
echo "[WARN] devicefarm-cli is not installed." >&2
106109
return 1
@@ -144,7 +147,11 @@ reserve_if_needed() {
144147

145148
if (( any_below_threshold )); then
146149
echo "[INFO] Reserving now."
147-
devicefarm-cli -R
150+
if ! devicefarm-cli -R; then
151+
echo "::warning::Failed to reserve a device. No devices are currently available." >&2
152+
echo "[WARN] Device reservation failed - continuing without device." >&2
153+
return 0
154+
fi
148155
else
149156
echo "[INFO] Don't need to be reserved."
150157
fi
@@ -174,7 +181,10 @@ reserve_if_needed() {
174181
if [[ -n "$reservation_id" ]]; then
175182
devicefarm-cli -C "$reservation_id"
176183
devicefarm-cli -E "ls /"
184+
export DEVICE_RESERVED=1
185+
echo "[INFO] Device successfully reserved and connected."
177186
else
187+
echo "::warning::No available devices found." >&2
178188
echo "[WARN] There is no available devices."
179189
fi
180190
}

.ci/scripts/wheel/test_windows.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,7 @@ def test_model_xnnpack(model: Model, quantize: bool) -> None:
3030

3131
if quantize:
3232
quant_type = MODEL_NAME_TO_OPTIONS[str(model)].quantization
33-
model_instance = torch.export.export_for_training(
34-
model_instance, example_inputs
35-
)
33+
model_instance = torch.export.export(model_instance, example_inputs)
3634
model_instance = quantize_xnn(
3735
model_instance.module(), example_inputs, quant_type
3836
)

.github/workflows/pull.yml

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -490,17 +490,19 @@ jobs:
490490
build-tool: buck2
491491
docker-image: ci-image:executorch-ubuntu-22.04-clang12
492492

493-
unittest-arm-backend-with-no-fvp:
494-
name: unittest-arm-backend-with-no-fvp
493+
unittest-arm-backend-with-no-deps:
494+
name: unittest-arm-backend-with-no-deps
495495
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
496496
permissions:
497497
id-token: write
498498
contents: read
499499
strategy:
500500
matrix:
501501
include:
502-
- test_arm_baremetal: test_pytest_ops
503-
- test_arm_baremetal: test_pytest_models
502+
- test_arm_baremetal: test_pytest_ops_no_target
503+
- test_arm_baremetal: test_pytest_ops_tosa
504+
- test_arm_baremetal: test_pytest_models_tosa
505+
- test_arm_baremetal: test_run_tosa
504506
fail-fast: false
505507
with:
506508
runner: linux.2xlarge
@@ -516,7 +518,7 @@ jobs:
516518
source .ci/scripts/utils.sh
517519
install_executorch "--use-pt-pinned-commit"
518520
519-
.ci/scripts/setup-arm-baremetal-tools.sh
521+
.ci/scripts/setup-arm-baremetal-tools.sh --disable-ethos-u-deps
520522
521523
ARM_TEST=${{ matrix.test_arm_baremetal }}
522524
@@ -945,6 +947,12 @@ jobs:
945947
export SAMSUNG_AI_LITECORE_KEY=$SECRET_SAMSUNG_AI_LITECORE_KEY
946948
source .ci/scripts/setup-samsung-linux-deps.sh
947949
950+
# Check if device was reserved
951+
if [[ "${DEVICE_RESERVED:-0}" != "1" ]]; then
952+
echo "::warning::Skipping tests - no Samsung device available"
953+
exit 0
954+
fi
955+
948956
# Test quant models
949957
model_scripts="deeplab_v3 edsr inception_v3 inception_v4 mobilenet_v2 mobilenet_v3 resnet18 resnet50 vit wav2letter"
950958
for m_script in $model_scripts; do
@@ -981,6 +989,12 @@ jobs:
981989
export SAMSUNG_AI_LITECORE_KEY=$SECRET_SAMSUNG_AI_LITECORE_KEY
982990
source .ci/scripts/setup-samsung-linux-deps.sh
983991
992+
# Check if device was reserved
993+
if [[ "${DEVICE_RESERVED:-0}" != "1" ]]; then
994+
echo "::warning::Skipping tests - no Samsung device available"
995+
exit 0
996+
fi
997+
984998
# Test models
985999
python -m unittest discover -s backends/samsung/test/models -p "test_*.py"
9861000

.github/workflows/trunk.yml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -273,21 +273,21 @@ jobs:
273273
# Test selective build
274274
PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}"
275275
276-
test-arm-backend:
277-
name: test-arm-backend
276+
test-arm-backend-ethos-u:
277+
name: test-arm-backend-ethos-u
278278
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
279279
permissions:
280280
id-token: write
281281
contents: read
282282
strategy:
283283
matrix:
284284
include:
285-
- test_arm_baremetal: test_pytest_ops_ethosu_fvp
286-
- test_arm_baremetal: test_pytest_models_ethosu_fvp
287-
- test_arm_baremetal: test_run_ethosu_fvp
288-
- test_arm_baremetal: test_models_tosa
289-
- test_arm_baremetal: test_models_ethos-u55
290-
- test_arm_baremetal: test_models_ethos-u85
285+
- test_arm_baremetal: test_pytest_ops_ethos_u55
286+
- test_arm_baremetal: test_pytest_models_ethos_u55
287+
- test_arm_baremetal: test_run_ethos_u55
288+
- test_arm_baremetal: test_pytest_ops_ethos_u85
289+
- test_arm_baremetal: test_pytest_models_ethos_u85
290+
- test_arm_baremetal: test_run_ethos_u85
291291
- test_arm_baremetal: test_smaller_stories_llama
292292
- test_arm_baremetal: test_memory_allocation
293293
- test_arm_baremetal: test_model_smollm2-135M

CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
/devtools @Gasoonjia
2020

21-
/docs @mergennachin
21+
/docs @mergennachin @AlannaBurke
2222

2323
/examples/apple @shoumikhin
2424
/examples/apple/coreml @cccclai @metascroy @cymbalrush @YifanShenSZ

backends/aoti/slim/core/Storage.h

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
#ifdef CUDA_AVAILABLE
1414
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
15-
#include <executorch/backends/aoti/slim/cuda/Guard.h>
15+
#include <executorch/backends/cuda/runtime/guard.h>
1616
#endif
1717

1818
#include <executorch/backends/aoti/slim/c10/core/Device.h>
@@ -87,24 +87,53 @@ struct DeviceTraits<c10::DeviceType::CPU> {
8787
#ifdef CUDA_AVAILABLE
8888
/// CUDA specialization of DeviceTraits.
8989
/// Provides CUDA memory allocation and copy operations using
90-
/// cudaMalloc/cudaFree.
90+
/// cudaMallocAsync/cudaFreeAsync with proper stream handling.
91+
///
92+
/// IMPORTANT: Callers are expected to set the correct CUDA device and stream
93+
/// using CUDAStreamGuard before calling these methods. This is consistent
94+
/// with PyTorch's CUDACachingAllocator design pattern where the allocator
95+
/// assumes the caller has already set the correct device context.
9196
template <>
9297
struct DeviceTraits<c10::DeviceType::CUDA> {
93-
/// Allocates CUDA device memory.
98+
/// Allocates CUDA device memory on the current stream.
99+
/// Uses cudaMallocAsync for asynchronous allocation on the stream
100+
/// that is currently set via CUDAStreamGuard, similar to how
101+
/// PyTorch's CUDACachingAllocator works.
102+
///
103+
/// NOTE: Caller must ensure the correct device is already set via
104+
/// CUDAStreamGuard. This function does NOT create a device guard internally.
105+
///
94106
/// @param nbytes Number of bytes to allocate.
95-
/// @param device The target CUDA device.
107+
/// @param device The target CUDA device (used to get the stream).
96108
/// @return Pointer to allocated device memory.
97109
static void* allocate(size_t nbytes, const c10::Device& device) {
98-
cuda::CUDAGuard guard(device);
110+
// Get the current stream for this device (set by CUDAStreamGuard if any)
111+
// This follows PyTorch's pattern where the allocator assumes the caller
112+
// has already set the correct device via CUDAStreamGuard.
113+
auto stream_result =
114+
executorch::backends::cuda::getCurrentCUDAStream(device.index());
115+
ET_CHECK_MSG(
116+
stream_result.ok(),
117+
"Failed to get current CUDA stream for device %d",
118+
static_cast<int>(device.index()));
119+
120+
cudaStream_t stream = stream_result.get();
99121
void* data = nullptr;
100-
ET_CUDA_CHECK(cudaMalloc(&data, nbytes));
122+
ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
101123
return data;
102124
}
103125

104-
/// Frees CUDA device memory.
126+
/// Frees CUDA device memory on the current stream.
105127
/// @param ptr Pointer to device memory to free.
106128
static void free(void* ptr) {
107-
ET_CUDA_LOG_WARN(cudaFree(ptr));
129+
// Get the current stream for the current device
130+
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
131+
if (stream_result.ok()) {
132+
ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
133+
} else {
134+
// Fallback to synchronous free if we can't get the stream
135+
ET_CUDA_LOG_WARN(cudaFree(ptr));
136+
}
108137
}
109138

110139
/// Copies memory between CPU and CUDA or CUDA and CUDA.
@@ -120,13 +149,11 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
120149
const c10::Device& dst_device,
121150
const c10::Device& src_device) {
122151
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
123-
c10::Device cuda_device = dst_device;
124152

125153
if (src_device.is_cpu()) {
126154
direction = cudaMemcpyHostToDevice;
127155
} else if (dst_device.is_cpu()) {
128156
direction = cudaMemcpyDeviceToHost;
129-
cuda_device = src_device;
130157
} else {
131158
ET_CHECK_MSG(
132159
src_device.index() == dst_device.index(),
@@ -135,7 +162,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
135162
static_cast<int>(dst_device.index()));
136163
}
137164

138-
cuda::CUDAGuard guard(cuda_device);
139165
ET_CUDA_CHECK(cudaMemcpy(dst, src, nbytes, direction));
140166
}
141167
};

backends/aoti/slim/core/targets.bzl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def define_common_targets():
1818
"//executorch/backends/aoti/slim/util:size_util",
1919
"//executorch/runtime/platform:platform",
2020
"//executorch/backends/aoti/slim/c10/cuda:exception",
21-
"//executorch/backends/aoti/slim/cuda:guard",
21+
"//executorch/backends/cuda/runtime:guard",
2222
],
2323
)
2424

@@ -40,6 +40,5 @@ def define_common_targets():
4040
"//executorch/backends/aoti/slim/util:size_util",
4141
"//executorch/runtime/platform:platform",
4242
"//executorch/backends/aoti/slim/c10/cuda:exception",
43-
"//executorch/backends/aoti/slim/cuda:guard",
4443
],
4544
)

backends/aoti/slim/core/test/test_slimtensor_basic.cpp

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,7 @@ TEST_P(SlimTensorBasicDeviceTest, ConstructWithStorage) {
8080
EXPECT_EQ(tensor.numel(), 24u);
8181
EXPECT_TRUE(tensor.is_contiguous());
8282

83-
if (device().is_cpu()) {
84-
EXPECT_TRUE(tensor.is_cpu());
85-
} else {
86-
EXPECT_TRUE(tensor.is_cuda());
87-
}
83+
EXPECT_EQ(device().is_cpu(), tensor.is_cpu());
8884
}
8985

9086
TEST_P(SlimTensorBasicDeviceTest, ConstructWithStorageOffset) {
@@ -153,14 +149,11 @@ TEST_P(SlimTensorBasicDeviceTest, Dtype) {
153149
TEST_P(SlimTensorBasicDeviceTest, Device) {
154150
SlimTensor tensor = make_2x3_tensor();
155151

156-
if (device().is_cpu()) {
157-
EXPECT_TRUE(tensor.is_cpu());
158-
EXPECT_EQ(tensor.device_type(), c10::DeviceType::CPU);
159-
} else {
160-
EXPECT_TRUE(tensor.is_cuda());
161-
EXPECT_EQ(tensor.device_type(), c10::DeviceType::CUDA);
162-
}
152+
// Check device type and index
153+
EXPECT_EQ(tensor.device_type(), device().type());
163154
EXPECT_EQ(tensor.device_index(), device().index());
155+
EXPECT_EQ(tensor.is_cpu(), device().is_cpu());
156+
EXPECT_EQ(tensor.is_cuda(), device().is_cuda());
164157
}
165158

166159
TEST_P(SlimTensorBasicDeviceTest, Numel) {

backends/aoti/slim/core/test/test_storage.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ TEST_P(StorageSharedPtrParamTest, SharedOwnership) {
367367
Storage storage1(new MaybeOwningStorage(device(), kNbytes));
368368
void* data_ptr = storage1->data();
369369

370-
Storage storage2 = storage1;
370+
Storage storage2 = storage1; // Copy, not reference - increments ref count
371371

372372
EXPECT_EQ(storage1.use_count(), 2);
373373
EXPECT_EQ(storage2.use_count(), 2);

0 commit comments

Comments
 (0)