Skip to content

Commit 7992fdd

Browse files
committed
Add aoti_torch_item_bool and aoti_torch_new_tensor_handle in AOTI CUDA shim
As titled. This PR adds the `aoti_torch_item_bool` implementation and the `aoti_torch_new_tensor_handle` implementation. `aoti_torch_item_bool` will be used when the model calls `scalar_tensor.item()` and expects to receive a bool. `aoti_torch_new_tensor_handle` returns a tensor handle that shares the same data pointer as another tensor.
1 parent 6785032 commit 7992fdd

File tree

5 files changed

+220
-16
lines changed

5 files changed

+220
-16
lines changed

Makefile

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -87,21 +87,22 @@
8787
#
8888
# ==============================================================================
8989

90-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-cpu whisper-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
90+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal whisper-cuda whisper-debug-cuda whisper-cpu whisper-metal llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
9191

9292
help:
93-
@echo "This Makefile adds targets to build runners for various models on various backends. Run using `make <target>`. Available targets:"
94-
@echo " voxtral-cuda - Build Voxtral runner with CUDA backend"
95-
@echo " voxtral-cpu - Build Voxtral runner with CPU backend"
96-
@echo " voxtral-metal - Build Voxtral runner with Metal backend (macOS only)"
97-
@echo " whisper-cuda - Build Whisper runner with CUDA backend"
98-
@echo " whisper-cpu - Build Whisper runner with CPU backend"
99-
@echo " whisper-metal - Build Whisper runner with Metal backend (macOS only)"
100-
@echo " llama-cpu - Build Llama runner with CPU backend"
101-
@echo " llava-cpu - Build Llava runner with CPU backend"
102-
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
103-
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
104-
@echo " clean - Clean build artifacts"
93+
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
94+
@echo " voxtral-cuda - Build Voxtral runner with CUDA backend"
95+
@echo " voxtral-cpu - Build Voxtral runner with CPU backend"
96+
@echo " voxtral-metal - Build Voxtral runner with Metal backend (macOS only)"
97+
@echo " whisper-cuda - Build Whisper runner with CUDA backend"
98+
@echo " whisper-debug-cuda - Build Whisper runner with CUDA backend (debug mode)"
99+
@echo " whisper-cpu - Build Whisper runner with CPU backend"
100+
@echo " whisper-metal - Build Whisper runner with Metal backend (macOS only)"
101+
@echo " llama-cpu - Build Llama runner with CPU backend"
102+
@echo " llava-cpu - Build Llava runner with CPU backend"
103+
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
104+
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
105+
@echo " clean - Clean build artifacts"
105106

106107
voxtral-cuda:
107108
@echo "==> Building and installing ExecuTorch with CUDA..."
@@ -139,6 +140,15 @@ whisper-cuda:
139140
@echo "✓ Build complete!"
140141
@echo " Binary: cmake-out/examples/models/whisper/whisper_runner"
141142

143+
whisper-debug-cuda:
144+
@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
145+
cmake --workflow --preset llm-debug-cuda
146+
@echo "==> Building Whisper runner with CUDA (debug mode)..."
147+
cd examples/models/whisper && cmake --workflow --preset whisper-debug-cuda
148+
@echo ""
149+
@echo "✓ Build complete!"
150+
@echo " Binary: cmake-out/examples/models/whisper/whisper_runner"
151+
142152
whisper-cpu:
143153
@echo "==> Building and installing ExecuTorch..."
144154
cmake --workflow --preset llm-release

backends/aoti/common_shims.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,18 @@ aoti_torch_get_device_index(Tensor* tensor, int32_t* ret_device_index);
5959
AOTI_SHIM_EXPORT AOTITorchError
6060
aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
6161

62+
// Device type query (optional; backends may implement this to return device
63+
// type constants such as CUDA). Declared here so common shims can dispatch
64+
// appropriately.
65+
AOTI_SHIM_EXPORT AOTITorchError
66+
aoti_torch_get_device_type(Tensor* tensor, int32_t* ret_device_type);
67+
6268
// Utility functions for device and layout information
6369
AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
6470
AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided();
6571
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32();
6672
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16();
73+
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bool();
6774
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
6875
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
6976
AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32();

backends/cuda/runtime/shims/memory.cpp

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ namespace executorch::backends::cuda {
2424

2525
using executorch::aten::SizesType;
2626
using executorch::aten::StridesType;
27+
using executorch::backends::aoti::aoti_torch_dtype_bool;
2728
using executorch::backends::aoti::aoti_torch_get_device_index;
2829
using executorch::backends::aoti::aoti_torch_get_dtype;
2930
using executorch::backends::aoti::aoti_torch_get_sizes;
@@ -682,6 +683,50 @@ AOTITorchError aoti_torch__reinterpret_tensor(
682683
return Error::Ok;
683684
}
684685

686+
// item implementation for scalar tensors
687+
AOTITorchError aoti_torch_item_bool(Tensor* tensor, bool* ret_value) {
688+
// Validate that tensor is 0D
689+
ET_CHECK_OR_RETURN_ERROR(
690+
tensor->dim() == 0,
691+
InvalidArgument,
692+
"aoti_torch_item_bool failed: tensor is not 0D");
693+
694+
// Validate that tensor dtype is bool
695+
int32_t dtype;
696+
ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(tensor, &dtype));
697+
698+
ET_CHECK_OR_RETURN_ERROR(
699+
dtype == aoti_torch_dtype_bool(), // PyTorch dtype code for bool
700+
InvalidArgument,
701+
"aoti_torch_item_bool failed: tensor dtype is not bool");
702+
703+
// Get the data pointer
704+
const void* data_ptr = tensor->const_data_ptr();
705+
ET_CHECK_OR_RETURN_ERROR(
706+
data_ptr != nullptr,
707+
InvalidArgument,
708+
"aoti_torch_item_bool failed: tensor data pointer is null");
709+
710+
// Check if tensor is on CUDA or CPU
711+
cudaPointerAttributes attributes{};
712+
ET_CUDA_CHECK_OR_RETURN_ERROR(
713+
cudaPointerGetAttributes(&attributes, data_ptr));
714+
715+
if (attributes.type == cudaMemoryTypeDevice) {
716+
// CUDA memory case: copy from device to host
717+
bool device_value;
718+
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy(
719+
&device_value, data_ptr, sizeof(bool), cudaMemcpyDeviceToHost));
720+
*ret_value = device_value;
721+
} else {
722+
// CPU memory case: direct access
723+
const bool* bool_ptr = static_cast<const bool*>(data_ptr);
724+
*ret_value = *bool_ptr;
725+
}
726+
727+
return Error::Ok;
728+
}
729+
685730
AOTITorchError aoti_torch_new_tensor_handle(
686731
Tensor* orig_handle,
687732
Tensor** new_handle) {
@@ -771,6 +816,93 @@ AOTITorchError aoti_torch_new_tensor_handle(
771816

772817
return Error::Ok;
773818
}
819+
820+
// Creates a new tensor handle in *ret_dst that is a view of `src`'s
// underlying storage, mirroring src's shape, strides and dtype, and
// registers it with the internal reference-counting bookkeeping.
//
// Fix: the previous body contained a duplicated, non-compiling chunk that
// referenced undeclared `ndim`/`sizes_ptr`/`strides_ptr`/`dtype` and
// re-declared `sizes`, `strides` and `tensor` in the same scope. That dead
// chunk has been removed; the coherent src-based path below is kept.
//
// NOTE(review): the declaration comment in memory.h says the destination
// keeps its *own* original shape/strides, but this implementation copies
// src's shape/strides — confirm which contract is intended.
AOTI_SHIM_EXPORT AOTITorchError
aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst) {
  if (src == nullptr || ret_dst == nullptr) {
    return Error::InvalidArgument;
  }

  // Get the original data pointer from the source tensor.
  void* data_ptr = src->mutable_data_ptr();
  ET_CHECK_OR_RETURN_ERROR(
      data_ptr != nullptr,
      InvalidArgument,
      "Source tensor has null data pointer");

  // The memory must already be tracked by the reference-counting system.
  auto memory_it = memory_to_n_tensor.find(data_ptr);
  ET_CHECK_OR_RETURN_ERROR(
      memory_it != memory_to_n_tensor.end(),
      InvalidArgument,
      "Memory address %p is not being tracked by reference counting system",
      data_ptr);

  // Mirror the source tensor's dtype, sizes and strides.
  int32_t dtype = 0;
  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(src, &dtype));

  int64_t* view_sizes_ptr;
  if (aoti_torch_get_sizes(src, &view_sizes_ptr) != Error::Ok) {
    return Error::Internal;
  }
  int64_t* view_strides_ptr;
  if (aoti_torch_get_strides(src, &view_strides_ptr) != Error::Ok) {
    return Error::Internal;
  }
  std::vector<SizesType> sizes =
      convert_sizes_to_vector(src->dim(), view_sizes_ptr);
  std::vector<StridesType> strides =
      convert_strides_to_vector(src->dim(), view_sizes_ptr, view_strides_ptr);

  // Create a new tensor view that reinterprets the same memory — this is a
  // view, not a copy; the data pointer is shared with `src`.
  std::shared_ptr<Tensor> tensor = make_tensor(
      sizes, // Same sizes as source
      data_ptr, // Reuse the same memory from source tensor
      {}, // dim_order (empty, will be auto-generated)
      strides, // Same strides as source
      dtype_to_scalar_type(dtype) // Same dtype as source
  );

  ET_CHECK_OR_RETURN_ERROR(
      tensor != nullptr,
      InvalidArgument,
      "Failed to create reinterpreted tensor view");

  // Store the tensor so it doesn't get destroyed.
  tensors.insert(tensor);

  *ret_dst = tensor.get();

  // Increment the reference count for this memory address only if it is
  // owned by a tensor (NOT_OWN memory stays NOT_OWN).
  memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN
      ? NOT_OWN
      : memory_to_n_tensor[data_ptr] + 1;

  return Error::Ok;
}
774906
} // extern "C"
775907

776908
} // namespace executorch::backends::cuda

backends/cuda/runtime/shims/memory.h

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,30 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking);
161161
* @return Error::Ok on success, appropriate error code on failure:
162162
* - Error::InvalidArgument: null pointers or invalid parameters
163163
*/
164-
AOTITorchError aoti_torch_new_tensor_handle(
165-
Tensor* orig_handle,
166-
Tensor** new_handle);
164+
AOTI_SHIM_EXPORT AOTITorchError
165+
aoti_torch_new_tensor_handle(Tensor* orig_handle, Tensor** new_handle);
166+
167+
// Function to retrieve boolean value from a 0D boolean tensor
168+
AOTI_SHIM_EXPORT AOTITorchError
169+
aoti_torch_item_bool(Tensor* tensor, bool* ret_value);
170+
171+
/**
172+
* Reinterprets the destination tensor to be a view of the source tensor's
173+
* data.
174+
*
175+
* This function makes the destination tensor a view of the source tensor's
176+
* underlying data, but with the destination tensor's original shape and
177+
* strides. The number of elements in both tensors must match. The destination
178+
* tensor handle will be updated to point to a new tensor view.
179+
*
180+
* @param src The source tensor providing the data.
181+
* @param ret_dst On input, a pointer to the destination tensor. On output,
182+
* this will be updated to point to the new tensor view.
183+
*
184+
* @return Error::Ok on success, appropriate error code on failure.
185+
*/
186+
AOTI_SHIM_EXPORT AOTITorchError
187+
aoti_torch_assign_tensors_out(Tensor* src, Tensor** ret_dst);
167188

168189
// Function to clear all tensors from internal storage
169190
AOTI_SHIM_EXPORT void clear_all_tensors();

examples/models/whisper/CMakePresets.json

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,20 @@
2828
"rhs": "Linux"
2929
}
3030
},
31+
{
32+
"name": "whisper-debug-cuda",
33+
"displayName": "Whisper runner (CUDA Debug)",
34+
"inherits": ["whisper-base"],
35+
"cacheVariables": {
36+
"EXECUTORCH_BUILD_CUDA": "ON",
37+
"CMAKE_BUILD_TYPE": "Debug"
38+
},
39+
"condition": {
40+
"lhs": "${hostSystemName}",
41+
"type": "equals",
42+
"rhs": "Linux"
43+
}
44+
},
3145
{
3246
"name": "whisper-metal",
3347
"displayName": "Whisper runner (Metal)",
@@ -55,6 +69,12 @@
5569
"configurePreset": "whisper-cuda",
5670
"targets": ["whisper_runner"]
5771
},
72+
{
73+
"name": "whisper-debug-cuda",
74+
"displayName": "Build Whisper runner (CUDA Debug)",
75+
"configurePreset": "whisper-debug-cuda",
76+
"targets": ["whisper_runner"]
77+
},
5878
{
5979
"name": "whisper-metal",
6080
"displayName": "Build Whisper runner (Metal)",
@@ -91,6 +111,20 @@
91111
}
92112
]
93113
},
114+
{
115+
"name": "whisper-debug-cuda",
116+
"displayName": "Configure and build Whisper runner (CUDA Debug)",
117+
"steps": [
118+
{
119+
"type": "configure",
120+
"name": "whisper-debug-cuda"
121+
},
122+
{
123+
"type": "build",
124+
"name": "whisper-debug-cuda"
125+
}
126+
]
127+
},
94128
{
95129
"name": "whisper-metal",
96130
"displayName": "Configure and build Whisper runner (Metal)",

0 commit comments

Comments
 (0)