Skip to content

Commit f8c3ea2

Browse files
committed
Update the CUDA delegate resource-freeing pipeline to be safe and segfault-free
Pull Request resolved: #14905 This diff revises the `clear_all_tensors()` function and enables it during the backend destroy stage. Furthermore, we defer deletion of the container handle to the OS to avoid a potential segfault when more than one .so file is loaded. ghstack-source-id: 314984329 @exported-using-ghexport Differential Revision: [D84135792](https://our.internmc.facebook.com/intern/diff/D84135792/)
1 parent 13bdf55 commit f8c3ea2

File tree

2 files changed

+19
-26
lines changed

2 files changed

+19
-26
lines changed

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -286,18 +286,6 @@ class ET_EXPERIMENTAL CudaBackend final
286286
i);
287287
}
288288

289-
// Clean up GPU tensors that we created (ExecuTorch tensors are always
290-
// CPU, so all GPU tensors are our copies)
291-
for (int i = 0; i < n_inputs; i++) {
292-
// All GPU input tensors were created by us, delete them
293-
aoti_torch_delete_tensor_object(gpu_inputs[i]);
294-
}
295-
296-
for (int i = 0; i < n_outputs; i++) {
297-
// All GPU output tensors were created by us, delete them
298-
aoti_torch_delete_tensor_object(gpu_outputs[i]);
299-
}
300-
301289
return Error::Ok;
302290
}
303291

@@ -318,16 +306,13 @@ class ET_EXPERIMENTAL CudaBackend final
318306
handle->cuda_stream = nullptr;
319307
}
320308

321-
// Delete the container BEFORE closing the shared library
322-
if (handle->container_handle != nullptr) {
323-
AOTIRuntimeError delete_result =
324-
AOTInductorModelContainerDelete(handle->container_handle);
325-
ET_CHECK_OR_LOG_ERROR(
326-
delete_result == Error::Ok,
327-
"Failed to delete AOTInductorModelContainer with error code %d",
328-
delete_result);
329-
handle->container_handle = nullptr;
330-
}
309+
// NOTE: AOTInductorModelContainerDelete does not work correctly with
310+
// multiple .so files. Deleting one container frees shared resources,
311+
// which causes segmentation faults when attempting to delete other
312+
// containers. As a workaround, we skip explicit container deletion
313+
// and defer cleanup to the OS.
314+
// TODO(gasoonjia): Find a proper solution for safe container deletion.
315+
// AOTInductorModelContainerDelete(handle->container_handle);
331316

332317
// Now close the shared library
333318
if (handle->so_handle != nullptr) {
@@ -346,6 +331,7 @@ class ET_EXPERIMENTAL CudaBackend final
346331
}
347332

348333
delete handle;
334+
clear_all_tensors();
349335
}
350336
};
351337

backends/cuda/runtime/shims/memory.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,14 +271,21 @@ void clear_all_tensors() {
271271
// Use aoti_torch_delete_tensor_object to properly delete each tensor
272272
// Note: We need to collect tensor pointers first since deletion modifies the
273273
// set
274-
auto old_tensors =
275-
std::move(tensors); // tensors is now empty and no need to copy
276-
for (const auto& tensor_shared : old_tensors) {
277-
aoti_torch_delete_tensor_object(tensor_shared.get());
274+
std::vector<Tensor*> tensor_ptrs;
275+
tensor_ptrs.reserve(tensors.size());
276+
for (const auto& tensor_shared : tensors) {
277+
tensor_ptrs.push_back(tensor_shared.get());
278+
}
279+
280+
// Now delete each tensor - this will modify the global tensors set
281+
for (Tensor* tensor_ptr : tensor_ptrs) {
282+
aoti_torch_delete_tensor_object(tensor_ptr);
278283
}
279284

280285
// tensors set should now be empty, but ensure it's cleared
281286
tensors.clear();
287+
288+
ET_LOG(Info, "Cleared all tensors");
282289
}
283290

284291
AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {

0 commit comments

Comments
 (0)