fix(exla): use passed device id and protect against some segfaults

polvalente · polvalente · commit c772f3c687a8 · 2024-05-01T23:39:16.000-03:00
diff --git a/exla/c_src/exla/exla.cc b/exla/c_src/exla/exla.cc
@@ -213,12 +213,17 @@ ERL_NIF_TERM create_buffer_from_device_pointer(ErlNifEnv* env, int argc, const E
 
   void* ptr;
   if (pointer_kind == "local") {
+    if (pointer_vec.size() != sizeof(void*)) {
+      // This helps prevent segfaults if someone passes an IPC handle instead of
+      // a local pointer.
+      return exla::nif::error(env, "Invalid pointer size for selected mode.");
+    }
     unsigned char* bytePtr = reinterpret_cast<unsigned char*>(&ptr);
     for (size_t i = 0; i < sizeof(void*); i++) {
       bytePtr[i] = pointer_vec[i];
     }
   } else if (pointer_kind == "cuda_ipc") {
-    auto result = get_pointer_for_ipc_handle(pointer_vec);
+    auto result = get_pointer_for_ipc_handle(pointer_vec, device_id);
     if (result.second) {
       return exla::nif::error(env, "Unable to get pointer for IPC handle.");
     }
diff --git a/exla/c_src/exla/exla_cuda.cc b/exla/c_src/exla/exla_cuda.cc
@@ -20,7 +20,12 @@ std::pair<std::vector<unsigned char>, int> get_cuda_ipc_handle(std::uintptr_t pt
   return std::make_pair(result, status != cudaSuccess);
 }
 
-std::pair<void*, int> get_pointer_for_ipc_handle(std::vector<int64_t> handle_list) {
+std::pair<void*, int> get_pointer_for_ipc_handle(std::vector<int64_t> handle_list, int device_id) {
+  if (handle_list.size() != sizeof(cudaIpcMemHandle_t)) {
+    printf("Error: Invalid CUDA IPC memory handle size\n");
+    return std::make_pair(nullptr, 1);  // Return with error status
+  }
+
   unsigned char ipc_handle_data[sizeof(cudaIpcMemHandle_t)];
   for (int i = 0; i < sizeof(cudaIpcMemHandle_t); i++) {
     ipc_handle_data[i] = (uint8_t)handle_list[i];
@@ -30,7 +35,7 @@ std::pair<void*, int> get_pointer_for_ipc_handle(std::vector<int64_t> handle_lis
   memcpy(&ipc_handle, ipc_handle_data, sizeof(cudaIpcMemHandle_t));
 
   int* ptr;
-  cudaError_t cuda_status = cudaSetDevice(0);  // Assuming device 0, change as needed
+  cudaError_t cuda_status = cudaSetDevice(device_id);  // Use the device id supplied by the caller
   if (cuda_status != cudaSuccess) {
     printf("Error setting CUDA device: %s\n", cudaGetErrorString(cuda_status));
     return std::make_pair(nullptr, 1);  // Return with error status
diff --git a/exla/c_src/exla/exla_cuda.h b/exla/c_src/exla/exla_cuda.h
@@ -4,4 +4,4 @@
 #include <vector>
 
 std::pair<std::vector<unsigned char>, int> get_cuda_ipc_handle(std::uintptr_t);
-std::pair<void*, int> get_pointer_for_ipc_handle(std::vector<int64_t>);
+std::pair<void*, int> get_pointer_for_ipc_handle(std::vector<int64_t>, int);
diff --git a/exla/test/exla/device_memory_sharing_test.exs b/exla/test/exla/device_memory_sharing_test.exs
@@ -25,4 +25,16 @@ defmodule EXLA.DeviceMemorySharingTest do
       assert Nx.to_binary(t1) == Nx.to_binary(t2)
     end
   end
+
+  @tag :cuda_required
+  test "ipc handles don't crash the runtime when :local mode is selected" do
+    assert {:error, ~c"Invalid pointer size for selected mode."} ==
+             Nx.from_pointer(
+               {EXLA.Backend, client_name: :cuda},
+               Enum.to_list(0..63),
+               {:f, 32},
+               {1},
+               mode: :local
+             )
+  end
 end