Skip to content

Commit 366c763

Browse files
committed
Make Voxtral work
1 parent c838eee commit 366c763

File tree

5 files changed

+28
-9
lines changed

5 files changed

+28
-9
lines changed

backends/aoti/utils.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
3434
// Convert based on known PyTorch dtype codes (without CUDA-specific
3535
// dependency)
3636
switch (dtype) {
37+
case 4: // PyTorch's int64 dtype code
38+
return executorch::aten::ScalarType::Long;
3739
case 6: // PyTorch's float32 dtype code
3840
return executorch::aten::ScalarType::Float;
3941
case 15: // PyTorch's bfloat16 dtype code

backends/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ executorch_target_link_options_shared_lib(aoti_cuda)
6363

6464
# Add runtime
6565
add_executable(voxtral_runner tests/voxtral_runner.cpp)
66-
target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor)
66+
target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor portable_ops_lib)
6767

6868
install(
6969
TARGETS aoti_cuda

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -131,15 +131,20 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
131131
// Generate dynamic temporary file path
132132
filesystem::path temp_dir = filesystem::temp_directory_path();
133133
filesystem::path so_path =
134-
temp_dir / ("aoti_cuda_" + to_string(getpid()) + ".so");
134+
temp_dir / (so_blob_key + to_string(getpid()) + ".so");
135135

136136
// Create a temporary file
137137
ofstream outfile(so_path.c_str(), ios::binary);
138138

139139
// Write the ELF buffer to the temporary file
140+
ET_LOG(
141+
Info,
142+
"Writing %zu bytes to %s",
143+
aoti_cuda_buffer->size(),
144+
so_path.c_str());
140145
outfile.write(
141-
(char*)aoti_cuda_buffer->data(),
142-
sizeof(void*) * aoti_cuda_buffer->size());
146+
static_cast<const char*>(aoti_cuda_buffer->data()),
147+
aoti_cuda_buffer->size());
143148

144149
// Finish writing the file to disk
145150
outfile.close();

backends/cuda/runtime/shims/utils.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ namespace cuda {
4040

4141
// Enum for supported data types in et-cuda backend
4242
enum class SupportedDTypes : int32_t {
43+
INT64 = 4, // PyTorch's int64 dtype code
4344
FLOAT32 = 6, // PyTorch's float32 dtype code
4445
BFLOAT16 = 15, // PyTorch's bfloat16 dtype code
4546
};
@@ -100,6 +101,7 @@ using AOTITorchError = Error;
100101
// Helper function to check if a dtype is supported in ET CUDA backend
101102
inline bool is_dtype_supported_in_et_cuda(int32_t dtype) {
102103
switch (dtype) {
104+
case static_cast<int32_t>(SupportedDTypes::INT64):
103105
case static_cast<int32_t>(SupportedDTypes::FLOAT32):
104106
case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
105107
return true;
@@ -113,8 +115,9 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
113115
ET_CHECK_OR_RETURN_ERROR(
114116
is_dtype_supported_in_et_cuda(dtype),
115117
InvalidArgument,
116-
"Unsupported dtype: %d. Supported dtypes: %d (float32), %d (bfloat16)",
118+
"Unsupported dtype: %d. Supported dtypes: %d (int64), %d (float32), %d (bfloat16)",
117119
dtype,
120+
static_cast<int32_t>(SupportedDTypes::INT64),
118121
static_cast<int32_t>(SupportedDTypes::FLOAT32),
119122
static_cast<int32_t>(SupportedDTypes::BFLOAT16));
120123

backends/cuda/tests/voxtral_runner.cpp

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,9 @@ int main(int argc, char** argv) {
136136

137137
const TensorPtr audio_input = create_audio_input();
138138
std::vector<EValue> inputs;
139-
inputs.emplace_back(audio_input);
139+
std::vector<TensorPtr> owned_inputs;
140+
owned_inputs.emplace_back(audio_input);
141+
inputs.emplace_back(*audio_input);
140142

141143
const auto run_start = Clock::now();
142144
Result<std::vector<EValue>> output_result =
@@ -171,7 +173,9 @@ int main(int argc, char** argv) {
171173

172174
const TensorPtr token_ids = create_token_ids_input();
173175
std::vector<EValue> inputs;
174-
inputs.emplace_back(token_ids);
176+
std::vector<TensorPtr> owned_inputs;
177+
owned_inputs.emplace_back(token_ids);
178+
inputs.emplace_back(*token_ids);
175179

176180
const auto run_start = Clock::now();
177181
auto token_output_result = module.execute("token_embedding", inputs);
@@ -203,17 +207,22 @@ int main(int argc, char** argv) {
203207
text_timing.load_ms = load_ms;
204208

205209
std::vector<EValue> inputs;
210+
std::vector<TensorPtr> owned_inputs;
206211
if (token_executed) {
207212
if (token_output.isTensor()) {
208213
inputs.emplace_back(token_output);
209214
}
210215
}
211216

212217
if (inputs.empty()) {
213-
inputs.emplace_back(create_fallback_text_embedding());
218+
auto fallback_embedding = create_fallback_text_embedding();
219+
owned_inputs.emplace_back(fallback_embedding);
220+
inputs.emplace_back(*fallback_embedding);
214221
}
215222

216-
inputs.emplace_back(create_positions_input());
223+
auto positions = create_positions_input();
224+
owned_inputs.emplace_back(positions);
225+
inputs.emplace_back(*positions);
217226

218227
const auto run_start = Clock::now();
219228
Result<std::vector<EValue>> output_result =

0 commit comments

Comments (0)