
Commit c225f96

init
1 parent 3dbc15b commit c225f96

11 files changed, +880 -7 lines changed

backends/aoti/common_shims.cpp

Lines changed: 4 additions & 0 deletions
@@ -164,6 +164,10 @@ int32_t aoti_torch_layout_strided() {
 }
 
 // Dtype constants - these return the PyTorch dtype codes
+int32_t aoti_torch_dtype_float16() {
+  return 5; // PyTorch's float16 dtype code
+}
+
 int32_t aoti_torch_dtype_float32() {
   return 6; // PyTorch's float32 dtype code
 }
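
A minimal usage sketch (not taken from the commit) of how shim-side code can branch on the new constant, using the existing aoti_torch_get_dtype shim; `t` is a hypothetical tensor handle, includes/namespaces are abbreviated, and the Error::Ok comparison assumes AOTITorchError aliases the ExecuTorch Error enum, as the return statements elsewhere in this commit suggest:

#include "backends/aoti/common_shims.h"

// Hedged sketch: true when `t` reports PyTorch dtype code 5 (float16);
// any shim error is collapsed to false for brevity.
inline bool is_half_precision(Tensor* t) {
  int32_t dtype = 0;
  if (aoti_torch_get_dtype(t, &dtype) != Error::Ok) {
    return false;
  }
  return dtype == aoti_torch_dtype_float16();
}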

backends/aoti/common_shims.h

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 // Utility functions for device and layout information
 int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
+int32_t aoti_torch_dtype_float16();
 int32_t aoti_torch_dtype_float32();
 int32_t aoti_torch_dtype_bfloat16();
 int32_t aoti_torch_dtype_int8();

backends/aoti/utils.h

Lines changed: 2 additions & 0 deletions
@@ -43,6 +43,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
       return executorch::aten::ScalarType::Int;
     case 4: // PyTorch's int64 dtype code
       return executorch::aten::ScalarType::Long;
+    case 5: // PyTorch's float16 (half) dtype code
+      return executorch::aten::ScalarType::Half;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 11: // PyTorch's bool dtype code
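
A tiny hedged check of the mapping this switch now covers (namespace qualifiers on dtype_to_scalar_type elided, as in the header itself):

#include "backends/aoti/utils.h"

// Hedged sketch: PyTorch dtype code 5 should now map to ExecuTorch's Half.
inline bool float16_maps_to_half() {
  return dtype_to_scalar_type(5) == executorch::aten::ScalarType::Half;
}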

backends/cuda/cuda_backend.py

Lines changed: 1 addition & 5 deletions
@@ -162,11 +162,7 @@ def preprocess(
             "max_autotune_conv_backends": "TRITON",
         }
 
-        with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
-            [
-                SDPBackend.MATH  # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
-            ]
-        ), torch.no_grad():
+        with collect_unsupported_fallback_kernels(), torch.no_grad():
             # torch._logging.set_logs(post_grad_graphs=True)
             # Here we should expect 1 so file and 1 weight blob in the same directory.
             paths = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]

backends/cuda/runtime/shims/memory.cpp

Lines changed: 90 additions & 0 deletions
@@ -582,6 +582,96 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) {
   return Error::Ok;
 }
 
+AOTITorchError aoti_torch_new_tensor_handle(
+    Tensor* orig_handle,
+    Tensor** new_handle) {
+  // Validate input parameters
+  ET_CHECK_OR_RETURN_ERROR(
+      orig_handle != nullptr,
+      InvalidArgument,
+      "aoti_torch_new_tensor_handle failed: orig_handle is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      new_handle != nullptr,
+      InvalidArgument,
+      "aoti_torch_new_tensor_handle failed: new_handle is null");
+
+  // Get metadata from the original tensor
+  int64_t* sizes_ptr;
+  int64_t* strides_ptr;
+  int32_t dtype;
+  int32_t device_type;
+  int32_t device_index;
+
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_sizes(orig_handle, &sizes_ptr));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_strides(orig_handle, &strides_ptr));
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(orig_handle, &dtype));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_device_type(orig_handle, &device_type));
+  ET_CHECK_OK_OR_RETURN_ERROR(
+      aoti_torch_get_device_index(orig_handle, &device_index));
+
+  int64_t ndim = orig_handle->dim();
+
+  // Validate dtype
+  ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype));
+
+  // Ensure device_index is always 0
+  ET_CHECK_OR_RETURN_ERROR(
+      device_index == 0,
+      InvalidArgument,
+      "device_index must be 0, got: %d",
+      device_index);
+
+  // Get the original data pointer from the source tensor
+  void* data_ptr = orig_handle->mutable_data_ptr();
+  ET_CHECK_OR_RETURN_ERROR(
+      data_ptr != nullptr,
+      InvalidArgument,
+      "Source tensor has null data pointer");
+
+  // Check if the given memory is in the map
+  auto memory_it = memory_to_n_tensor.find(data_ptr);
+  ET_CHECK_OR_RETURN_ERROR(
+      memory_it != memory_to_n_tensor.end(),
+      InvalidArgument,
+      "Memory address %p is not being tracked by reference counting system",
+      data_ptr);
+
+  // Convert sizes and strides to vectors
+  std::vector<SizesType> sizes = convert_sizes_to_vector(ndim, sizes_ptr);
+  std::vector<StridesType> strides =
+      convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
+
+  // Create new tensor that shares the same memory as the original
+  // This is similar to PyTorch's Tensor copy constructor - creates a new
+  // tensor object that shares the same underlying storage
+  std::shared_ptr<Tensor> tensor = make_tensor(
+      sizes, // Same sizes as original
+      data_ptr, // Share the same memory from source tensor
+      {}, // dim_order (empty, will be auto-generated)
+      strides, // Same strides as original
+      dtype_to_scalar_type(dtype) // Same dtype as original
+  );
+
+  ET_CHECK_OR_RETURN_ERROR(
+      tensor != nullptr, InvalidArgument, "Failed to create new tensor handle");
+
+  // Store the tensor so it doesn't get destroyed
+  tensors.insert(tensor);
+
+  *new_handle = tensor.get();
+
+  // Increment the reference count for this memory address only if it is owned
+  // by tensor
+  memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN
+      ? NOT_OWN
+      : memory_to_n_tensor[data_ptr] + 1;
+
+  return Error::Ok;
+}
+
 AOTITorchError aoti_torch__reinterpret_tensor(
     Tensor* self,
     int64_t ndim,
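
The reference-count update at the end of the new function is the subtle part. The following standalone analogue (illustrative only; NOT_OWN is recreated here as an arbitrary sentinel value, not the one defined in memory.cpp) shows the intended semantics: borrowed memory stays at NOT_OWN and is never freed by the shim, while owned memory records one more tensor aliasing it.

#include <cstdint>
#include <unordered_map>

// Illustrative analogue of the map update in aoti_torch_new_tensor_handle;
// the real map lives in memory.cpp and tracks Tensor-owned allocations.
constexpr int64_t NOT_OWN = -1; // assumed sentinel for borrowed memory

void on_new_handle(
    std::unordered_map<void*, int64_t>& memory_to_n_tensor,
    void* data_ptr) {
  // Borrowed memory keeps its NOT_OWN marker; owned memory gains one
  // more referencing tensor, so it is freed only when the count returns to 0.
  memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN
      ? NOT_OWN
      : memory_to_n_tensor[data_ptr] + 1;
}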

backends/cuda/runtime/shims/memory.h

Lines changed: 25 additions & 0 deletions
@@ -114,6 +114,31 @@ AOTITorchError aoti_torch__reinterpret_tensor(
     int64_t storage_offset,
     Tensor** ret_new_tensor);
 
+/**
+ * Creates a new tensor handle from an existing one.
+ *
+ * This function creates a new tensor object that shares the same underlying
+ * memory as the original tensor. Similar to PyTorch's Tensor copy constructor,
+ * it creates a new handle/reference to the same data without performing a deep
+ * copy.
+ *
+ * The new tensor will:
+ * - Share the same memory/storage as the original tensor
+ * - Have the same shape, strides, and dtype as the original
+ * - Increment the reference count for the underlying memory (if owned)
+ *
+ * @param orig_handle Original tensor to create a new handle from (must not be
+ * null)
+ * @param new_handle Output pointer to store the new tensor handle (must not be
+ * null)
+ *
+ * @return Error::Ok on success, appropriate error code on failure:
+ * - Error::InvalidArgument: null pointers or invalid parameters
+ */
+AOTITorchError aoti_torch_new_tensor_handle(
+    Tensor* orig_handle,
+    Tensor** new_handle);
+
 /**
  * Copies data from source tensor to destination tensor.
  *
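
A hedged usage sketch of the API documented above; `src` is a hypothetical, already-constructed tensor handle, and error handling is left to the caller:

#include "backends/cuda/runtime/shims/memory.h"

// Hedged sketch: create a second handle that aliases `src`'s storage.
// On success, *alias_out shares memory, shape, strides, and dtype with `src`,
// and the memory's reference count is bumped when the memory is owned.
AOTITorchError make_alias(Tensor* src, Tensor** alias_out) {
  return aoti_torch_new_tensor_handle(src, alias_out);
}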

backends/cuda/runtime/utils.h

Lines changed: 4 additions & 1 deletion
@@ -61,6 +61,7 @@ enum class SupportedDTypes : int32_t {
   INT16 = 2, // PyTorch's int16 dtype code
   INT32 = 3, // PyTorch's int32 dtype code
   INT64 = 4, // PyTorch's int64 dtype code
+  FLOAT16 = 5, // PyTorch's float16 dtype code
   FLOAT32 = 6, // PyTorch's float32 dtype code
   BOOL = 11, // PyTorch's bool dtype code
   BFLOAT16 = 15, // PyTorch's bfloat16 dtype code
@@ -84,6 +85,7 @@ inline bool is_dtype_supported_in_et_cuda(int32_t dtype) {
     case static_cast<int32_t>(SupportedDTypes::INT16):
     case static_cast<int32_t>(SupportedDTypes::INT32):
     case static_cast<int32_t>(SupportedDTypes::INT64):
+    case static_cast<int32_t>(SupportedDTypes::FLOAT16):
     case static_cast<int32_t>(SupportedDTypes::FLOAT32):
     case static_cast<int32_t>(SupportedDTypes::BOOL):
     case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
@@ -98,12 +100,13 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
   ET_CHECK_OR_RETURN_ERROR(
       is_dtype_supported_in_et_cuda(dtype),
       InvalidArgument,
-      "Unsupported dtype: %d. Supported dtypes: %d (int8), %d (int16), %d (int32), %d (int64), %d (float32), %d (bool), %d (bfloat16)",
+      "Unsupported dtype: %d. Supported dtypes: %d (int8), %d (int16), %d (int32), %d (int64), %d (float16), %d (float32), %d (bool), %d (bfloat16)",
       dtype,
       static_cast<int32_t>(SupportedDTypes::INT8),
       static_cast<int32_t>(SupportedDTypes::INT16),
       static_cast<int32_t>(SupportedDTypes::INT32),
       static_cast<int32_t>(SupportedDTypes::INT64),
+      static_cast<int32_t>(SupportedDTypes::FLOAT16),
       static_cast<int32_t>(SupportedDTypes::FLOAT32),
       static_cast<int32_t>(SupportedDTypes::BOOL),
       static_cast<int32_t>(SupportedDTypes::BFLOAT16));
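
Finally, a hedged check that the CUDA-side validation helpers now accept half precision (namespaces elided; the Error::Ok comparison assumes AOTITorchError aliases the ExecuTorch Error enum, as elsewhere in these shims):

#include "backends/cuda/runtime/utils.h"

// Hedged sketch: dtype code 5 (float16) now passes both helpers.
inline bool float16_now_supported() {
  return is_dtype_supported_in_et_cuda(
             static_cast<int32_t>(SupportedDTypes::FLOAT16)) &&
      validate_dtype(5) == Error::Ok;
}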
