microsoft
diff --git a/‎operators/cuda/cuda_ops.cc‎
Lines changed: 8 additions & 0 deletions b/‎operators/cuda/cuda_ops.cc‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎operators/cuda/scatter_nd_of_shape.h‎
Lines changed: 143 additions & 0 deletions b/‎operators/cuda/scatter_nd_of_shape.h‎
Lines changed: 143 additions & 0 deletions
@@ -7,7 +7,11 @@
 #include "cuda/add_mul.h"
 #include "cuda/fast_gelu.h"
 #include "cuda/negxplus1.h"
 #include "cuda/rotary.h"
+#include "cuda/scatter_nd_of_shape.h"
 #include "cuda/transpose_cast.h"
 #endif
 
@@ -30,17 +34,21 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& {
       ,
       CustomCudaStructV2("AddSharedInput", AddSharedInputFloat32Type),
       CustomCudaStructV2("FastGelu", contrib::FastGelu<float>),
+      CustomCudaStructV2("MaskedScatterNDOfShape", contrib::MaskedScatterNDOfShape<float>),
       CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type),
       CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1<float>),
       CustomCudaStructV2("Rotary", contrib::Rotary<float>),
+      CustomCudaStructV2("ScatterNDOfShape", contrib::ScatterNDOfShape<float>),
 #if ORT_API_VERSION >= 16
 
       CustomCudaStructV2("AddSharedInput", AddSharedInputFloat16Type),
       CustomCudaStructV2("FastGelu", contrib::FastGelu<ortc::MFloat16>),
       CustomCudaStructV2("FastGelu", contrib::FastGelu<ortc::BFloat16>),
+      CustomCudaStructV2("MaskedScatterNDOfShape", contrib::MaskedScatterNDOfShape<ortc::MFloat16>),
       CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type),
       CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1<ortc::MFloat16>),
       CustomCudaStructV2("Rotary", contrib::Rotary<ortc::MFloat16>),
+      CustomCudaStructV2("ScatterNDOfShape", contrib::ScatterNDOfShape<ortc::MFloat16>),
       CustomCudaStructV2("Transpose2DCastFP16", Transpose2DCastFloat32ToFloat16Type),
       CustomCudaStructV2("Transpose2DCastFP32", Transpose2DCastFloat16ToFloat32Type)
 #endif
 
@@ -0,0 +1,143 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "ocos.h"
+#include "string_utils.h"
+#include "scatter_nd_of_shape_impl.cuh"
+
+namespace contrib {
+
+// Custom CUDA operator "ScatterNDOfShape".
+//
+// Allocates an output tensor whose dimensions are read from the first input
+// and hands indices/updates to LaunchScatterNDOfShapeKernel together with the
+// reduction parsed from the "reduction" attribute. The scatter arithmetic
+// itself lives in scatter_nd_of_shape_impl.cuh and is not visible from here.
+template <typename T>
+struct ScatterNDOfShape {
+  // Reads the "reduction" string attribute and maps it to a ScatterReduction.
+  //
+  // Returns the status produced by GetOpAttribute when the attribute cannot be
+  // read and nullptr on success. Any value other than "add", "mul", "min" or
+  // "max" is reported through ORTX_CXX_API_THROW.
+  OrtStatusPtr OnModelAttach(const OrtApi& api, const OrtKernelInfo& info) {
+    std::string value;
+    OrtStatusPtr status = OrtW::GetOpAttribute(info, "reduction", value);
+    if (status != nullptr)
+      return status;
+
+    if (value == "add")
+      reduction_ = ScatterReduction::Add;
+    else if (value == "mul")
+      reduction_ = ScatterReduction::Mul;
+    else if (value == "min")
+      reduction_ = ScatterReduction::Min;
+    else if (value == "max")
+      reduction_ = ScatterReduction::Max;
+    else
+      // NOTE(review): the text claims only Add is implemented although four
+      // values are accepted above - confirm against the kernel and reword.
+      ORTX_CXX_API_THROW("Unexpected reduction, only Add is implemented.", ORT_RUNTIME_EXCEPTION);
+
+    return nullptr;
+  }
+
+  // Validates the inputs, allocates the output and launches the kernel.
+  //
+  //   ctx          - supplies the CUDA stream the kernel is enqueued on.
+  //   output_shape - non-empty 1D tensor holding the output dimensions; read
+  //                  on the host (see GetInputMemoryType).
+  //   indices      - tensor of rank >= 1 whose last dimension must be 1.
+  //   updates      - values forwarded to the kernel.
+  //   output       - allocated here with the dimensions from output_shape.
+  //
+  // Returns nullptr; validation failures go through ORTX_CXX_API_THROW.
+  OrtStatusPtr Compute(Ort::Custom::CUDAKernelContext* ctx,
+                       const ortc::Tensor<int64_t>& output_shape,
+                       const ortc::Tensor<int64_t>& indices,
+                       const ortc::Tensor<T>& updates,
+                       ortc::Tensor<T>& output) const {
+    auto& output_shape_shape = output_shape.Shape();
+    auto& indices_shape = indices.Shape();
+
+    if (output_shape_shape.size() != 1 || output_shape_shape[0] == 0) {
+      ORTX_CXX_API_THROW("output shape must be a 1D tensor", ORT_RUNTIME_EXCEPTION);
+    }
+    // A rank-0 indices tensor has no last dimension: size() - 1 would wrap
+    // around and index out of bounds, so reject it together with the != 1 case.
+    if (indices_shape.size() == 0 || indices_shape[indices_shape.size() - 1] != 1) {
+      ORTX_CXX_API_THROW("last dimension of the indices tensor should be one", ORT_RUNTIME_EXCEPTION);
+    }
+
+    const int64_t* shape_data = output_shape.Data();  // CPU pointer
+    const int64_t* indices_data = indices.Data();  // GPU pointer
+    const T* updates_data = updates.Data();  // GPU pointer
+    // Copy the requested dimensions so they can size the allocation and be
+    // passed to the launcher.
+    std::vector<int64_t> voutput_shape(shape_data, shape_data + output_shape_shape[0]);
+    T* output_data = output.Allocate(voutput_shape);  // GPU pointer
+    LaunchScatterNDOfShapeKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
+                                    voutput_shape,
+                                    indices_shape,
+                                    indices_data,
+                                    updates_data,
+                                    output_data,
+                                    reduction_);
+    return nullptr;
+  }
+
+  // Input 0 (the shape) is requested as a CPU input because Compute reads it
+  // on the host; every other input uses the default memory type.
+  static OrtMemType GetInputMemoryType(size_t input_index) {
+    if (input_index == 0)  // shape
+      return OrtMemType::OrtMemTypeCPUInput;
+    return OrtMemType::OrtMemTypeDefault;
+  }
+
+  // Reduction selected by the "reduction" attribute; set in OnModelAttach.
+  ScatterReduction reduction_;
+};
+
+
+// Custom CUDA operator "MaskedScatterNDOfShape".
+//
+// Same host-side flow as a plain scatter-of-shape operator: the output is
+// allocated from the dimensions in the first input and the work is delegated
+// to LaunchMaskedScatterNDOfShapeKernel, which additionally receives the
+// integer read from the "maskedValue" attribute. How the kernel uses that
+// value is defined in scatter_nd_of_shape_impl.cuh and is not visible here.
+template <typename T>
+struct MaskedScatterNDOfShape {
+  // Reads the "reduction" and "maskedValue" attributes.
+  //
+  // Returns the status produced by GetOpAttribute when either attribute cannot
+  // be read and nullptr on success. Any reduction other than "add", "mul",
+  // "min" or "max" is reported through ORTX_CXX_API_THROW.
+  OrtStatusPtr OnModelAttach(const OrtApi& api, const OrtKernelInfo& info) {
+    std::string value;
+    OrtStatusPtr status = OrtW::GetOpAttribute(info, "reduction", value);
+    if (status != nullptr)
+      return status;
+
+    if (value == "add")
+      reduction_ = ScatterReduction::Add;
+    else if (value == "mul")
+      reduction_ = ScatterReduction::Mul;
+    else if (value == "min")
+      reduction_ = ScatterReduction::Min;
+    else if (value == "max")
+      reduction_ = ScatterReduction::Max;
+    else
+      // NOTE(review): the text claims only Add is implemented although four
+      // values are accepted above - confirm against the kernel and reword.
+      ORTX_CXX_API_THROW("Unexpected reduction, only Add is implemented.", ORT_RUNTIME_EXCEPTION);
+
+    // The status of the last attribute read is the overall result: nullptr on
+    // success, an error status otherwise.
+    return OrtW::GetOpAttribute(info, "maskedValue", masked_value_);
+  }
+
+  // Validates the inputs, allocates the output and launches the kernel.
+  //
+  //   ctx          - supplies the CUDA stream the kernel is enqueued on.
+  //   output_shape - non-empty 1D tensor holding the output dimensions; read
+  //                  on the host (see GetInputMemoryType).
+  //   indices      - tensor of rank >= 1 whose last dimension must be 1.
+  //   updates      - values forwarded to the kernel.
+  //   output       - allocated here with the dimensions from output_shape.
+  //
+  // Returns nullptr; validation failures go through ORTX_CXX_API_THROW.
+  OrtStatusPtr Compute(Ort::Custom::CUDAKernelContext* ctx,
+                       const ortc::Tensor<int64_t>& output_shape,
+                       const ortc::Tensor<int64_t>& indices,
+                       const ortc::Tensor<T>& updates,
+                       ortc::Tensor<T>& output) const {
+    auto& output_shape_shape = output_shape.Shape();
+    auto& indices_shape = indices.Shape();
+
+    if (output_shape_shape.size() != 1 || output_shape_shape[0] == 0) {
+      ORTX_CXX_API_THROW("output shape must be a 1D tensor", ORT_RUNTIME_EXCEPTION);
+    }
+    // A rank-0 indices tensor has no last dimension: size() - 1 would wrap
+    // around and index out of bounds, so reject it together with the != 1 case.
+    if (indices_shape.size() == 0 || indices_shape[indices_shape.size() - 1] != 1) {
+      ORTX_CXX_API_THROW("last dimension of the indices tensor should be one", ORT_RUNTIME_EXCEPTION);
+    }
+
+    const int64_t* shape_data = output_shape.Data();  // CPU pointer
+    const int64_t* indices_data = indices.Data();  // GPU pointer
+    const T* updates_data = updates.Data();  // GPU pointer
+    // Copy the requested dimensions so they can size the allocation and be
+    // passed to the launcher.
+    std::vector<int64_t> voutput_shape(shape_data, shape_data + output_shape_shape[0]);
+    T* output_data = output.Allocate(voutput_shape);  // GPU pointer
+    LaunchMaskedScatterNDOfShapeKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
+                                          voutput_shape,
+                                          indices_shape,
+                                          indices_data,
+                                          updates_data,
+                                          output_data,
+                                          reduction_,
+                                          masked_value_);
+    return nullptr;
+  }
+
+  // Input 0 (the shape) is requested as a CPU input because Compute reads it
+  // on the host; every other input uses the default memory type.
+  static OrtMemType GetInputMemoryType(size_t input_index) {
+    if (input_index == 0)  // shape
+      return OrtMemType::OrtMemTypeCPUInput;
+    return OrtMemType::OrtMemTypeDefault;
+  }
+
+  private:
+  // Reduction selected by the "reduction" attribute; set in OnModelAttach.
+  ScatterReduction reduction_;
+  // Value of the "maskedValue" attribute; passed unchanged to the kernel.
+  int64_t masked_value_;
+};
+
+}  // namespace contrib