[INTEL_HPU] add one hot op kernel (#1609)

fmiao2372 · web-flow · commit 7dddf3eae30a · 2025-03-20T09:49:41.000+08:00
diff --git a/backends/intel_hpu/kernels/one_hot_kernel.cc b/backends/intel_hpu/kernels/one_hot_kernel.cc
@@ -0,0 +1,165 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "habanalabs/perf_lib_layer_params.h"
+#include "habanalabs/synapse_api.h"
+#include "habanalabs/synapse_common_types.h"
+#include "kernels/funcs.h"
+#include "kernels/hpu_operator.h"
+#include "utils/utils.h"
+
+namespace custom_kernel {
+
+/// Builds the Synapse graph for the one-hot op on Intel HPU.
+///
+/// The graph contains one main node created with `guid_` and `pName_`. When
+/// the first input is not int32, every input is first routed through a
+/// `cast_i64_to_i32` node and the cast result feeds the main node instead.
+class OneHotOperator : public HpuOperator {
+ public:
+  /// @param guid_prefix forwarded to the HpuOperator constructor.
+  /// @param node_name   name given to the main node in the graph.
+  OneHotOperator(std::string guid_prefix, std::string node_name)
+      : HpuOperator(guid_prefix), pName_(node_name) {}
+
+  /// Adds the optional cast node(s) and the main node to the graph.
+  ///
+  /// @param ct     tensors collected by the kernel; `GetTensors()` yields the
+  ///               inputs and `GetTensors(false)` the outputs.
+  /// @param params parameters handed unchanged to synNodeCreate for the main
+  ///               node.
+  void AddNode(ConvertTensors& ct, ns_OneHotKernel::Params params) {
+    auto inputs = ct.GetTensors();
+    auto outputs = ct.GetTensors(false);
+    PD_CHECK(!inputs.empty(), "[RUNTIME] one_hot requires at least one input");
+
+    // The type of the first input decides for all inputs whether a cast is
+    // inserted.
+    const bool needs_cast = inputs[0].type != syn_type_int32;
+
+    std::vector<synTensor> syn_inputs;
+    for (size_t i = 0; i < inputs.size(); i++) {
+      synTensor source = createTensor(inputs[i].dims.size(),
+                                      inputs[i].type,
+                                      inputs[i].dims,
+                                      true,
+                                      inputs[i].name);
+      if (!needs_cast) {
+        syn_inputs.push_back(source);
+        continue;
+      }
+
+      // Intermediate (non-persistent) int32 copy of the input.
+      // NOTE(review): the tensor and node names below are fixed, so this
+      // presumably relies on there being a single input - confirm.
+      synTensor x_cast = createTensor(inputs[i].dims.size(),
+                                      syn_type_int32,
+                                      inputs[i].dims,
+                                      false,
+                                      "x_cast");
+
+      std::string guid_cast = "cast_i64_to_i32";
+      synStatus status = synNodeCreate(graphHandle_,
+                                       &source,
+                                       &x_cast,
+                                       1,
+                                       1,
+                                       nullptr,
+                                       0,
+                                       guid_cast.c_str(),
+                                       "cast_x",
+                                       nullptr,
+                                       nullptr);
+      PD_CHECK(status == synSuccess,
+               "[RUNTIME] synNodeCreate cast_x failed = ",
+               status);
+      syn_inputs.push_back(x_cast);
+    }
+
+    std::vector<synTensor> syn_outputs;
+    for (size_t i = 0; i < outputs.size(); i++) {
+      syn_outputs.push_back(createTensor(outputs[i].dims.size(),
+                                         outputs[i].type,
+                                         outputs[i].dims,
+                                         true,
+                                         outputs[i].name));
+    }
+
+    synStatus status = synNodeCreate(graphHandle_,
+                                     syn_inputs.data(),
+                                     syn_outputs.data(),
+                                     syn_inputs.size(),
+                                     syn_outputs.size(),
+                                     &params,
+                                     sizeof(params),
+                                     guid_.c_str(),
+                                     pName_.c_str(),
+                                     nullptr,
+                                     nullptr);
+    // Same message form as the cast check above: plain prefix, then the
+    // status as a separate argument (no printf-style "%d" placeholder).
+    PD_CHECK(status == synSuccess,
+             "[RUNTIME] synNodeCreate () failed = ",
+             status);
+  }
+
+  /// Name of the main node, passed to synNodeCreate.
+  std::string pName_;
+};
+
+/// Computes the one-hot encoding of `x` on Intel HPU.
+///
+/// `out` keeps the dims it already carries, except that the last dim is
+/// replaced by `num_classes`. The output buffer is always allocated as float.
+///
+/// @param dev_ctx            device context used for allocation and its stream.
+/// @param x                  input tensor holding the indices.
+/// @param num_classes        size written into the last output dim.
+/// @param dtype              not read by this implementation.
+/// @param allow_out_of_range not read by this implementation.
+/// @param out                output tensor; resized and allocated here.
+template <typename T, typename Context>
+void OneHotRawKernel(const Context& dev_ctx,
+                     const phi::DenseTensor& x,
+                     const phi::Scalar& num_classes,
+                     phi::DataType dtype,
+                     bool allow_out_of_range,
+                     phi::DenseTensor* out) {
+  ConvertTensors ct;
+  ct.Add(x);
+
+  // Settle the final output shape before allocating: the last dim is only
+  // known once num_classes has been read.
+  int depth = num_classes.to<int>();
+  auto out_dims = out->dims();
+  out_dims[out_dims.size() - 1] = depth;
+  out->Resize(out_dims);
+
+  // Allocate after the resize so the device buffer matches the final dims.
+  dev_ctx.template Alloc<float>(out);
+  ct.Add(out, false);
+
+  std::vector<DIMS> inputs_dims = ct.GetDims();
+  // NOTE(review): fields are presumably {axis, depth, on_value, off_value} -
+  // confirm against perf_lib_layer_params.h.
+  ns_OneHotKernel::Params params{-1, depth, 1, 0};
+
+  // NOTE(review): the cache lookup is built from <float>, the input dims and
+  // the params only, while int32 and int64 inputs produce different graphs
+  // (the latter adds a cast node) - confirm the two cannot share a recipe.
+  OpCacheOperator op_info;
+  op_info.prepareOpInfo<float, ns_OneHotKernel::Params>(
+      "one_hot_fwd", {inputs_dims}, &params);
+
+  auto recipe = op_info.GetRecipe();
+  if (recipe == nullptr) {
+    // No cached recipe: build the graph, compile it and store it.
+    OneHotOperator op(op_info.guid_, "one_hot_op");
+    op.AddNode(ct, params);
+    op.Compile();
+    op_info.setOp(op);
+    recipe = op_info.GetRecipe();
+  }
+
+  RecipeRunner runner(recipe);
+  auto tensors = ct.GetDeviceAddr();
+  runner.Run(reinterpret_cast<C_Stream>(dev_ctx.stream()), tensors);
+}
+
+/// Entry point for `one_hot`: forwards to OneHotRawKernel with a FLOAT32
+/// output dtype and out-of-range indices disallowed.
+template <typename T, typename Context>
+void OneHotKernel(const Context& dev_ctx,
+                  const phi::DenseTensor& x,
+                  const phi::Scalar& num_classes_s,
+                  phi::DenseTensor* out) {
+  const phi::DataType out_dtype = phi::DataType::FLOAT32;
+  const bool allow_out_of_range = false;
+  custom_kernel::OneHotRawKernel<T, Context>(
+      dev_ctx, x, num_classes_s, out_dtype, allow_out_of_range, out);
+}
+
+}  // namespace custom_kernel
+
+// Register custom_kernel::OneHotRawKernel as the `one_hot_raw` kernel of the
+// intel_hpu backend (all layouts) for int32_t and int64_t.
+PD_REGISTER_PLUGIN_KERNEL(one_hot_raw,
+                          intel_hpu,
+                          ALL_LAYOUT,
+                          custom_kernel::OneHotRawKernel,
+                          int32_t,
+                          int64_t) {}
+
+// Register custom_kernel::OneHotKernel as the `one_hot` kernel of the
+// intel_hpu backend (all layouts) for int32_t and int64_t.
+PD_REGISTER_PLUGIN_KERNEL(one_hot,
+                          intel_hpu,
+                          ALL_LAYOUT,
+                          custom_kernel::OneHotKernel,
+                          int32_t,
+                          int64_t) {
+  // Output 0 is declared as FLOAT32; the kernel allocates it as float.
+  kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32);
+}
diff --git a/backends/intel_hpu/tests/unittests/test_one_hot.py b/backends/intel_hpu/tests/unittests/test_one_hot.py
@@ -0,0 +1,126 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from tests.op_test import OpTest
+import paddle
+import paddle.base.core as core
+
+# Switch Paddle to static-graph mode for the tests in this file.
+paddle.enable_static()
+
+import os
+
+# Device index passed (via int()) to paddle.CustomPlace in the tests below;
+# read from FLAGS_selected_intel_hpus, with 0 as the default when it is unset.
+intel_hpus_module_id = os.environ.get("FLAGS_selected_intel_hpus", 0)
+
+
+class TestOneHotOp(OpTest):
+    def set_hpu(self):
+        self.__class__.use_custom_device = True
+
+    def setUp(self):
+        self.set_hpu()
+        self.op_type = "one_hot_v2"
+        self.dtype = np.int32
+        depth = 10
+        depth_np = np.array(10).astype("int32")
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype("int32").reshape([sum(x_lod[0])])
+
+        out = np.zeros(shape=(np.prod(x.shape), depth)).astype(self.dtype)
+
+        for i in range(np.prod(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {"X": (x, x_lod), "depth_tensor": depth_np}
+        self.attrs = {"dtype": int(core.VarDesc.VarType.FP32)}
+        self.outputs = {"Out": (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            paddle.CustomPlace("intel_hpu", int(intel_hpus_module_id))
+        )
+
+    def test_check_grad(self):
+        pass
+
+
+class TestOneHotOpAttr(OpTest):
+    def set_hpu(self):
+        self.__class__.use_custom_device = True
+
+    def setUp(self):
+        self.set_hpu()
+        self.op_type = "one_hot_v2"
+        self.dtype = np.int32
+        depth = 10
+        depth_np = np.array(10).astype("int32")
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype("int32").reshape([sum(x_lod[0])])
+
+        out = np.zeros(shape=(np.prod(x.shape), depth)).astype(self.dtype)
+
+        for i in range(np.prod(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {"X": (x, x_lod)}
+        self.attrs = {"dtype": int(core.VarDesc.VarType.FP32), "depth": depth}
+        self.outputs = {"Out": (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            paddle.CustomPlace("intel_hpu", int(intel_hpus_module_id))
+        )
+
+    def test_check_grad(self):
+        pass
+
+
+class TestOneHotOpNoLod(OpTest):
+    def set_hpu(self):
+        self.__class__.use_custom_device = True
+
+    def setUp(self):
+        self.set_hpu()
+        self.op_type = "one_hot_v2"
+        self.dtype = np.int32
+        depth = 10
+        depth_np = np.array(10).astype("int32")
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
+        x = np.array(x).astype("int32").reshape([sum(x_lod[0])])
+
+        out = np.zeros(shape=(np.prod(x.shape), depth)).astype(self.dtype)
+
+        for i in range(np.prod(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {"X": x, "depth_tensor": depth_np}
+        self.attrs = {"dtype": int(core.VarDesc.VarType.FP32)}
+        self.outputs = {"Out": out}
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            paddle.CustomPlace("intel_hpu", int(intel_hpus_module_id))
+        )
+
+    def test_check_grad(self):
+        pass
+
+
+# Run the test cases when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()