[INTEL_HPU] Added support for in-place operation in the index_copy op (#1496)

zongwave · web-flow · commit 8b1b87b643e8 · 2024-12-04T09:31:23.000+08:00
diff --git a/backends/intel_hpu/custom_ops/src/index_copy.cc b/backends/intel_hpu/custom_ops/src/index_copy.cc
@@ -29,23 +29,35 @@ class IndexCopy : public HpuOperator {
     auto inputs = ct.GetTensors();
     auto outputs = ct.GetTensors(false);
 
+    synSectionHandle section = createSection();
+
     std::vector<synTensor> syn_inputs;
-    for (size_t i = 0; i < inputs.size(); i++) {
-      syn_inputs.push_back(createTensor(inputs[i].dims.size(),
-                                        inputs[i].type,
-                                        inputs[i].dims,
-                                        true,
-                                        inputs[i].name));
-    }
+    syn_inputs.push_back(createTensor(inputs[0].dims.size(),
+                                      inputs[0].type,
+                                      inputs[0].dims,
+                                      true,
+                                      inputs[0].name,
+                                      section));
+
+    syn_inputs.push_back(createTensor(inputs[1].dims.size(),
+                                      inputs[1].type,
+                                      inputs[1].dims,
+                                      true,
+                                      inputs[1].name));
+
+    syn_inputs.push_back(createTensor(inputs[2].dims.size(),
+                                      inputs[2].type,
+                                      inputs[2].dims,
+                                      true,
+                                      inputs[2].name));
 
     std::vector<synTensor> syn_outputs;
-    for (size_t i = 0; i < outputs.size(); i++) {
-      syn_outputs.push_back(createTensor(outputs[i].dims.size(),
-                                         outputs[i].type,
-                                         outputs[i].dims,
-                                         true,
-                                         outputs[i].name));
-    }
+    syn_outputs.push_back(createTensor(outputs[0].dims.size(),
+                                       outputs[0].type,
+                                       outputs[0].dims,
+                                       true,
+                                       outputs[0].name,
+                                       section));
 
     std::string guid = guid_ + "_" + SynDataTypeToStr(outputs[0].type);
     synStatus status = synNodeCreate(graphHandle_,
@@ -73,19 +85,13 @@ void IndexCopyKernel(const Context& dev_ctx,
                      const phi::DenseTensor& input,
                      const phi::Scalar& dim,
                      const phi::DenseTensor& index,
-                     const phi::DenseTensor& source,
-                     phi::DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-  if (out->numel() == 0) {
-    return;
-  }
-
+                     const phi::DenseTensor& source) {
   ConvertTensors ct;
   ct.Add(input);
   ct.Add(index);
   ct.Add(source);
 
-  ct.Add(out, false);
+  ct.Add(input, false);
 
   std::vector<DIMS> inputs_dims = ct.GetDims();
   ns_IndexCopy::Params params{};
@@ -117,48 +123,39 @@ void CallIndexCopyKernel(const Context& dev_ctx,
                          const phi::DenseTensor& input,
                          const phi::Scalar& dim,
                          const phi::DenseTensor& index,
-                         const phi::DenseTensor& source,
-                         phi::DenseTensor* out) {
+                         const phi::DenseTensor& source) {
   if (input.dtype() == phi::DataType::FLOAT32) {
-    custom_kernel::IndexCopyKernel<float>(
-        dev_ctx, input, dim, index, source, out);
+    custom_kernel::IndexCopyKernel<float>(dev_ctx, input, dim, index, source);
   } else if (input.dtype() == phi::DataType::FLOAT16) {
     custom_kernel::IndexCopyKernel<phi::dtype::float16>(
-        dev_ctx, input, dim, index, source, out);
+        dev_ctx, input, dim, index, source);
   } else if (input.dtype() == phi::DataType::BFLOAT16) {
     custom_kernel::IndexCopyKernel<phi::dtype::bfloat16>(
-        dev_ctx, input, dim, index, source, out);
+        dev_ctx, input, dim, index, source);
   } else {
     throw std::runtime_error("Unsupported data type for IndexCopyKernel");
   }
 }
 
-std::vector<paddle::Tensor> IndexCopyForward(const paddle::Tensor& input,
-                                             const int dim,
-                                             const paddle::Tensor& index,
-                                             const paddle::Tensor& source) {
+void IndexCopyForward(const paddle::Tensor& input,
+                      const int dim,
+                      const paddle::Tensor& index,
+                      const paddle::Tensor& source) {
   auto dev_ctx = static_cast<const phi::CustomContext*>(
       paddle::experimental::DeviceContextPool::Instance().Get(input.place()));
 
   auto input_tensor = static_cast<phi::DenseTensor*>(input.impl().get());
   auto index_tensor = static_cast<const phi::DenseTensor*>(index.impl().get());
   auto source_tensor =
       static_cast<const phi::DenseTensor*>(source.impl().get());
-  auto out_tensor = std::make_shared<phi::DenseTensor>();
-  out_tensor->Resize(input_tensor->dims());
-
-  CallIndexCopyKernel(*dev_ctx,
-                      *input_tensor,
-                      phi::Scalar(dim),
-                      *index_tensor,
-                      *source_tensor,
-                      out_tensor.get());
 
-  return {paddle::Tensor(out_tensor)};
+  CallIndexCopyKernel(
+      *dev_ctx, *input_tensor, phi::Scalar(dim), *index_tensor, *source_tensor);
 }
 
 PD_BUILD_OP(index_copy)
     .Inputs({"input", "index", "source"})
     .Outputs({"out"})
     .Attrs({"dim: int"})
+    .SetInplaceMap({{"input", "out"}})
     .SetKernelFn(PD_KERNEL(IndexCopyForward));
diff --git a/backends/intel_hpu/custom_ops/tests/test_index_copy.py b/backends/intel_hpu/custom_ops/tests/test_index_copy.py
@@ -30,9 +30,9 @@ def index_copy_torch(input, dim, index, source, dtype):
         "int32": torch.int32,
     }
     torch_dtype = dtype_map[dtype]
-    input_tensor = torch.tensor(input, dtype=torch_dtype)
-    index_tensor = torch.tensor(index, dtype=torch.int64)
-    source_tensor = torch.tensor(source, dtype=torch_dtype)
+    input_tensor = torch.tensor(input).clone().detach().to(dtype=torch_dtype)
+    index_tensor = torch.tensor(index).clone().detach().to(dtype=torch.int64)
+    source_tensor = torch.tensor(source).clone().detach().to(dtype=torch_dtype)
     output = torch.index_copy(
         input=input_tensor, dim=dim, index=index_tensor, source=source_tensor
     )
@@ -72,13 +72,13 @@ def check_result(self, torch_res, ops_res):
         np.testing.assert_allclose(torch_res, ops_res, rtol=rtol, atol=atol)
 
     def index_copy_custom(self, input, dim, index, source):
-        input_tensor = paddle.to_tensor(input, dtype=self.dtype)
-        index_tensor = paddle.to_tensor(index, dtype="int64")
-        source_tensor = paddle.to_tensor(source, dtype=self.dtype)
-        out = paddlenlp_ops.index_copy(
+        input_tensor = paddle.to_tensor(input, dtype=self.dtype).clone()
+        index_tensor = paddle.to_tensor(index, dtype="int64").clone()
+        source_tensor = paddle.to_tensor(source, dtype=self.dtype).clone()
+        paddlenlp_ops.index_copy(
             input=input_tensor, dim=dim, index=index_tensor, source=source_tensor
         )
-        return out
+        return input_tensor
 
     def prepare_input(
         self, batch_size=16, num_heads=32, seq_length=256, head_dim=64, dim=0, index=0
@@ -118,26 +118,26 @@ def test_index_copy_dim0_index0(self):
         input, index, source, dim = self.prepare_input(dim=0, index=0)
         custom_res = self.index_copy_custom(input, dim, index, source)
         torch_res = index_copy_torch(input, dim, index, source, dtype=self.dtype)
-        self.check_result(torch_res.numpy(), custom_res.numpy())
+        self.check_result(torch_res.numpy(), custom_res)
 
     def test_index_copy_dim0_index1(self):
         input, index, source, dim = self.prepare_input(dim=0, index=1)
         custom_res = self.index_copy_custom(input, dim, index, source)
         torch_res = index_copy_torch(input, dim, index, source, dtype=self.dtype)
-        self.check_result(torch_res.numpy(), custom_res.numpy())
+        self.check_result(torch_res.numpy(), custom_res)
 
     def test_index_copy_dim0_index_max(self):
         index = max(self.num_heads - 1, 0)
         input, index, source, dim = self.prepare_input(dim=0, index=index)
         custom_res = self.index_copy_custom(input, dim, index, source)
         torch_res = index_copy_torch(input, dim, index, source, dtype=self.dtype)
-        self.check_result(torch_res.numpy(), custom_res.numpy())
+        self.check_result(torch_res.numpy(), custom_res)
 
     def test_index_copy_dim1_index0(self):
         input, index, source, dim = self.prepare_input(dim=1, index=0)
         custom_res = self.index_copy_custom(input, dim, index, source)
         torch_res = index_copy_torch(input, dim, index, source, dtype=self.dtype)
-        self.check_result(torch_res.numpy(), custom_res.numpy())
+        self.check_result(torch_res.numpy(), custom_res)
 
     def test_index_copy_dim1_index1(self):
         input, index, source, dim = self.prepare_input(dim=1, index=1)
diff --git a/backends/intel_hpu/tests/test_kvcache.py b/backends/intel_hpu/tests/test_kvcache.py
@@ -13,16 +13,20 @@
 # limitations under the License.
 
 import paddle
+import paddlenlp_ops
 
 paddle.set_device("intel_hpu")
 # paddle.set_device("cpu")
 
 
 class KVCache(paddle.nn.Layer):
-    def __init__(self):
+    def __init__(self, cache=None, inp_seq_len=-1):
         super(KVCache, self).__init__()
-        self.cache = None
-        self.inp_seq_len = -1
+        print(
+            f"`Paddle KVCache init` cache: {cache.shape if cache is not None else 'None'}, inp_seq_len: {inp_seq_len}"
+        )
+        self.cache = cache
+        self.inp_seq_len = inp_seq_len
 
     def allocate(self, inp_seq_len, dtype, shape):
         if self.cache is None or self.cache.shape != shape:
@@ -49,20 +53,7 @@ def update(prev, cur, dim, idx, inp_seq_len):
             return orig_cur
         if idx is not None:
             # prev.index_copy_(dim, idx - 1, cur)
-            if dim == 0:
-                prev.scatter_(idx, cur)
-            else:
-                times, temp_shape, temp_index = (
-                    paddle.prod(paddle.to_tensor(prev.shape[:dim])),
-                    prev.shape,
-                    idx,
-                )
-                prev, new_t = prev.reshape([-1] + temp_shape[dim + 1 :]), cur.reshape(
-                    [-1] + temp_shape[dim + 1 :]
-                )
-                for i in range(1, times):
-                    temp_index = paddle.concat([temp_index, idx + temp_shape[dim] * i])
-                prev.scatter_(temp_index, new_t).reshape_(temp_shape)
+            paddlenlp_ops.index_copy(input=prev, dim=dim, index=idx - 1, source=cur)
             return prev
         else:
             return paddle.concat((prev, cur), dim=dim)
@@ -77,35 +68,34 @@ def forward(self, cur, dim, idx):
 
 
 batch_size = 1
-num_key_value_heads = 32
-max_seq_len = 1024
-head_dim = 128
+num_key_value_heads = 2
+max_seq_len = 16
+head_dim = 4
 
 # paddle case
 cache_shape = (batch_size, num_key_value_heads, max_seq_len, head_dim)
 dtype = "float32"
 
-inp_seq_len = 128
+inp_seq_len = 2
 
-k_cache = KVCache()
-k_cache.allocate(inp_seq_len, dtype, cache_shape)
+static_cache = paddle.zeros(cache_shape, dtype=dtype)
+k_cache = KVCache(static_cache, inp_seq_len)
+# k_cache = KVCache()
+# k_cache.allocate(inp_seq_len, dtype, cache_shape)
 
-key_states = paddle.rand(
-    (batch_size, num_key_value_heads, inp_seq_len, head_dim), dtype=dtype
+key_states = paddle.full(
+    (batch_size, num_key_value_heads, inp_seq_len, head_dim), -1, dtype=dtype
 )
-
 token_idx = paddle.to_tensor([0], dtype="int64")
-prefill = k_cache(key_states, 2, token_idx)
+prefill = k_cache(cur=key_states, dim=2, idx=token_idx)
+print(f"Paddle KVCache prefill:{prefill}")
 
-print((prefill == k_cache.cache[:, :, :inp_seq_len, :]).all())
+for i in range(inp_seq_len + 1, max_seq_len + 1):
+    token_idx = paddle.to_tensor([i], dtype="int64")
+    key_state = paddle.ones((batch_size, num_key_value_heads, 1, head_dim), dtype=dtype)
+    decode = k_cache(cur=key_state, dim=2, idx=token_idx)
+    print(f"Paddle KVCache decode:{decode}")
 
-inp_seq_len = 1
-token_idx = paddle.to_tensor([128], dtype="int64")
-key_state = paddle.ones(
-    (batch_size, num_key_value_heads, inp_seq_len, head_dim), dtype=dtype
-)
-decode = k_cache(key_state, 2, token_idx)
-print((key_state == decode[:, :, token_idx, :]).all())
 
 if 0:
     # torch case