-
Notifications
You must be signed in to change notification settings - Fork 60
issue/584 - 添加python的rope的测试, embedding的实现和测试 #585
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| #pragma once | ||
|
|
||
| #include "common/op.hpp" | ||
|
|
||
| namespace infinicore::op { | ||
|
|
||
| Tensor embedding(Tensor input, Tensor weight); | ||
| void embedding_(Tensor out, Tensor input, Tensor weight); | ||
| } // namespace infinicore::op |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| from infinicore.nn import ( | ||
| functional as functional, | ||
| ) | ||
| from infinicore.nn import functional | ||
|
|
||
| __all__ = ["functional"] |
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| from .embedding import embedding | ||
| from .rope import RopeAlgo, rope | ||
|
|
||
| __all__ = ["embedding", "rope", "RopeAlgo"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| from infinicore.lib import _infinicore | ||
| from infinicore.tensor import Tensor | ||
|
|
||
| __all__ = ["embedding"] | ||
|
|
||
|
|
||
| def embedding( | ||
| input: Tensor, | ||
| weight: Tensor, | ||
| padding_idx=None, | ||
| max_norm=None, | ||
| norm_type=2.0, | ||
| scale_grad_by_freq=False, | ||
| sparse=False, | ||
| *, | ||
| out=None, | ||
| ) -> Tensor: | ||
| r"""Generate a simple lookup table that looks up embeddings in a fixed dictionary and size.""" | ||
|
|
||
| assert ( | ||
| (padding_idx is None) | ||
| and (max_norm is None) | ||
| and (scale_grad_by_freq is False) | ||
| and (sparse is False) | ||
| ), "Unsupported parameters." | ||
|
|
||
| assert "cpu" == input.device.type, ( | ||
| "The device of 'input' variable must be on the CPU." | ||
| ) | ||
| if out is None: | ||
| return Tensor(_infinicore.embedding(input._underlying, weight._underlying)) | ||
|
|
||
| _infinicore.embedding_(out._underlying, input._underlying, weight._underlying) | ||
| return out |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| from infinicore.lib import _infinicore | ||
| from infinicore.tensor import Tensor | ||
|
|
||
| __all__ = ["rope", "RopeAlgo"] | ||
|
|
||
|
|
||
| class RopeAlgo: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 向外提供了c++的RoPE的Algo的枚举 |
||
| r"""Different types of RoPE algorithms.""" | ||
|
|
||
| GPT_J = _infinicore.Algo.GPT_J | ||
| GPT_NEOX = _infinicore.Algo.GPT_NEOX | ||
|
|
||
|
|
||
| def rope( | ||
| x: Tensor, | ||
| pos_ids: Tensor, | ||
| sin_table: Tensor, | ||
| cos_table: Tensor, | ||
| algo: RopeAlgo = RopeAlgo.GPT_NEOX, | ||
| *, | ||
| out=None, | ||
| ) -> Tensor: | ||
| r"""Rotary Position Embedding(RoPE).""" | ||
|
|
||
| if out is None: | ||
| return Tensor( | ||
| _infinicore.rope( | ||
| x._underlying, | ||
| pos_ids._underlying, | ||
| sin_table._underlying, | ||
| cos_table._underlying, | ||
| algo, | ||
| ) | ||
| ) | ||
|
|
||
| _infinicore.rope_( | ||
| out._underlying, | ||
| x._underlying, | ||
| pos_ids._underlying, | ||
| sin_table._underlying, | ||
| cos_table._underlying, | ||
| algo, | ||
| ) | ||
| return out | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| #include "infinicore/ops/embedding.hpp" | ||
| #include "infinicore/context/context.hpp" | ||
| #include <cstring> | ||
|
|
||
| namespace infinicore::op { | ||
|
|
||
| Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the indices to extract | ||
| Tensor weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1 | ||
| ) { | ||
| auto input_shape = input->shape(); | ||
| auto weight_shape = weight->shape(); | ||
| auto vocab_size = weight_shape[0]; | ||
| auto embedding_dim = weight_shape[1]; | ||
|
|
||
| // Assign memory to out variables | ||
| auto output_shape = input_shape; | ||
| output_shape.push_back(embedding_dim); | ||
| Tensor inputs_embeds = Tensor::empty(output_shape, weight->dtype(), weight->device()); | ||
|
|
||
| embedding_(inputs_embeds, input, weight); | ||
| return inputs_embeds; | ||
| } | ||
|
|
||
| void embedding_(Tensor out, Tensor input, Tensor weight) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. python的接口中，input只能是cpu类型。 在之前测试中，如果input是gpu，在c++中to到gpu上的话，会报pin_memory相关的警告，然后程序就段错误了。 |
||
| assert(infinicore::DataType::I64 == input->dtype() || (infinicore::DataType::I32 == input->dtype())); | ||
| assert(infinicore::Device::Type::CPU == input->device()); | ||
|
|
||
| auto input_shape = input->shape(); | ||
| auto weight_shape = weight->shape(); | ||
| auto vocab_size = weight_shape[0]; | ||
| auto embedding_dim = weight_shape[1]; | ||
|
|
||
| // Calculate the number of token | ||
| Size counts = 1; | ||
| for (auto &v : input_shape) { | ||
| counts *= v; | ||
| } | ||
|
|
||
| // the bytes of one token | ||
| const Size bytes = dsize(weight->dtype()) * embedding_dim; | ||
| auto *weight_ptr = weight->data(); | ||
| auto *out_ptr = out->data(); | ||
|
|
||
| // copies | ||
| if (weight->device().getType() == Device::Type::CPU) { | ||
| if (infinicore::DataType::I64 == input->dtype()) { | ||
| const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data()); | ||
| for (Size i = 0; i < counts; ++i) { | ||
| int64_t idx = input_arr[i]; | ||
| assert((idx >= 0) && (idx < vocab_size)); | ||
| std::memcpy(out_ptr + i * bytes, | ||
| weight_ptr + idx * bytes, | ||
| bytes); | ||
| } | ||
| } else if (infinicore::DataType::I32 == input->dtype()) { | ||
| const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data()); | ||
|
|
||
| for (Size i = 0; i < counts; ++i) { | ||
| int32_t idx = input_arr[i]; | ||
| assert((idx >= 0) && (idx < vocab_size)); | ||
| std::memcpy(out_ptr + i * bytes, | ||
| weight_ptr + idx * bytes, | ||
| bytes); | ||
| } | ||
| } | ||
|
|
||
| } else { | ||
| if (infinicore::DataType::I64 == input->dtype()) { | ||
| const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data()); | ||
| for (Size i = 0; i < counts; ++i) { | ||
| int64_t idx = input_arr[i]; | ||
| assert((idx >= 0) && (idx < vocab_size)); | ||
| context::memcpyD2D(out_ptr + i * bytes, | ||
| weight_ptr + idx * bytes, | ||
| bytes); | ||
| } | ||
| } else if (infinicore::DataType::I32 == input->dtype()) { | ||
| const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data()); | ||
| for (Size i = 0; i < counts; ++i) { | ||
| int32_t idx = input_arr[i]; | ||
| assert((idx >= 0) && (idx < vocab_size)); | ||
| context::memcpyD2D(out_ptr + i * bytes, | ||
| weight_ptr + idx * bytes, | ||
| bytes); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| } // namespace infinicore::op | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| #pragma once | ||
|
|
||
| #include "infinicore/ops/embedding.hpp" | ||
| #include <pybind11/pybind11.h> | ||
|
|
||
| namespace py = pybind11; | ||
|
|
||
| namespace infinicore::ops { | ||
|
|
||
| inline void bind_embedding(py::module &m) { | ||
|
|
||
| m.def("embedding", | ||
| &op::embedding, | ||
| py::arg("input"), | ||
| py::arg("weight"), | ||
| R"doc(Generate a simple lookup table that looks up embeddings in a fixed dictionary and size.)doc"); | ||
|
|
||
| m.def("embedding_", | ||
| &op::embedding_, | ||
| py::arg("out"), | ||
| py::arg("input"), | ||
| py::arg("weight"), | ||
| R"doc(In-place, Generate a simple lookup table that looks up embeddings in a fixed dictionary and size.)doc"); | ||
| } | ||
|
|
||
| } // namespace infinicore::ops |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| #pragma once | ||
|
|
||
| #include <pybind11/pybind11.h> | ||
|
|
||
| #include "infinicore/ops/rope.hpp" | ||
|
|
||
| namespace py = pybind11; | ||
|
|
||
| namespace infinicore::ops { | ||
|
|
||
| inline void bind_rope(py::module &m) { | ||
|
|
||
| py::enum_<infinicore::nn::RoPE::Algo>(m, "Algo") | ||
| .value("GPT_J", infinicore::nn::RoPE::Algo::GPT_J) | ||
| .value("GPT_NEOX", infinicore::nn::RoPE::Algo::GPT_NEOX); | ||
|
|
||
| m.def("rope", | ||
| &op::rope, | ||
| py::arg("x"), | ||
| py::arg("pos"), | ||
| py::arg("sin_cache"), | ||
| py::arg("cos_cache"), | ||
| py::arg("algo"), | ||
| R"doc( Rotary Position Embedding(RoPE).)doc"); | ||
|
|
||
| m.def("rope_", | ||
| &op::rope_, | ||
| py::arg("x_out"), | ||
| py::arg("x"), | ||
| py::arg("pos"), | ||
| py::arg("sin_cache"), | ||
| py::arg("cos_cache"), | ||
| py::arg("algo"), | ||
| R"doc(In-place, Rotary Position Embedding(RoPE).)doc"); | ||
| } | ||
|
|
||
| } // namespace infinicore::ops |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
文件名应该是rope吧