9 changes: 9 additions & 0 deletions include/infinicore/ops/embedding.hpp
@@ -0,0 +1,9 @@
#pragma once

#include "common/op.hpp"

namespace infinicore::op {

Tensor embedding(Tensor input, Tensor weight);
void embedding_(Tensor out, Tensor input, Tensor weight);
} // namespace infinicore::op
1 change: 1 addition & 0 deletions include/infinicore/ops/linear.hpp
@@ -1,6 +1,7 @@
#pragma once

#include "common/op.hpp"
#include <optional>

namespace infinicore::op {

8 changes: 4 additions & 4 deletions include/infinicore/ops/rope.hpp
@@ -1,21 +1,21 @@
#pragma once

#include "../device.hpp"
#include "../tensor.hpp"
#include "../nn/rope.hpp"
#include "../tensor.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class RoPE {
public:
using schema = void (*)(Tensor, const Tensor &, const Tensor &, const Tensor &, const Tensor &, infinicore::nn::RoPE::Algo);
static void execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);
static void execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo);
static common::OpDispatcher<schema> &dispatcher();
};

// Internal function
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo);

// Public API that uses infinicore::nn::RoPE::Algo
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo);
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo);
} // namespace infinicore::op
14 changes: 13 additions & 1 deletion python/infinicore/nn/functional/__init__.py
@@ -1,8 +1,20 @@
from .causal_softmax import causal_softmax
from .embedding import embedding
from .linear import linear
from .random_sample import random_sample
from .rms_norm import rms_norm
from .rope import RopeAlgo, rope
from .silu import silu
from .swiglu import swiglu

__all__ = ["causal_softmax", "random_sample", "rms_norm", "silu", "swiglu", "linear"]
__all__ = [
"causal_softmax",
"random_sample",
"rms_norm",
"silu",
"swiglu",
"linear",
"embedding",
"rope",
"RopeAlgo",
]
35 changes: 35 additions & 0 deletions python/infinicore/nn/functional/embedding.py
@@ -0,0 +1,35 @@
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

__all__ = ["embedding"]


def embedding(
input: Tensor,
weight: Tensor,
padding_idx=None,
max_norm=None,
norm_type=2.0,
scale_grad_by_freq=False,
sparse=False,
*,
out=None,
) -> Tensor:
r"""Generate a simple lookup table that looks up embeddings in a fixed dictionary and size."""

assert (
(padding_idx is None)
and (max_norm is None)
and (scale_grad_by_freq is False)
and (sparse is False)
), "Unsupported parameters."

assert "cpu" == input.device.type, (
"The device of 'input' variable must be on the CPU."
)

if out is None:
return Tensor(_infinicore.embedding(input._underlying, weight._underlying))

_infinicore.embedding_(out._underlying, input._underlying, weight._underlying)
return out
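
For reference, the wrapper above simply gathers rows of `weight` indexed by the integer ids. A minimal NumPy sketch of that contract (illustration only, not the infinicore API; the array names and shapes are made up):

```python
import numpy as np

ids = np.array([[1, 3], [0, 2]], dtype=np.int64)     # token ids, shape (batch, seq)
weight = np.random.rand(5, 8).astype(np.float32)     # embedding matrix, shape (vocab, dim)

reference = weight[ids]                               # shape (batch, seq, dim): one row per id
```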
44 changes: 44 additions & 0 deletions python/infinicore/nn/functional/rope.py
Collaborator: The file name should be rope, right?

Contributor Author: Fixed.
@@ -0,0 +1,44 @@
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

__all__ = ["rope", "RopeAlgo"]


class RopeAlgo:
r"""Different types of RoPE algorithms."""

GPT_J = _infinicore.Algo.GPT_J
GPT_NEOX = _infinicore.Algo.GPT_NEOX


def rope(
x: Tensor,
pos_ids: Tensor,
sin_table: Tensor,
cos_table: Tensor,
algo: RopeAlgo = RopeAlgo.GPT_NEOX,
*,
out=None,
) -> Tensor:
r"""Rotary Position Embedding(RoPE)."""

if out is None:
return Tensor(
_infinicore.rope(
x._underlying,
pos_ids._underlying,
sin_table._underlying,
cos_table._underlying,
algo,
)
)

_infinicore.rope_(
out._underlying,
x._underlying,
pos_ids._underlying,
sin_table._underlying,
cos_table._underlying,
algo,
)
return out
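
As a sanity reference for what a GPT-NeoX-style rotation computes, here is a minimal NumPy sketch. The half-split layout and table shapes are assumptions for illustration, not the kernel's actual contract; the GPT_J variant interleaves even/odd dimensions instead:

```python
import numpy as np

def rope_neox_reference(x, pos_ids, sin_table, cos_table):
    # x: (seq, n_heads, head_dim); pos_ids: (seq,); tables: (max_pos, head_dim // 2)
    half = x.shape[-1] // 2
    sin = sin_table[pos_ids][:, None, :]    # (seq, 1, half), broadcast over heads
    cos = cos_table[pos_ids][:, None, :]
    x1, x2 = x[..., :half], x[..., half:]
    return np.concatenate([x1 * cos - x2 * sin,
                           x1 * sin + x2 * cos], axis=-1)
```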
90 changes: 90 additions & 0 deletions src/infinicore/ops/embedding/embedding.cc
@@ -0,0 +1,90 @@
#include "infinicore/ops/embedding.hpp"
#include "infinicore/context/context.hpp"
#include <cstring>

namespace infinicore::op {

Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the indices to extract
Tensor weight // Embedding matrix of floating-point type with shape (V, embedding_dim), where V = maximum index + 1
) {
auto input_shape = input->shape();
auto weight_shape = weight->shape();
auto vocab_size = weight_shape[0];
auto embedding_dim = weight_shape[1];

// Assign memory to out variables
auto output_shape = input_shape;
output_shape.push_back(embedding_dim);
Tensor inputs_embeds = Tensor::empty(output_shape, weight->dtype(), weight->device());

embedding_(inputs_embeds, input, weight);
return inputs_embeds;
}

void embedding_(Tensor out, Tensor input, Tensor weight) {
Contributor Author (@pengcheng888, Nov 17, 2025): In the Python interface, `input` can only be a CPU tensor. In earlier testing, when `input` was a GPU tensor and we moved it with `to` onto the GPU in C++, a pin_memory-related warning was raised and the program then crashed with a segmentation fault.
assert((infinicore::DataType::I64 == input->dtype()) || (infinicore::DataType::I32 == input->dtype()));
assert(infinicore::Device::Type::CPU == input->device().getType());

auto input_shape = input->shape();
auto weight_shape = weight->shape();
auto vocab_size = weight_shape[0];
auto embedding_dim = weight_shape[1];

// Calculate the number of tokens
Size counts = 1;
for (auto &v : input_shape) {
counts *= v;
}

// Number of bytes in one embedding row (one token)
const Size bytes = dsize(weight->dtype()) * embedding_dim;
auto *weight_ptr = weight->data();
auto *out_ptr = out->data();

// Copy one embedding row per token
if (weight->device().getType() == Device::Type::CPU) {
if (infinicore::DataType::I64 == input->dtype()) {
const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data());
for (Size i = 0; i < counts; ++i) {
int64_t idx = input_arr[i];
assert((idx >= 0) && (idx < vocab_size));
std::memcpy(out_ptr + i * bytes,
weight_ptr + idx * bytes,
bytes);
}
} else if (infinicore::DataType::I32 == input->dtype()) {
const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data());

for (Size i = 0; i < counts; ++i) {
int32_t idx = input_arr[i];
assert((idx >= 0) && (idx < vocab_size));
std::memcpy(out_ptr + i * bytes,
weight_ptr + idx * bytes,
bytes);
}
}

} else {
if (infinicore::DataType::I64 == input->dtype()) {
const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data());
for (Size i = 0; i < counts; ++i) {
int64_t idx = input_arr[i];
assert((idx >= 0) && (idx < vocab_size));
context::memcpyD2D(out_ptr + i * bytes,
weight_ptr + idx * bytes,
bytes);
}
} else if (infinicore::DataType::I32 == input->dtype()) {
const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data());
for (Size i = 0; i < counts; ++i) {
int32_t idx = input_arr[i];
assert((idx >= 0) && (idx < vocab_size));
context::memcpyD2D(out_ptr + i * bytes,
weight_ptr + idx * bytes,
bytes);
}
}
}
}

} // namespace infinicore::op
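
The copy loops above are equivalent to the following NumPy sketch (CPU path only; same counts and bounds arithmetic, shown purely to document the behaviour, not part of the PR):

```python
import numpy as np

def embedding_reference(input_ids: np.ndarray, weight: np.ndarray) -> np.ndarray:
    vocab_size, embedding_dim = weight.shape
    flat = input_ids.reshape(-1)                          # counts = number of tokens
    out = np.empty((flat.size, embedding_dim), dtype=weight.dtype)
    for i, idx in enumerate(flat):                        # one row copied per token
        assert 0 <= idx < vocab_size                      # same bounds check as the C++ assert
        out[i] = weight[idx]
    return out.reshape(*input_ids.shape, embedding_dim)
```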
12 changes: 6 additions & 6 deletions src/infinicore/ops/rope/rope.cc
@@ -9,25 +9,25 @@ common::OpDispatcher<RoPE::schema> &RoPE::dispatcher() {
return dispatcher_;
};

void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
auto device_type = context::getDevice().getType();
auto func = dispatcher().lookup(device_type);

if (func == nullptr) {
throw std::runtime_error("No RoPE implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}

func(x_out, x, pos, sin_cache, cos_cache, algo);
func(x_out, x, pos, sin_table, cos_table, algo);
}

void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
RoPE::execute(x_out, x, pos, sin_cache, cos_cache, algo);
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
RoPE::execute(x_out, x, pos, sin_table, cos_table, algo);
}

Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
Shape shape = x->shape();
auto x_out = Tensor::empty(shape, x->dtype(), x->device());
rope_(x_out, x, pos, sin_cache, cos_cache, algo);
rope_(x_out, x, pos, sin_table, cos_table, algo);
return x_out;
}

4 changes: 4 additions & 0 deletions src/infinicore/pybind11/ops.hpp
@@ -5,12 +5,14 @@
#include "ops/add.hpp"
#include "ops/attention.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/embedding.hpp"
#include "ops/linear.hpp"
#include "ops/matmul.hpp"
#include "ops/mul.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/swiglu.hpp"

@@ -30,6 +32,8 @@ inline void bind(py::module &m) {
bind_rms_norm(m);
bind_silu(m);
bind_swiglu(m);
bind_rope(m);
bind_embedding(m);
}

} // namespace infinicore::ops
26 changes: 26 additions & 0 deletions src/infinicore/pybind11/ops/embedding.hpp
@@ -0,0 +1,26 @@
#pragma once

#include "infinicore/ops/embedding.hpp"
#include <pybind11/pybind11.h>

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_embedding(py::module &m) {

m.def("embedding",
&op::embedding,
py::arg("input"),
py::arg("weight"),
R"doc(Generate a simple lookup table that looks up embeddings in a fixed dictionary and size..)doc");

m.def("embedding_",
&op::embedding_,
py::arg("out"),
py::arg("input"),
py::arg("weight"),
R"doc(In-place, Generate a simple lookup table that looks up embeddings in a fixed dictionary and size..)doc");
}

} // namespace infinicore::ops
37 changes: 37 additions & 0 deletions src/infinicore/pybind11/ops/rope.hpp
@@ -0,0 +1,37 @@
#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/rope.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_rope(py::module &m) {

py::enum_<infinicore::nn::RoPE::Algo>(m, "Algo")
.value("GPT_J", infinicore::nn::RoPE::Algo::GPT_J)
.value("GPT_NEOX", infinicore::nn::RoPE::Algo::GPT_NEOX);

m.def("rope",
&op::rope,
py::arg("x"),
py::arg("pos"),
py::arg("sin_table"),
py::arg("cos_table"),
py::arg("algo"),
R"doc( Rotary Position Embedding(RoPE).)doc");

m.def("rope_",
&op::rope_,
py::arg("x_out"),
py::arg("x"),
py::arg("pos"),
py::arg("sin_table"),
py::arg("cos_table"),
py::arg("algo"),
R"doc(In-place, Rotary Position Embedding(RoPE).)doc");
}

} // namespace infinicore::ops