@@ -43,6 +43,7 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config,
    } else {
        throw std::runtime_error("num_attention_heads / tp_size error.");
    }
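+    // Standard scaled dot-product attention scale: scores are divided by sqrt(head_dim) before the softmax.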
+    scaling_ = 1.0f / std::sqrt(static_cast<float>(head_dim_));

    // Initialize projection layers
    INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_,
@@ -52,17 +53,10 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config,
                          dtype, device, tp_rank, tp_size, rank_info.comm);
}

-infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
-                                           const infinicore::Tensor &position_ids,
-                                           std::shared_ptr<cache::Cache> kv_cache,
-                                           std::optional<infinicore::Tensor> cache_lengths,
-                                           std::optional<infinicore::Tensor> input_lengths,
-                                           std::optional<infinicore::Tensor> input_offsets,
-                                           std::optional<infinicore::Tensor> block_tables,
-                                           std::optional<infinicore::Tensor> slot_mapping) const {
-    if (!rotary_emb_) {
-        throw std::runtime_error("LlamaAttention: rotary_emb not configured");
-    }
+infinicore::Tensor LlamaAttention::forward_static_(const infinicore::Tensor &hidden_states,
+                                                   const infinicore::Tensor &position_ids,
+                                                   std::shared_ptr<infinilm::cache::Cache> kv_cache,
+                                                   std::optional<infinicore::Tensor> cache_lengths) const {
    // Input shape: [batch, seq_len, hidden_size]
    auto hidden_states_mutable = hidden_states;
    auto shape = hidden_states->shape();
@@ -73,7 +67,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
    auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);

    // 2. Reshape for multi-head attention
-
    // Reshape Q, K, V to include batch dimension
    // Python: query_states = self.q_proj(hidden_states).view(querys_shape)
    // The view operation requires the tensor to be contiguous in the required dimensions
@@ -114,13 +107,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
        auto [k_total_tmp, v_total_tmp] = static_kv_cache->update(layer_idx_, k_permuted, v_permuted, cache_lengths.value());
        k_total = k_total_tmp;
        v_total = v_total_tmp;
-    } else if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) {
-        auto [k_total_tmp, v_total_tmp] = paged_kv_cache->update(layer_idx_, k_permuted, v_permuted, slot_mapping.value());
-        k_total = k_total_tmp;
-        v_total = v_total_tmp;
-
-        /// @todo Implement paged attention here.
-        throw std::runtime_error("LlamaAttention: Paged attention not implemented");
    } else {
        throw std::runtime_error("LlamaAttention: Unsupported kvcache type");
    }
@@ -152,8 +138,136 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
    return output;
}

+infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidden_states,
+                                                  const infinicore::Tensor &position_ids,
+                                                  std::shared_ptr<infinilm::cache::PagedKVCache> paged_kv_cache,
+                                                  std::optional<infinicore::Tensor> cache_lengths,
+                                                  std::optional<infinicore::Tensor> input_lengths,
+                                                  std::optional<infinicore::Tensor> input_offsets,
+                                                  std::optional<infinicore::Tensor> block_tables,
+                                                  std::optional<infinicore::Tensor> slot_mapping) const {
+    if (!block_tables.has_value() || !input_lengths.has_value() || !slot_mapping.has_value()) {
+        throw std::runtime_error("LlamaAttention::forward_paged: block_tables, input_lengths, or slot_mapping is not set");
+    }
+
+    // Input shape: [batch, seq_len, hidden_size]
+    auto hidden_states_mutable = hidden_states;
+    auto shape = hidden_states->shape();
+    size_t batch_size = shape[0];
+    size_t seq_len = shape[1];
+
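+    // During decode each request contributes exactly one token, so batch_size * seq_len equals the
+    // number of entries in input_lengths; any mismatch means this call is a prefill step.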
+    bool is_prefill = (batch_size * seq_len != input_lengths.value()->shape()[0]);
+    assert(batch_size == 1);
+
+    // 1. Project Q, K, V
+    auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);
+
+    // 2. Reshape for multi-head attention
+
+    // Reshape Q, K, V to include batch dimension
+    // Python: query_states = self.q_proj(hidden_states).view(querys_shape)
+    // The view operation requires the tensor to be contiguous in the required dimensions
+    auto q_reshaped = q->view({batch_size, seq_len, num_attention_heads_, head_dim_});
+    auto k_reshaped = k->view({batch_size, seq_len, num_key_value_heads_, head_dim_});
+    auto v_reshaped = v->view({batch_size, seq_len, num_key_value_heads_, head_dim_});
+
+    // 3. Prepare position_ids for RoPE - align with Python pattern
+
+    auto pos_shape = position_ids->shape();
+    infinicore::Tensor pos_ids_for_rope = position_ids;
+    if (pos_shape.size() == 2) {
+        auto pos_narrowed = position_ids->narrow({{0, 0, 1}});
+        pos_ids_for_rope = pos_narrowed->contiguous()->view({pos_shape[1]});
+    } else if (pos_shape.size() == 1) {
+        pos_ids_for_rope = position_ids->contiguous();
+    } else {
+        throw std::runtime_error("Unexpected position_ids shape");
+    }
+
+    // 4. Apply RoPE to Q and K
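+    // The RoPE output buffers are allocated contiguously in [bs, n_head, seq_len, head_dim] order and then
+    // viewed as [bs, seq_len, n_head, head_dim]; permuting them back to head-major later needs no copy.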
+    auto q_rope = infinicore::Tensor::empty({batch_size, num_attention_heads_, seq_len, head_dim_}, q_reshaped->dtype(), q_reshaped->device())->permute({0, 2, 1, 3});
+    auto k_rope = infinicore::Tensor::empty({batch_size, num_key_value_heads_, seq_len, head_dim_}, q_reshaped->dtype(), q_reshaped->device())->permute({0, 2, 1, 3});
+    rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim]
+    rotary_emb_->forward(k_rope, k_reshaped, pos_ids_for_rope); // [bs, seq_len, n_kv_head, head_dim]
+
+    // 5. Prepare KV caches
+    // Ensure contiguous after permute for F16 compatibility with cache operations
+    auto [k_total, v_total] = paged_kv_cache->update(layer_idx_,
+                                                     k_rope->contiguous(), // without contiguous() this fails with "Incompatible shape for view operation."
+                                                     v_reshaped,
+                                                     slot_mapping.value());
+
+    // 6. Compute attention
+    infinicore::Tensor attn_output;
+    if (is_prefill) {
+        q_reshaped = q_rope->permute({0, 2, 1, 3});           // [bs, n_q_head, seq_len, head_dim]
+        auto k_permuted = k_rope->permute({0, 2, 1, 3});      // [bs, n_kv_head, seq_len, head_dim]
+        auto v_permuted = v_reshaped->permute({0, 2, 1, 3});  // [bs, n_kv_head, seq_len, head_dim]
+
+        auto total_seq_len = k_permuted->shape()[2];
+        size_t ngroup = num_attention_heads_ / num_key_value_heads_;
+
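+        // Grouped-query attention: each KV head serves ngroup query heads, so the grouped query heads are
+        // folded into the row dimension and a single batched matmul per KV head covers all of them.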
+        auto Q = q_reshaped->view({batch_size * num_key_value_heads_, ngroup * seq_len, head_dim_});
+        auto K = k_permuted->view({batch_size * num_key_value_heads_, total_seq_len, head_dim_});
+        auto V = v_permuted->contiguous()->view({batch_size * num_key_value_heads_, total_seq_len, head_dim_});
+
+        auto K_transposed = K->permute({0, 2, 1}); // [bs * n_kv_head, head_dim, total_seq_len]
+
+        auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling_); // [bs * n_kv_head, ng * seq_len, total_seq_len]
+
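+        // View the scores as one row block per query head and apply the causal softmax in place,
+        // masking out key positions beyond each query's own position.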
+        auto attn_weight_softmax = attn_weight->view({batch_size * num_attention_heads_, seq_len, total_seq_len});
+        infinicore::op::causal_softmax_(attn_weight_softmax, attn_weight_softmax);
+
+        auto out = infinicore::op::matmul(attn_weight, V); // [bs * n_kv_head, ng * seq_len, head_dim]
+
+        attn_output = out->view({batch_size, num_attention_heads_, seq_len, head_dim_})
+                          ->permute({0, 2, 1, 3})
+                          ->contiguous()
+                          ->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); // [bs, seq_len, n_q_head * head_dim]
+
+    } else {
+        q_reshaped = q_rope->contiguous()->view({1 * seq_len, num_attention_heads_, head_dim_}); // q_reshaped must be contiguous for the view
+        auto out = infinicore::Tensor::empty({1 * seq_len, num_attention_heads_, head_dim_}, q_reshaped->dtype(), q_reshaped->device());
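+        // Decode path: the fused paged-attention kernel gathers each sequence's cached K/V blocks via
+        // block_tables and attends over them for the single new query token.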
+        infinicore::op::paged_attention_(out,
+                                         q_reshaped,
+                                         k_total,
+                                         v_total,
+                                         block_tables.value(),
+                                         input_lengths.value(),
+                                         std::nullopt,
+                                         scaling_);
+
+        attn_output = out->view({1, seq_len, num_attention_heads_, head_dim_})->view({1, seq_len, num_attention_heads_ * head_dim_}); // [bs, seq_len, n_q_head * head_dim]
+    }
+
+    // 7. Project output
+    return o_proj_->forward(attn_output); // e.g. [1, 13, 3584] => [1, 13, 4096]
+}
+
+infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
+                                           const infinicore::Tensor &position_ids,
+                                           std::shared_ptr<cache::Cache> kv_cache,
+                                           std::optional<infinicore::Tensor> cache_lengths,
+                                           std::optional<infinicore::Tensor> input_lengths,
+                                           std::optional<infinicore::Tensor> input_offsets,
+                                           std::optional<infinicore::Tensor> block_tables,
+                                           std::optional<infinicore::Tensor> slot_mapping) const {
+    if (!rotary_emb_) {
+        throw std::runtime_error("LlamaAttention: rotary_emb not configured");
+    }
+
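+    // Dispatch on the concrete cache type: a paged KV cache takes the paged-attention path,
+    // anything else falls back to the static-cache path.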
+    infinicore::Tensor output;
+    if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) {
+        output = forward_paged_(hidden_states, position_ids, paged_kv_cache, cache_lengths, input_lengths, input_offsets, block_tables, slot_mapping);
+    } else {
+        output = forward_static_(hidden_states, position_ids, kv_cache, cache_lengths);
+    }
+    return output;
+}
+
void LlamaAttention::set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb) {
    rotary_emb_ = rotary_emb;
}

-} // namespace infinilm::models::llama
+} // namespace infinilm::models::llama