Commit e686a10

refactor: simplify atb context and workspace call chain. (jd-opensource#141)
1 parent 11cd05a commit e686a10

67 files changed: +386 −565 lines changed

xllm/core/framework/CMakeLists.txt

Lines changed: 12 additions & 0 deletions

@@ -65,3 +65,15 @@ cc_library(
     :tokenizer
     torch
 )
+
+cc_library(
+  NAME
+    model_context
+  HDRS
+    model_context.h
+  SRCS
+    model_context.cpp
+  DEPS
+    torch
+    $<$<BOOL:${USE_NPU}>:torch_npu>
+)
xllm/core/framework/model_context.cpp

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "core/framework/model_context.h"
+
+#include <torch/torch.h>
+#if defined(USE_NPU)
+#ifdef TORCH_HIGHER_THAN_PTA6
+// #include <torch_npu/csrc/core/npu/NPUFormat.h>
+#include <torch_npu/csrc/framework/OpCommand.h>
+#else
+#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
+#include <torch_npu/csrc/framework/utils/OpPreparation.h>
+#endif
+#include <torch_npu/csrc/libs/init_npu.h>
+#endif
+
+namespace xllm {
+ModelContext::ModelContext(const ParallelArgs& input_parallel_args,
+                           const ModelArgs& model_args,
+                           const QuantArgs& quant_args,
+                           const torch::TensorOptions& tensor_options)
+    : parallel_args_(input_parallel_args),
+      model_args_(model_args),
+      quant_args_(quant_args),
+      tensor_options_(tensor_options) {
+#if defined(USE_NPU)
+  int32_t device_id = tensor_options.device().index();
+  void* stream = c10_npu::getCurrentNPUStream(device_id).stream();
+  atb::CreateContext(&context_);
+  context_->SetExecuteStream(stream);
+  context_->SetAsyncTilingCopyStatus(true);
+#endif
+}
+}  // namespace xllm
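
A minimal sketch (not part of this commit) of how the new ModelContext is expected to be built once per device and then shared with layers; the helper name and the ParallelArgs values are placeholders for illustration, mirroring the default constructor in the header below.

#include <torch/torch.h>
#include "core/framework/model_context.h"

// Hypothetical helper; argument values are illustrative only.
void build_model_context_example(const xllm::ModelArgs& model_args,
                                 const xllm::QuantArgs& quant_args,
                                 const torch::TensorOptions& options) {
  // Placeholder rank/world_size, mirroring ModelContext's default constructor.
  xllm::ParallelArgs parallel_args(1, 1, nullptr);
  xllm::ModelContext context(parallel_args, model_args, quant_args, options);
#if defined(USE_NPU)
  // On NPU builds the constructor creates the atb::Context and binds it to
  // the device's current stream, so layers no longer receive one per call.
  const atb::Context* atb_ctx = context.get_atb_context();
  (void)atb_ctx;
#endif
}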
xllm/core/framework/model_context.h

Lines changed: 29 additions & 22 deletions

@@ -15,6 +15,10 @@ limitations under the License.
 
 #pragma once
 
+#if defined(USE_NPU)
+#include <acl/acl.h>
+#endif
+
 #include <memory>
 
 #include "core/framework/model/model_args.h"
@@ -23,38 +27,41 @@ limitations under the License.
 
 namespace xllm {
 
-class Context {
+class ModelContext {
  public:
-  Context(const ParallelArgs& input_parallel_args)
-      : parallel_args(input_parallel_args) {}
+  ModelContext() : parallel_args_(1, 1, nullptr) {};
 
-  const ModelArgs& get_model_args() const { return model_args; }
-  void set_model_args(const ModelArgs& model_args) {
-    this->model_args = model_args;
-  }
+  ModelContext(const ParallelArgs& input_parallel_args,
+               const ModelArgs& model_args,
+               const QuantArgs& quant_args,
+               const torch::TensorOptions& tensor_options);
 
-  const QuantArgs& get_quant_args() const { return quant_args; }
-  void set_quant_args(const QuantArgs& quant_args) {
-    this->quant_args = quant_args;
-  }
+  const ModelArgs& get_model_args() const { return model_args_; }
+
+  const QuantArgs& get_quant_args() const { return quant_args_; }
 
-  const ParallelArgs& get_parallel_args() const { return parallel_args; }
-  // void set_paralle_args(const ParallelArgs& parallel_args) {
-  //   this->parallel_args = parallel_args;
-  // }
+  const ParallelArgs& get_parallel_args() const { return parallel_args_; }
 
   const torch::TensorOptions& get_tensor_options() const {
-    return tensor_options;
+    return tensor_options_;
   }
-  void set_tensor_options(const torch::TensorOptions& tensor_options) {
-    this->tensor_options = tensor_options;
+
+  const atb::Context* get_atb_context() const { return context_; }
+
+  void set_image_embedding_mode(bool image_embedding_mode) {
+    model_args_.image_embedding_mode() = image_embedding_mode;
   }
 
  private:
-  ModelArgs model_args;
-  QuantArgs quant_args;
-  ParallelArgs parallel_args;
-  torch::TensorOptions tensor_options;
+  ModelArgs model_args_;
+  QuantArgs quant_args_;
+  ParallelArgs parallel_args_;
+  torch::TensorOptions tensor_options_;
+
+#if defined(USE_NPU)
+  // used for npu atb
+  atb::Context* context_;
+#endif
 };
 
 }  // namespace xllm
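
With the set_* methods removed, the arguments are fixed at construction time; the one remaining mutator is set_image_embedding_mode(). A short sketch of a hypothetical multimodal call site (not in this commit):

// Hypothetical call site; the function name is illustrative only.
void enable_image_embedding(xllm::ModelContext& context) {
  // Everything else on ModelContext is fixed when it is constructed.
  context.set_image_embedding_mode(true);
}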

xllm/core/layers/npu/atb_base.cpp

Lines changed: 6 additions & 5 deletions

@@ -20,10 +20,11 @@ limitations under the License.
 namespace xllm::hf {
 static std::atomic<bool> g_executeOk(true);
 
-ATBBase::ATBBase(const Context& context)
+ATBBase::ATBBase(const ModelContext& context)
     : device_(context.get_tensor_options().device()),
       name_(""),
       parallel_args_(context.get_parallel_args()) {
+  context_ = const_cast<atb::Context*>(context.get_atb_context());
   auto quant_args = context.get_quant_args();
   if (!quant_args.quantize_type().empty()) {
     quantize_type_ = quant_args.quantize_type();
@@ -39,6 +40,8 @@ ATBBase::ATBBase(const Context& context)
   CHECK_EQ(parallel_args_.world_size(), dp_size_ * dp_local_tp_size_);
   dp_local_tp_rank_ = parallel_args_.rank() % dp_local_tp_size_;
 
+  work_space_ = AtbWorkspace(device_);
+
   runTaskFunc_ = std::bind(
       &ATBBase::run_task, this, std::placeholders::_1, std::placeholders::_2);
 }
@@ -195,8 +198,6 @@ void ATBBase::run_task(std::string taskName, std::function<int()> task) const {
 }
 
 atb::Status ATBBase::execute_node(atb_speed::Model::Node& node,
-                                  atb::Context* context,
-                                  AtbWorkspace& workspace,
                                   int nodeId,
                                   aclrtEvent* event,
                                   std::atomic<bool>* event_flag) {
@@ -208,7 +209,7 @@ atb::Status ATBBase::execute_node(atb_speed::Model::Node& node,
        << std::endl;
     throw std::runtime_error(ss.str());
   }
-  context_ = context;
+
   atb::Status st =
       node.operation->Setup(node.variantPack, node.workspaceSize, context_);
   if (st != 0) {
@@ -217,7 +218,7 @@ atb::Status ATBBase::execute_node(atb_speed::Model::Node& node,
   }
 
   if (node.workspaceSize > 0) {
-    node.workspace = workspace.GetWorkspaceBuffer(node.workspaceSize);
+    node.workspace = work_space_.GetWorkspaceBuffer(node.workspaceSize);
   }
 
   runTaskFunc_(name_ + std::to_string(nodeId), [=]() {
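
The net effect on derived layers, shown as a rough sketch (the ExampleLayer class, node_, and out_ members are invented for illustration): the ATB context and workspace are cached by the ATBBase constructor, so execute_node() no longer takes them as parameters.

#include "atb_base.h"  // sketch assumes the layer lives next to ATBBase

namespace xllm::hf {
// Hypothetical layer built on ATBBase after this change.
class ExampleLayer : public ATBBase {
 public:
  explicit ExampleLayer(const ModelContext& context) : ATBBase(context) {}

  torch::Tensor forward(const torch::Tensor& input, int nodeId) {
    build_variant_pack(input);
    // context_ and work_space_ were captured in the ATBBase constructor,
    // so they no longer travel through the call chain.
    atb::Status st = execute_node(node_, nodeId);
    LOG_IF(FATAL, st != 0) << "execute example node fail, error code: " << st;
    return out_;
  }

 private:
  void build_variant_pack(const torch::Tensor& input);
  atb_speed::Model::Node node_;
  torch::Tensor out_;
};
}  // namespace xllm::hf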

xllm/core/layers/npu/atb_base.h

Lines changed: 3 additions & 4 deletions

@@ -29,9 +29,9 @@ limitations under the License.
 
 #include "atb/atb_infer.h"
 #include "buffer/atb_workspace.h"
-#include "framework/context.h"
 #include "framework/kv_cache/kv_cache.h"
 #include "framework/model/model_input_params.h"
+#include "framework/model_context.h"
 #include "framework/state_dict/state_dict.h"
 #include "pytorch/adapter/utils/utils.h"
 #include "pytorch/adapter/workspace/workspace.h"
@@ -97,7 +97,7 @@ enum class LinearTypeV2 : int {
 
 class ATBBase {
  public:
-  ATBBase(const Context& context);
+  ATBBase(const ModelContext& context);
   virtual ~ATBBase() {};
 
   using Task = std::function<int()>;
@@ -132,8 +132,6 @@ class ATBBase {
   // void get_sharded(at::Tensor weight_tensor,int dim);
 
   atb::Status execute_node(atb_speed::Model::Node& node,
-                           atb::Context* context,
-                           AtbWorkspace& workspace,
                            int nodeId = 0,
                            aclrtEvent* event = nullptr,
                            std::atomic<bool>* event_flag = nullptr);
@@ -152,6 +150,7 @@ class ATBBase {
 
  protected:
   atb::Context* context_;
+  AtbWorkspace work_space_;
   std::vector<at::Tensor> at_weight_tensors_;
   std::vector<atb::Tensor> atb_weight_tensors_;
   at::Device device_;

xllm/core/layers/npu/atb_head_impl.cpp

Lines changed: 2 additions & 14 deletions

@@ -67,7 +67,7 @@ void AtbLmHeadImpl::param_from_args(atb_speed::common::LmHeadParam& param,
   }
 }
 
-AtbLmHeadImpl::AtbLmHeadImpl(const Context& context) : ATBBase(context) {
+AtbLmHeadImpl::AtbLmHeadImpl(const ModelContext& context) : ATBBase(context) {
   param_from_args(llm_head_param_prefill_,
                   context.get_model_args(),
                   context.get_parallel_args(),
@@ -161,22 +161,10 @@ int64_t AtbLmHeadImpl::init_node(atb_speed::Model::Node& node,
 
 torch::Tensor AtbLmHeadImpl::forward(const torch::Tensor& hidden_states,
                                      const torch::Tensor& seleted_idxes,
-                                     atb::Context* context,
-                                     AtbWorkspace& workspace,
                                      int nodeId) {
   atb::Status st;
   build_node_variant_pack(llm_head_node_prefill_, hidden_states, seleted_idxes);
-  st = execute_node(llm_head_node_prefill_, context, workspace, nodeId);
-  // if (is_prefill) {
-  //   build_node_variant_pack(llm_head_node_prefill_,
-  //   hidden_states,seleted_idxes); st = execute_node(llm_head_node_prefill_,
-  //   context, workspace ,nodeId);
-  // } else {
-  //   build_node_variant_pack(llm_head_node_decode_,
-  //   hidden_states,seleted_idxes); st = execute_node(llm_head_node_decode_,
-  //   context, workspace ,nodeId);
-  // }
-  // c10_npu::NPUCachingAllocator::emptyCache();
+  st = execute_node(llm_head_node_prefill_, nodeId);
   LOG_IF(FATAL, st != 0) << model_name_
                          << "execute llmhead node fail, error code: " << st;
   return atOutTensors_[0];
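
A hedged sketch of a caller of the trimmed lm-head interface (the helper name and tensor names are illustrative; only the new three-argument signature is taken from the diff):

#include "atb_head_impl.h"  // assumed include for this sketch

// Hypothetical call site: atb::Context* and AtbWorkspace& no longer appear.
torch::Tensor compute_logits_example(xllm::hf::AtbLmHeadImpl& lm_head,
                                     const torch::Tensor& hidden_states,
                                     const torch::Tensor& selected_idxes) {
  return lm_head.forward(hidden_states, selected_idxes, /*nodeId=*/0);
}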

xllm/core/layers/npu/atb_head_impl.h

Lines changed: 2 additions & 4 deletions

@@ -28,8 +28,8 @@ limitations under the License.
 
 #include "atb/atb_infer.h"
 #include "atb_base.h"
-#include "framework/context.h"
 #include "framework/model/model_input_params.h"
+#include "framework/model_context.h"
 #include "layers/npu/llm_head.h"
 #include "nlohmann/json.hpp"
 #include "pytorch/adapter/utils/utils.h"
@@ -47,7 +47,7 @@ class AtbLmHeadImpl : public LlmHeadImpl, public ATBBase {
   using RunTaskFunc =
       std::function<void(const std::string& taskName, Task task)>;
 
-  explicit AtbLmHeadImpl(const Context& context);
+  explicit AtbLmHeadImpl(const ModelContext& context);
 
   ~AtbLmHeadImpl() {};
 
@@ -66,8 +66,6 @@ class AtbLmHeadImpl : public LlmHeadImpl, public ATBBase {
 
   torch::Tensor forward(const torch::Tensor& hidden_states,
                         const torch::Tensor& seleted_idxes,
-                        atb::Context* context,
-                        AtbWorkspace& workspace,
                         int nodeId) override;
 
   // void build_node_variant_pack(atb_speed::Model::Node& node, torch::Tensor&

xllm/core/layers/npu/atb_linear.cpp

Lines changed: 6 additions & 8 deletions

@@ -23,11 +23,12 @@ limitations under the License.
 #include "xllm_kernels/operations/fusion/utils.h"
 
 namespace xllm::hf {
-std::shared_ptr<AtbLinearImpl> create_atb_linear_layer(const Context& context) {
+std::shared_ptr<AtbLinearImpl> create_atb_linear_layer(
+    const ModelContext& context) {
   return std::make_shared<AtbLinearImpl>(context);
 }
 
-AtbLinearImpl::AtbLinearImpl(const Context& context) : ATBBase(context) {
+AtbLinearImpl::AtbLinearImpl(const ModelContext& context) : ATBBase(context) {
   at_weight_tensors_.resize(1);
   atb_weight_tensors_.resize(1);
   at_out_tensors_.resize(1);
@@ -103,14 +104,11 @@ int64_t AtbLinearImpl::init_node(atb_speed::Model::Node& node) {
   return atb::NO_ERROR;
 }
 
-torch::Tensor AtbLinearImpl::forward(const torch::Tensor& input,
-                                     atb::Context* context,
-                                     AtbWorkspace& workspace,
-                                     int nodeId) {
+torch::Tensor AtbLinearImpl::forward(const torch::Tensor& input, int nodeId) {
   atb::Status st;
 
   build_node_variant_pack(linear_node_, input);
-  st = execute_node(linear_node_, context, workspace, nodeId);
+  st = execute_node(linear_node_, nodeId);
   LOG_IF(FATAL, st != 0) << model_name_
                          << "infer shape fail, error code: " << st;
 
@@ -156,7 +154,7 @@ void AtbLinearImpl::build_node_variant_pack(atb_speed::Model::Node& node,
       atb_speed::Utils::AtTensor2Tensor(at_out_tensors_.at(0));
 }
 
-AtbLinear::AtbLinear(const Context& context)
+AtbLinear::AtbLinear(const ModelContext& context)
     : ModuleHolder(create_atb_linear_layer(context)) {}
 
 }  // namespace xllm::hf

xllm/core/layers/npu/atb_linear.h

Lines changed: 6 additions & 8 deletions

@@ -29,9 +29,9 @@ limitations under the License.
 
 #include "atb/atb_infer.h"
 #include "atb_base.h"
-#include "framework/context.h"
 #include "framework/model/model_input_params.h"
+#include "framework/model_context.h"
 #include "framework/state_dict/state_dict.h"
 #include "nlohmann/json.hpp"
 #include "pytorch/adapter/utils/utils.h"
@@ -47,7 +47,7 @@ class AtbLinearImpl : public torch::nn::Module, public ATBBase {
   using RunTaskFunc =
       std::function<void(const std::string& taskName, Task task)>;
 
-  explicit AtbLinearImpl(const Context& context);
+  explicit AtbLinearImpl(const ModelContext& context);
 
   ~AtbLinearImpl() {};
 
@@ -59,10 +59,7 @@ class AtbLinearImpl : public torch::nn::Module, public ATBBase {
 
   int64_t init_layer();
 
-  torch::Tensor forward(const torch::Tensor& input,
-                        atb::Context* context,
-                        AtbWorkspace& workspace,
-                        int nodeId);
+  torch::Tensor forward(const torch::Tensor& input, int nodeId);
 
   void build_node_variant_pack(atb_speed::Model::Node& node,
                                const torch::Tensor& input);
@@ -83,9 +80,10 @@ class AtbLinear : public torch::nn::ModuleHolder<AtbLinearImpl> {
   using torch::nn::ModuleHolder<AtbLinearImpl>::ModuleHolder;
   using Impl __attribute__((__unused__)) = AtbLinearImpl;
 
-  AtbLinear(const Context& context);
+  AtbLinear(const ModelContext& context);
 };
 
-std::shared_ptr<AtbLinearImpl> create_atb_linear_layer(const Context& context);
+std::shared_ptr<AtbLinearImpl> create_atb_linear_layer(
+    const ModelContext& context);
 
 }  // namespace xllm::hf
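
A minimal usage sketch for the updated linear layer (the helper name and nodeId value are illustrative; the constructor and forward() signatures come from the diff above):

#include "atb_linear.h"  // assumed include for this sketch

// Hypothetical call site: the layer is built from a ModelContext and invoked
// without passing an atb::Context or AtbWorkspace.
void linear_example(const xllm::ModelContext& context,
                    const torch::Tensor& input) {
  xllm::hf::AtbLinear linear(context);  // ModuleHolder over AtbLinearImpl
  torch::Tensor out = linear->forward(input, /*nodeId=*/0);
  (void)out;
}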
