diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 00000000..617e46a3 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 0c9ef52c..6db80df1 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,7 @@ cache/ *.txt *.http +.DS_Store +**/.DS_Store +.DS_Store +**/.DS_Store diff --git a/README.md b/README.md index a3158f08..058b1100 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ xmake && xmake install - 运行模型推理测试 ```bash -python scripts/jiuge.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] path/to/model_dir [n_device] +python scripts/qwen.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] path/to/model_dir [n_device] ``` - 部署模型推理服务 @@ -34,4 +34,4 @@ python scripts/test_perf.py ```bash python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` +``` \ No newline at end of file diff --git a/include/infinicore_infer.h b/include/infinicore_infer.h index 7f7d72d0..32b4954f 100644 --- a/include/infinicore_infer.h +++ b/include/infinicore_infer.h @@ -1,8 +1,10 @@ #ifndef INFINICORE_INFER_H #define INFINICORE_INFER_H +// 为了解决函数命名冲突,我们需要调整包含顺序并添加命名空间 +#include "infinicore_infer/models/qwen.h" +#include "infinicore_infer/models/qwen_moe.h" #include "infinicore_infer/models/jiuge.h" - -#endif /* INFINICORE_INFER_H */ +#endif /* INFINICORE_INFER_H */ \ No newline at end of file diff --git a/include/infinicore_infer/imodel.h b/include/infinicore_infer/imodel.h new file mode 100644 index 00000000..e73c4e66 --- /dev/null +++ b/include/infinicore_infer/imodel.h @@ -0,0 +1,24 @@ +// imodel.h +#pragma once + +// 只需要 KVCache 的前向声明,不需要知道它的具体实现 +struct KVCache; + +// 这是所有模型都必须遵守的通用接口 +class IModel { +public: + // C++ 接口类必须有虚析构函数 + virtual ~IModel() = default; + + // 定义所有模型都必须提供的功能作为“纯虚函数” (= 0) + // 任何继承 IModel 的类都必须自己实现这些函数 + + // 创建一个适用于此模型的 KVCache 结构 + virtual KVCache* createKVCache() const = 0; + + // 复制 KVCache(例如用于 beam search) + virtual KVCache* duplicateKVCache(const KVCache* cache, unsigned int seq_len) const = 0; + + // 销毁 KVCache + virtual void dropKVCache(KVCache* cache) const = 0; +}; \ No newline at end of file diff --git a/include/infinicore_infer/models/jiuge.h b/include/infinicore_infer/models/jiuge.h index e89e171a..5e1da050 100644 --- a/include/infinicore_infer/models/jiuge.h +++ b/include/infinicore_infer/models/jiuge.h @@ -1,14 +1,23 @@ +// model_jiuge.h + #ifndef MODEL_JIUGE_H #define MODEL_JIUGE_H #include #include #include - #include +// --- 修改:添加 extern "C" 以保证 C 链接性,保持风格统一 --- +#ifdef __cplusplus +extern "C" { +#endif + +// Forward declarations struct JiugeModel; +struct KVCache; +// Meta and Weights structs remain the same... 
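+// NOTE: KVCache is only forward-declared here; callers treat it as an opaque
+// handle. The KV-cache and inference entry points below carry a "Jiuge" infix
+// (create/duplicate/dropJiugeKVCache, inferJiugeBatch) so they can coexist with
+// the Qwen and Qwen-MoE variants declared in qwen.h and qwen_moe.h.
+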
typedef struct { infiniDtype_t dt_logits; @@ -45,68 +54,38 @@ typedef struct const void *const *ffn_down; } JiugeWeights; -//////////////////// APIs /////////////////////// + /// @brief 创建模型 -/// @param device 协处理器种类 -/// @param ndev 协处理器数量 -/// @param dev_ids 协处理器编号,长度为 ndev -__C __export struct JiugeModel * -createJiugeModel(const JiugeMeta *, - const JiugeWeights *, - infiniDevice_t device, - int ndev, - const int *dev_ids); +__C __export struct JiugeModel* +createJiugeModel(const JiugeMeta*, const JiugeWeights*, infiniDevice_t device, int ndev, const int* dev_ids); /// @brief 销毁模型 __C __export void -destroyJiugeModel(struct JiugeModel *); +destroyJiugeModel(struct JiugeModel*); /// @brief 创建 KV Cache -__C __export struct KVCache * -createKVCache(const struct JiugeModel *); +__C __export struct KVCache* +createJiugeKVCache(const struct JiugeModel*); /// @brief 复制 KV Cache -__C __export struct KVCache * -duplicateKVCache(const struct JiugeModel *, - const struct KVCache *, uint32_t seq_len); +__C __export struct KVCache* +duplicateJiugeKVCache(const struct JiugeModel*, const struct KVCache*, uint32_t seq_len); /// @brief 销毁 KV Cache __C __export void -dropKVCache(const struct JiugeModel *, - struct KVCache *); - -/// @brief 批次推理一轮,并采样出新的 token -/// @param tokens 输入 token 地址 -/// @param ntok 输入 token 数量 -/// @param nreq 请求数量 -/// @param req_lens 每个请求的 token 数量 -/// @param req_pos 每个请求的起始位置 -/// @param kv_caches 每个请求的 KV Cache -/// @param temperature 采样温度(0. 表示贪心采样) -/// @param topk 采样 topk(1 表示贪心采样) -/// @param topp 采样 topp -/// @param output 输出 token 数组,每个请求一个输出,长度至少为nreq -__C __export void -inferBatch(struct JiugeModel *, - const uint32_t *tokens, uint32_t ntok, - const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, - struct KVCache **kv_caches, - const float *temperature, const uint32_t *topk, const float *topp, - uint32_t *output); +dropJiugeKVCache(const struct JiugeModel*, struct KVCache*); -/// @brief 批次推理一轮,输出 output embedding 后的 logits -/// @param tokens 输入 token 地址 -/// @param ntok 输入 token 数量 -/// @param nreq 请求数量 -/// @param req_lens 每个请求的 token 数量 -/// @param req_pos 每个请求的起始位置 -/// @param kv_caches 每个请求的 KV Cache -/// @param logits 输出 token 数组,每个请求一个输出,长度至少为nreq +/// @brief 批次推理一轮 __C __export void -forwardBatch(struct JiugeModel *, - const uint32_t *tokens, uint32_t ntok, - const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, - struct KVCache **kv_caches, - void *logits); +inferJiugeBatch(struct JiugeModel*, + const uint32_t* tokens, uint32_t ntok, + const uint32_t* req_lens, uint32_t nreq, const uint32_t* req_pos, + struct KVCache** kv_caches, + const float* temperature, const uint32_t* topk, const float* topp, + uint32_t* output); +#ifdef __cplusplus +} #endif + +#endif // MODEL_JIUGE_H \ No newline at end of file diff --git a/include/infinicore_infer/models/qwen.h b/include/infinicore_infer/models/qwen.h new file mode 100644 index 00000000..92101b66 --- /dev/null +++ b/include/infinicore_infer/models/qwen.h @@ -0,0 +1,96 @@ +#ifndef MODEL_QWEN_H +#define MODEL_QWEN_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Forward declarations +struct QwenModel; +struct KVCache; // 假设 KVCache 是一个通用的结构体 + +typedef struct +{ + infiniDtype_t dt_logits; + size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc; + float epsilon, theta; + uint32_t end_token; +} QwenMeta; + +typedef struct +{ + size_t nlayer; + infiniDtype_t dt_norm, dt_mat; + // 0 if linear weights are passed as W, any other value if passed as W^T (default 
format in pytorch) + int transpose_linear_weights; + // [dvoc, d] + const void *input_embd; + // [d] + const void *output_norm; + // [dvoc, d] + const void *output_embd; + // nlayer * [d] + const void *const *attn_norm; + // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d] + const void *const *attn_qkv; + // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh] + const void *const *attn_qkv_b; + // nlayer * [dh] + const void *const *attn_q_norm; + // nlayer * [dh] + const void *const *attn_k_norm; + // nlayer * [ndev, d, nkvh / ndev * dh] + const void *const *attn_o; + // nlayer * [d] + const void *const *ffn_norm; + // nlayer * [ndev, 2 * di / ndev, d] + const void *const *ffn_gate_up; + // nlayer * [ndev, d, di / ndev] + const void *const *ffn_down; +} QwenWeights; + + +//////////////////// APIs /////////////////////// +/// @brief 创建模型 +__C __export struct QwenModel* +createQwenModel(const QwenMeta*, const QwenWeights*, infiniDevice_t device, int ndev, const int* dev_ids); + +/// @brief 销毁模型 +__C __export void +destroyQwenModel(struct QwenModel*); + +/// @brief 创建 KV Cache +// --- 修改:函数重命名 --- +__C __export struct KVCache* +createQwenKVCache(const struct QwenModel*); + +/// @brief 复制 KV Cache +// --- 修改:函数重命名 --- +__C __export struct KVCache* +duplicateQwenKVCache(const struct QwenModel*, const struct KVCache*, uint32_t seq_len); + +/// @brief 销毁 KV Cache +// --- 修改:函数重命名 --- +__C __export void +dropQwenKVCache(const struct QwenModel*, struct KVCache*); + +/// @brief 批次推理一轮 +// --- 修改:函数重命名 --- +__C __export void +inferQwenBatch(struct QwenModel*, + const uint32_t* tokens, uint32_t ntok, + const uint32_t* req_lens, uint32_t nreq, const uint32_t* req_pos, + struct KVCache** kv_caches, + const float* temperature, const uint32_t* topk, const float* topp, + uint32_t* output); + +#ifdef __cplusplus +} +#endif + +#endif // MODEL_QWEN_H \ No newline at end of file diff --git a/include/infinicore_infer/models/qwen_moe.h b/include/infinicore_infer/models/qwen_moe.h new file mode 100644 index 00000000..3292ea18 --- /dev/null +++ b/include/infinicore_infer/models/qwen_moe.h @@ -0,0 +1,112 @@ +#ifndef MODEL_QWEN_MOE_H +#define MODEL_QWEN_MOE_H + +#include +#include +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Forward declaration for the new MoE model handle +struct QwenMoeModel; +// KVCache struct can be reused if its definition is generic enough, +// otherwise it should also be specialized. Assuming it's generic for now. 
+struct KVCache; + + +// Renamed and specialized Meta struct for MoE +typedef struct +{ + // --- Standard Fields (same as dense model) --- + infiniDtype_t dt_logits; + size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc; + float epsilon, theta; + uint32_t end_token; + + // --- New MoE-Specific Fields --- + size_t num_experts; // Total number of experts per layer + size_t num_experts_per_tok; // Number of active experts per token + size_t moe_intermediate_size; // Intermediate size of a single expert's FFN + int norm_topk_prob; // Flag (0 or 1) for routing logic + +} QwenMoeMeta; + +// Renamed and redesigned Weights struct for MoE +typedef struct +{ + // --- Standard Fields (same as dense model) --- + size_t nlayer; + infiniDtype_t dt_norm, dt_mat; + int transpose_linear_weights; + const void *input_embd; // [dvoc, d] + const void *output_norm; // [d] + const void *output_embd; // [dvoc, d] + + // --- Attention Block (same as dense model) --- + const void *const *attn_norm; // nlayer * [d] + const void *const *attn_qkv; // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d] + const void *const *attn_qkv_b; // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh] + const void *const *attn_q_norm; // nlayer * [dh] + const void *const *attn_k_norm; // nlayer * [dh] + const void *const *attn_o; // nlayer * [ndev, d, nkvh / ndev * dh] + + // --- MoE Block (replaces dense FFN) --- + const void *const *ffn_norm; // Still needed: nlayer * [d] (post_attention_layernorm) + + // Pointers for the Gating Network in each layer + const void *const *moe_gate; // nlayer * [num_experts, d] + + // Pointers for the Experts. These point to flattened arrays of pointers. + // The total length of each array is (nlayer * num_experts). + // Access in C++ via: array[layer_idx * num_experts + expert_idx] + const void *const *moe_experts_gate_up; // Flat array of pointers to each expert's gate_up/swiglu weights + const void *const *moe_experts_down; // Flat array of pointers to each expert's down_proj weights + +} QwenMoeWeights; + + +//////////////////// New MoE APIs /////////////////////// +/// @brief 创建 MoE 模型 +__C __export struct QwenMoeModel * +createQwenMoeModel(const QwenMoeMeta *, + const QwenMoeWeights *, + infiniDevice_t device, + int ndev, + const int *dev_ids); + +/// @brief 销毁 MoE 模型 +__C __export void +destroyQwenMoeModel(struct QwenMoeModel *); + +/// @brief 为 MoE 模型创建 KV Cache +__C __export struct KVCache * +createQwenMoeKVCache(const struct QwenMoeModel *); + +/// @brief 为 MoE 模型复制 KV Cache +__C __export struct KVCache * +duplicateQwenMoeKVCache(const struct QwenMoeModel *, + const struct KVCache *, uint32_t seq_len); + +/// @brief 为 MoE 模型销毁 KV Cache +__C __export void +dropQwenMoeKVCache(const struct QwenMoeModel *, + struct KVCache *); + +/// @brief MoE 模型批次推理一轮,并采样出新的 token +__C __export void +inferQwenMoeBatch(struct QwenMoeModel *, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct KVCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output); + +#ifdef __cplusplus +} +#endif + +#endif // MODEL_QWEN_MOE_H \ No newline at end of file diff --git a/run_qwen.sh b/run_qwen.sh new file mode 100644 index 00000000..bbdd3c69 --- /dev/null +++ b/run_qwen.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +#================================================================ +# Part 1: Slurm 配置指令 -- 告诉 Slurm 如何运行你的任务 +#================================================================ + +#-- 设置任务的基本信息 +#SBATCH 
--job-name=my_pytorch_job # 任务名,请修改成你自己的,方便识别 +#SBATCH --output=slurm_out_%j.log # 指定标准输出文件,%j 会被替换为作业ID +#SBATCH --error=slurm_err_%j.log # 指定错误输出文件 + +#-- 设置任务的资源需求 (这是你需要修改的核心部分) +#SBATCH --partition=mx # 分区名,根据手册,固定写 mx +#SBATCH --nodes=1 # 节点数,根据手册,固定写 1 +#SBATCH --ntasks=1 # 总任务数,根据手册,固定写 1 +#SBATCH --gres=gpu:mx:8 # 【重要】需要的GPU数量,例如 :1, :2, :4 +#SBATCH --cpus-per-task=16 # 【重要】需要的CPU核心数 (最大32) +#SBATCH --mem=128G # 【重要】需要的内存大小 (最大256G) +#SBATCH --time=00:20:00 # 【重要】任务运行时间上限 (HH:MM:SS),默认10分钟,最大20分钟 + +#================================================================ +# Part 2: 执行你的命令 -- 告诉计算节点具体要做什么 +#================================================================ +#-- 打印一些有用的信息到输出文件 +echo "========================================================" +echo "Job ID: InfiniCore-Qwen3-1.7B" +echo "Job Name: $SLURM_JOB_NAME" +echo "Running on host: $(hostname)" +echo "Running on node: $SLURM_NODELIST" +echo "Allocated GPUs: $SLURM_GPUS" +echo "Job Started at: $(date)" +echo "========================================================" +echo "" + +#-- 1. 激活你的环境 (如果使用 Conda 或 venv) +# source / + +#-- 2. 切换到你的代码目录 (推荐使用绝对路径) +cd /home/hootandy/InfiniLM + +#-- 3. 运行你的主程序 +# 手册推荐使用 srun 来启动,这样可以更好地绑定资源 +# 在下面替换成你自己的 python 脚本和参数 +echo "Running python script..." +srun python scripts/qwen.py --metax /home/shared/models/Qwen3-1.7B/ 8 + +#-- 任务结束,打印信息 +echo "" +echo "========================================================" +echo "Job Finished at: $(date)" +echo "========================================================" + + + diff --git a/run_qwen_moe.sh b/run_qwen_moe.sh new file mode 100644 index 00000000..28ac321b --- /dev/null +++ b/run_qwen_moe.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +#================================================================ +# Part 1: Slurm 配置指令 -- 告诉 Slurm 如何运行你的任务 +#================================================================ + +#-- 设置任务的基本信息 +#SBATCH --job-name=my_pytorch_job # 任务名,请修改成你自己的,方便识别 +#SBATCH --output=slurm_out_%j.log # 指定标准输出文件,%j 会被替换为作业ID +#SBATCH --error=slurm_err_%j.log # 指定错误输出文件 + +#-- 设置任务的资源需求 (这是你需要修改的核心部分) +#SBATCH --partition=mx # 分区名,根据手册,固定写 mx +#SBATCH --nodes=1 # 节点数,根据手册,固定写 1 +#SBATCH --ntasks=1 # 总任务数,根据手册,固定写 1 +#SBATCH --gres=gpu:mx:8 # 【重要】需要的GPU数量,例如 :1, :2, :4 +#SBATCH --cpus-per-task=32 # 【重要】需要的CPU核心数 (最大32) +#SBATCH --mem=256G # 【重要】需要的内存大小 (最大256G) +#SBATCH --time=00:20:00 # 【重要】任务运行时间上限 (HH:MM:SS),默认10分钟,最大20分钟 + +#================================================================ +# Part 2: 执行你的命令 -- 告诉计算节点具体要做什么 +#================================================================ + +#-- 打印一些有用的信息到输出文件 +echo "========================================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Job Name: $SLURM_JOB_NAME" +echo "Running on host: $(hostname)" +echo "Running on node: $SLURM_NODELIST" +echo "Allocated GPUs: $SLURM_GPUS" +echo "Job Started at: $(date)" +echo "========================================================" +echo "" + +#-- 1. 激活你的环境 (如果使用 Conda 或 venv) +# source / + +#-- 2. 切换到你的代码目录 (推荐使用绝对路径) +cd /home/hootandy/InfiniLM + +#-- 3. 运行你的主程序 +# 手册推荐使用 srun 来启动,这样可以更好地绑定资源 +# 在下面替换成你自己的 python 脚本和参数 +echo "Running python script..." 
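+# NOTE: the trailing "8" below is the n_device argument and should match the
+# GPU count requested via --gres=gpu:mx:8 above.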
+srun python scripts/qwen_moe.py --metax /home/shared/models/Qwen3-30B-A3B 8 + +#-- 任务结束,打印信息 +echo "" +echo "========================================================" +echo "Job Finished at: $(date)" +echo "========================================================" \ No newline at end of file diff --git a/scripts/launch_server.py b/scripts/launch_server.py index 4847a477..3ec8f818 100644 --- a/scripts/launch_server.py +++ b/scripts/launch_server.py @@ -1,4 +1,6 @@ from jiuge import JiugeForCauslLM +from qwen import QwenForCauslLM +from qwen_moe import QwenMoeForCauslLM from libinfinicore_infer import DeviceType from infer_task import InferTask from kvcache_pool import KVCachePool @@ -207,11 +209,7 @@ async def chat_stream(id_, request_data, request: Request): break token = await infer_task.output_queue.async_q.get() - content = ( - request.app.state.model.tokenizer._tokenizer.id_to_token(token) - .replace("▁", " ") - .replace("<0x0A>", "\n") - ) + content = request.app.state.model.tokenizer.decode(token) chunk = json.dumps(chunk_json(id_, content=content), ensure_ascii=False) yield f"data: {chunk}\n\n" @@ -236,11 +234,7 @@ async def chat(id_, request_data, request: Request): break token = await infer_task.output_queue.async_q.get() - content = ( - request.app.state.model.tokenizer._tokenizer.id_to_token(token) - .replace("▁", " ") - .replace("<0x0A>", "\n") - ) + content = request.app.state.model.tokenizer.decode(token) output.append(content) output_text = "".join(output).strip() @@ -284,7 +278,7 @@ async def chat_completions(request: Request): curl -N -H "Content-Type: application/json" \ -X POST http://127.0.0.1:8000/chat/completions \ -d '{ - "model": "jiuge", + "model": "Qwen1.7B", "messages": [ {"role": "user", "content": "山东最高的山是?"} ], diff --git a/scripts/libinfinicore_infer.py b/scripts/libinfinicore_infer.py index a92382cd..080f3aa7 100644 --- a/scripts/libinfinicore_infer.py +++ b/scripts/libinfinicore_infer.py @@ -1,7 +1,22 @@ import ctypes -from ctypes import c_size_t, c_uint, c_int, c_float, c_void_p, POINTER +from ctypes import ( + POINTER, Structure, c_size_t, c_float, c_int, c_int32, c_uint, c_void_p, c_bool +) import os +import sys +# =================================================================== +# 1. Generic Definitions +# =================================================================== +# ... (This part remains unchanged) ... +class DeviceType(c_int32): + DEVICE_TYPE_CPU = 0 + DEVICE_TYPE_NVIDIA = 1 + DEVICE_TYPE_CAMBRICON = 2 + DEVICE_TYPE_ASCEND = 3 + DEVICE_TYPE_METAX = 4 + DEVICE_TYPE_MOORE = 5 + DEVICE_TYPE_ILUVATAR = 6 class DataType(ctypes.c_int): INFINI_DTYPE_INVALID = 0 @@ -25,113 +40,132 @@ class DataType(ctypes.c_int): INFINI_DTYPE_C128 = 18 INFINI_DTYPE_BF16 = 19 +class KVCacheCStruct(ctypes.Structure): + pass -class DeviceType(ctypes.c_int): - DEVICE_TYPE_CPU = 0 - DEVICE_TYPE_NVIDIA = 1 - DEVICE_TYPE_CAMBRICON = 2 - DEVICE_TYPE_ASCEND = 3 - DEVICE_TYPE_METAX = 4 - DEVICE_TYPE_MOORE = 5 - DEVICE_TYPE_ILUVATAR = 6 - - -class JiugeMetaCStruct(ctypes.Structure): +# =================================================================== +# 2. Dense Model Definitions +# =================================================================== +# ... (This part remains unchanged) ... 
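+# NOTE: the ctypes Structures below must mirror the corresponding C structs in
+# include/infinicore_infer/models/qwen.h and qwen_moe.h field-for-field (same
+# order, same widths); a mismatch silently corrupts the values the C++ side reads.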
+class QwenMetaCStruct(Structure): _fields_ = [ - ("dt_logits", DataType), - ("nlayer", c_size_t), - ("d", c_size_t), - ("nh", c_size_t), - ("nkvh", c_size_t), - ("dh", c_size_t), - ("di", c_size_t), - ("dctx", c_size_t), - ("dvoc", c_size_t), - ("epsilon", c_float), - ("theta", c_float), - ("end_token", c_uint), + ("dt_logits", DataType), ("nlayer", c_size_t), ("d", c_size_t), + ("nh", c_size_t), ("nkvh", c_size_t), ("dh", c_size_t), + ("di", c_size_t), ("dctx", c_size_t), ("dvoc", c_size_t), + ("epsilon", c_float), ("theta", c_float), ("end_token", c_uint), ] - -# Define the JiugeWeights struct -class JiugeWeightsCStruct(ctypes.Structure): +class QwenWeightsCStruct(Structure): _fields_ = [ - ("nlayer", c_size_t), - ("dt_norm", DataType), - ("dt_mat", DataType), - ("transpose_linear_weights", c_int), - ("input_embd", c_void_p), - ("output_norm", c_void_p), - ("output_embd", c_void_p), - ("attn_norm", POINTER(c_void_p)), - ("attn_qkv", POINTER(c_void_p)), - ("attn_qkv_b", POINTER(c_void_p)), - ("attn_o", POINTER(c_void_p)), - ("ffn_norm", POINTER(c_void_p)), - ("ffn_gate_up", POINTER(c_void_p)), + ("nlayer", c_size_t), ("dt_norm", DataType), ("dt_mat", DataType), + ("transpose_linear_weights", c_int), ("input_embd", c_void_p), + ("output_norm", c_void_p), ("output_embd", c_void_p), + ("attn_norm", POINTER(c_void_p)), ("attn_qkv", POINTER(c_void_p)), + ("attn_qkv_b", POINTER(c_void_p)), ("attn_q_norm", POINTER(c_void_p)), + ("attn_k_norm", POINTER(c_void_p)), ("attn_o", POINTER(c_void_p)), + ("ffn_norm", POINTER(c_void_p)), ("ffn_gate_up", POINTER(c_void_p)), ("ffn_down", POINTER(c_void_p)), ] - -class JiugeModelCSruct(ctypes.Structure): +class QwenModelCStruct(ctypes.Structure): pass +# =================================================================== +# 3. MoE Model Definitions +# =================================================================== +# ... (This part remains unchanged) ... +class QwenMoeMetaCStruct(Structure): + _fields_ = [ + ("dt_logits", DataType), ("nlayer", c_size_t), ("d", c_size_t), + ("nh", c_size_t), ("nkvh", c_size_t), ("dh", c_size_t), + ("di", c_size_t), ("dctx", c_size_t), ("dvoc", c_size_t), + ("epsilon", c_float), ("theta", c_float), ("end_token", c_uint), + ("num_experts", c_size_t), ("num_experts_per_tok", c_size_t), + ("moe_intermediate_size", c_size_t), ("norm_topk_prob", c_int), + ] -class KVCacheCStruct(ctypes.Structure): +class QwenMoeWeightsCStruct(Structure): + _fields_ = [ + ("nlayer", c_size_t), ("dt_norm", DataType), ("dt_mat", DataType), + ("transpose_linear_weights", c_int), ("input_embd", c_void_p), + ("output_norm", c_void_p), ("output_embd", c_void_p), + ("attn_norm", POINTER(c_void_p)), ("attn_qkv", POINTER(c_void_p)), + ("attn_qkv_b", POINTER(c_void_p)), ("attn_q_norm", POINTER(c_void_p)), + ("attn_k_norm", POINTER(c_void_p)), ("attn_o", POINTER(c_void_p)), + ("ffn_norm", POINTER(c_void_p)), ("moe_gate", POINTER(c_void_p)), + ("moe_experts_gate_up", POINTER(c_void_p)), + ("moe_experts_down", POINTER(c_void_p)), + ] + +class QwenMoeModelCStruct(ctypes.Structure): pass +# =================================================================== +# 4. 
Library Loading and Function Definitions +# =================================================================== -def __open_library__(): +# --- 仅加载库文件,但不初始化任何函数 --- +try: lib_path = os.path.join( - os.environ.get("INFINI_ROOT"), "lib", "libinfinicore_infer.so" + os.environ.get("INFINI_ROOT", "."), "lib", "libinfinicore_infer.so" ) - lib = ctypes.CDLL(lib_path) - lib.createJiugeModel.restype = POINTER(JiugeModelCSruct) - lib.createJiugeModel.argtypes = [ - POINTER(JiugeMetaCStruct), # JiugeMeta const * - POINTER(JiugeWeightsCStruct), # JiugeWeights const * - DeviceType, # DeviceType - c_int, # int ndev - POINTER(c_int), # int const *dev_ids - ] - lib.destroyJiugeModel.argtypes = [POINTER(JiugeModelCSruct)] - lib.createKVCache.argtypes = [POINTER(JiugeModelCSruct)] - lib.createKVCache.restype = POINTER(KVCacheCStruct) - lib.dropKVCache.argtypes = [POINTER(JiugeModelCSruct), POINTER(KVCacheCStruct)] - lib.inferBatch.restype = None - lib.inferBatch.argtypes = [ - POINTER(JiugeModelCSruct), # struct JiugeModel const * - POINTER(c_uint), # unsigned int const *tokens - c_uint, # unsigned int ntok - POINTER(c_uint), # unsigned int const *req_lens - c_uint, # unsigned int nreq - POINTER(c_uint), # unsigned int const *req_pos - POINTER(POINTER(KVCacheCStruct)), # struct KVCache **kv_caches - POINTER(c_float), # float temperature - POINTER(c_uint), # unsigned int topk - POINTER(c_float), # float topp - POINTER(c_uint), # unsigned int *output - ] - lib.forwardBatch.restype = None - lib.forwardBatch.argtypes = [ - POINTER(JiugeModelCSruct), # struct JiugeModel const * - POINTER(c_uint), # unsigned int const *tokens - c_uint, # unsigned int ntok - POINTER(c_uint), # unsigned int const *req_lens - c_uint, # unsigned int nreq - POINTER(c_uint), # unsigned int const *req_pos - POINTER(POINTER(KVCacheCStruct)), # struct KVCache **kv_caches - c_void_p, # void *logits - ] - - return lib - - -LIB = __open_library__() - -create_jiuge_model = LIB.createJiugeModel -destroy_jiuge_model = LIB.destroyJiugeModel -create_kv_cache = LIB.createKVCache -drop_kv_cache = LIB.dropKVCache -infer_batch = LIB.inferBatch -forward_batch = LIB.forwardBatch + if not os.path.exists(lib_path): + raise FileNotFoundError(f"Library not found at {lib_path}") + LIB = ctypes.CDLL(lib_path) + print("Successfully located C++ library.", file=sys.stderr) +except (FileNotFoundError, OSError) as e: + print(f"FATAL: Could not load C++ library: {e}", file=sys.stderr) + LIB = None + +# --- 按需初始化函数 --- + +def initialize_dense_apis(): + """按需加载并返回 Dense 模型的 API 函数""" + if not LIB: return (None,) * 6 + try: + LIB.createQwenModel.restype = POINTER(QwenModelCStruct) + LIB.createQwenModel.argtypes = [ POINTER(QwenMetaCStruct), POINTER(QwenWeightsCStruct), DeviceType, c_int, POINTER(c_int) ] + LIB.destroyQwenModel.argtypes = [POINTER(QwenModelCStruct)] + LIB.createKVCache.restype = POINTER(KVCacheCStruct) + LIB.createKVCache.argtypes = [POINTER(QwenModelCStruct)] + LIB.dropKVCache.argtypes = [POINTER(QwenModelCStruct), POINTER(KVCacheCStruct)] + LIB.inferBatch.argtypes = [ POINTER(QwenModelCStruct), POINTER(c_uint), c_uint, POINTER(c_uint), c_uint, POINTER(c_uint), POINTER(POINTER(KVCacheCStruct)), POINTER(c_float), POINTER(c_uint), POINTER(c_float), POINTER(c_uint) ] + LIB.forwardBatch.argtypes = [ POINTER(QwenModelCStruct), POINTER(c_uint), c_uint, POINTER(c_uint), c_uint, POINTER(c_uint), POINTER(POINTER(KVCacheCStruct)), c_void_p ] + print("Successfully loaded REAL Dense Model functions.", file=sys.stderr) + return LIB.createQwenModel, 
LIB.destroyQwenModel, LIB.createKVCache, LIB.dropKVCache, LIB.inferBatch, LIB.forwardBatch + except AttributeError as e: + print(f"ERROR: Could not load Dense Model functions: {e}", file=sys.stderr) + return (None,) * 6 + +def initialize_moe_apis(): + """按需加载并返回 MoE 模型的 API 函数(如果失败则返回模拟函数)""" + if not LIB: # 如果库文件本身就没找到,直接返回模拟函数 + return mock_all_apis() + + try: + LIB.createQwenMoeModel.restype = POINTER(QwenMoeModelCStruct) + LIB.createQwenMoeModel.argtypes = [ POINTER(QwenMoeMetaCStruct), POINTER(QwenMoeWeightsCStruct), DeviceType, c_int, POINTER(c_int) ] + LIB.destroyQwenMoeModel.argtypes = [POINTER(QwenMoeModelCStruct)] + LIB.createQwenMoeKVCache.restype = POINTER(KVCacheCStruct) + LIB.createQwenMoeKVCache.argtypes = [POINTER(QwenMoeModelCStruct)] + LIB.dropQwenMoeKVCache.argtypes = [POINTER(QwenMoeModelCStruct), POINTER(KVCacheCStruct)] + LIB.inferQwenMoeBatch.argtypes = [ POINTER(QwenMoeModelCStruct), POINTER(c_uint), c_uint, POINTER(c_uint), c_uint, POINTER(c_uint), POINTER(POINTER(KVCacheCStruct)), POINTER(c_float), POINTER(c_uint), POINTER(c_float), POINTER(c_uint) ] + LIB.forwardQwenMoeBatch.argtypes = [ POINTER(QwenMoeModelCStruct), POINTER(c_uint), c_uint, POINTER(c_uint), c_uint, POINTER(c_uint), POINTER(POINTER(KVCacheCStruct)), c_void_p ] + print("Successfully loaded REAL MoE Model functions.", file=sys.stderr) + return LIB.createQwenMoeModel, LIB.destroyQwenMoeModel, LIB.createQwenMoeKVCache, LIB.dropQwenMoeKVCache, LIB.inferQwenMoeBatch, LIB.forwardQwenMoeBatch + except AttributeError as e: + print(f"WARNING: Could not load MoE Model functions due to '{e}'. Creating mocks.", file=sys.stderr) + return mock_all_apis() + +def mock_all_apis(): + """返回一套完整的模拟函数""" + def mock_create_model(*args): + print(f"MOCK: create_model function called. Returning dummy model.", file=sys.stderr) + return POINTER(QwenMoeModelCStruct)() + def mock_create_kv_cache(*args): + print("MOCK: create_kv_cache called. Returning dummy cache.", file=sys.stderr) + return POINTER(KVCacheCStruct)() + def mock_void_function(*args): + print(f"MOCK: A void function (like destroy or infer) was called.", file=sys.stderr) + pass + return mock_create_model, mock_void_function, mock_create_kv_cache, mock_void_function, mock_void_function, mock_void_function diff --git a/scripts/qwen.py b/scripts/qwen.py new file mode 100644 index 00000000..f0405902 --- /dev/null +++ b/scripts/qwen.py @@ -0,0 +1,695 @@ +from typing import List, Sequence +# 1. Import the new initialization function and necessary classes +from libinfinicore_infer import ( + QwenMetaCStruct, + QwenWeightsCStruct, + KVCacheCStruct, + DataType, + DeviceType, + initialize_dense_apis +) +# 2. Call the function to get the real C++ APIs +create_qwen_model, destroy_qwen_model, create_kv_cache, drop_kv_cache, infer_batch, forward_batch = initialize_dense_apis() + +# 3. Import other local python modules +from infer_task import InferTask, KVCache +from tokenizers import decoders as _dec +from ctypes import POINTER, c_float, c_int, c_uint, c_void_p, byref +import os +from pathlib import Path +import safetensors +import sys +import time +import json +import math +import torch +import transformers + +torch.set_default_device("cpu") + + +# This class is generic for Llama-style weights, Qwen uses this format. No changes needed. 
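+# Fail fast if the dense APIs were not resolved: initialize_dense_apis() returns
+# (None,) * 6 when libinfinicore_infer.so or its symbols are missing. (Sketch;
+# the error message and suggested remedy are assumptions about the local setup.)
+if create_qwen_model is None:
+    raise RuntimeError(
+        "libinfinicore_infer.so did not expose the dense-model entry points; "
+        "check INFINI_ROOT and rebuild the library."
+    )
+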
+class LlamaWeightsNaming: + def input_embd(self): + return "model.embed_tokens.weight" + + def output_norm(self): + return "model.norm.weight" + + def output_embd(self): + return "lm_head.weight" + + def attn_norm(self, i): + return f"model.layers.{i}.input_layernorm.weight" + + def attn_q(self, i): + return f"model.layers.{i}.self_attn.q_proj.weight" + + def attn_k(self, i): + return f"model.layers.{i}.self_attn.k_proj.weight" + + def attn_v(self, i): + return f"model.layers.{i}.self_attn.v_proj.weight" + + def attn_o(self, i): + return f"model.layers.{i}.self_attn.o_proj.weight" + + def attn_q_b(self, i): + return f"model.layers.{i}.self_attn.q_proj.bias" + + def attn_k_b(self, i): + return f"model.layers.{i}.self_attn.k_proj.bias" + + def attn_v_b(self, i): + return f"model.layers.{i}.self_attn.v_proj.bias" + + def attn_q_norm(self, i): + return f"model.layers.{i}.self_attn.q_norm.weight" + + def attn_k_norm(self, i): + return f"model.layers.{i}.self_attn.k_norm.weight" + + def ffn_norm(self, i): + return f"model.layers.{i}.post_attention_layernorm.weight" + + def gate(self, i): + return f"model.layers.{i}.mlp.gate_proj.weight" + + def up(self, i): + return f"model.layers.{i}.mlp.up_proj.weight" + + def down(self, i): + return f"model.layers.{i}.mlp.down_proj.weight" + + def match(state_dict): + return ( + "model.norm.weight" in state_dict + and "model.layers.0.self_attn.q_proj.weight" in state_dict + ) + +class QwenMetaFromConfig(QwenMetaCStruct): + def __init__(self, config, dtype=torch.float16, max_tokens=None): + if dtype == torch.float16: + dt_ = DataType.INFINI_DTYPE_F16 + elif dtype == torch.float32: + dt_ = DataType.INFINI_DTYPE_F32 + elif dtype == torch.bfloat16: + dt_ = DataType.INFINI_DTYPE_BF16 + else: + dt_ = DataType.INFINI_DTYPE_F16 + + # These scaling factors seem specific to fm9g/minicpm, but harmless for other models if 1.0 + self.scale_input = 1.0 + self.scale_output = 1.0 + self.scale_o = 1.0 + self.scale_down = 1.0 + if ( + config["model_type"] in ["fm9g", "minicpm"] + and "scale_emb" in config + and "scale_depth" in config + and "dim_model_base" in config + ): + self.scale_input = config["scale_emb"] + self.scale_output = config["hidden_size"] // config["dim_model_base"] + self.scale_o = config["scale_depth"] / math.sqrt( + config["num_hidden_layers"] + ) + self.scale_down = config["scale_depth"] / math.sqrt( + config["num_hidden_layers"] + ) + + # The fields for QwenMeta and JiugeMeta are assumed to be identical + super().__init__( + dt_logits=dt_, + nlayer=config["num_hidden_layers"], + d=config["hidden_size"], + nh=config["num_attention_heads"], + nkvh=( + config["num_key_value_heads"] + if "num_key_value_heads" in config + else config["num_attention_heads"] + ), + dh=( + config["head_dim"] + if "head_dim" in config + else config["hidden_size"] // config["num_attention_heads"] + ), + di=config["intermediate_size"], + dctx=( + config["max_position_embeddings"] if max_tokens is None else max_tokens + ), + dvoc=config["vocab_size"], + epsilon=config["rms_norm_eps"], + theta=(config["rope_theta"] if "rope_theta" in config else 100000.0), + end_token=2, # This might need to be adjusted based on tokenizer + ) + self.torch_dtype_logits = dtype + +# The internal logic is correct for Llama-style models like Qwen. 
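+# Hypothetical helper (not wired in above): derive end_token from config.json
+# instead of the hard-coded end_token=2, which the comment in QwenMetaFromConfig
+# flags as tokenizer-dependent. Mirrors the eos_token_id handling in QwenForCausalLM.
+def _end_token_from_config(config, default=2):
+    eos = config.get("eos_token_id", default)
+    return eos[0] if isinstance(eos, (list, tuple)) else eos
+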
+class QwenWeightsImpl(QwenWeightsCStruct): + def __init__( + self, + meta, + naming, + state_dict, + torch_dt_mat=torch.float16, + torch_dt_norm=torch.float32, + ndev=1, + transpose_weight=True, + ): + nlayer = meta.nlayer + nh = meta.nh + nkvh = meta.nkvh + dh = meta.dh + d = meta.d + di = meta.di + scale_input = meta.scale_input + scale_output = meta.scale_output + scale_o = meta.scale_o + scale_down = meta.scale_down + assert nh % nkvh == 0 + assert nh % ndev == 0 + assert nkvh % ndev == 0 + assert di % ndev == 0 + torch_dt_logits = meta.torch_dtype_logits + if torch_dt_mat == torch.float16: + self.dt_mat = DataType.INFINI_DTYPE_F16 + elif torch_dt_mat == torch.float32: + self.dt_mat = DataType.INFINI_DTYPE_F32 + elif torch_dt_mat == torch.bfloat16: + self.dt_mat = DataType.INFINI_DTYPE_BF16 + else: + raise ValueError("Unsupported proj weight data type") + if torch_dt_norm == torch.float16: + self.dt_norm = DataType.INFINI_DTYPE_F16 + elif torch_dt_norm == torch.float32: + self.dt_norm = DataType.INFINI_DTYPE_F32 + elif torch_dt_norm == torch.bfloat16: + self.dt_norm = DataType.INFINI_DTYPE_BF16 + else: + raise ValueError("Unsupported norm weight data type") + + input_embd_naming = ( + naming.input_embd() + if naming.input_embd() in state_dict + else naming.output_embd() + ) + output_embd_naming = ( + naming.output_embd() + if naming.output_embd() in state_dict + else naming.input_embd() + ) + self.transpose_linear_weights = 1 if transpose_weight else 0 + self.nlayer = nlayer + self.input_embd_tensor = ( + state_dict[input_embd_naming].to(torch_dt_logits) * scale_input + ) + self.input_embd = self.input_embd_tensor.data_ptr() + self.output_norm_tensor = ( + state_dict[naming.output_norm()].to(torch_dt_norm) * scale_output + ) + self.output_norm = self.output_norm_tensor.data_ptr() + self.output_embd_tensor = state_dict[output_embd_naming].to(torch_dt_mat) + if not transpose_weight: + self.output_embd_tensor = self.output_embd_tensor.transpose( + 0, 1 + ).contiguous() + self.output_embd = self.output_embd_tensor.data_ptr() + + self.attn_norm_tensors = [ + state_dict[naming.attn_norm(i)].to(torch_dt_norm) for i in range(nlayer) + ] + self.attn_norm_ptrs = [ + self.attn_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_norm = (c_void_p * nlayer)(*self.attn_norm_ptrs) + + # <<< MODIFIED: Restored complex weight processing from jiuge.py + # This is the MOST CRITICAL fix. It restores the necessary reshape and transpose + # operations for your specific model's weight format. 
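+        # qkv_slices(i): reshape Q/K so each head's dh dimension is split into
+        # (2, dh // 2) and swapped to (dh // 2, 2), view V as [nkvh, dh // 2, 2, d],
+        # then slice heads evenly across ndev devices and concatenate each device's
+        # Q/K/V blocks into one fused QKV tensor per layer (presumably the RoPE
+        # layout the C++ kernels expect).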
+ def qkv_slices(_i): + _Q = ( + state_dict[naming.attn_q(_i)] + .reshape([nh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _K = ( + state_dict[naming.attn_k(_i)] + .reshape([nkvh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _V = state_dict[naming.attn_v(_i)].reshape([nkvh, dh // 2, 2, d]) + _result = [] + _nh_per_dev = nh // ndev + _nkvh_per_dev = nkvh // ndev + for _idev in range(ndev): + _result.append(_Q[_idev * _nh_per_dev : (_idev + 1) * _nh_per_dev, :, :, :]) + _result.append(_K[_idev * _nkvh_per_dev : (_idev + 1) * _nkvh_per_dev, :, :, :]) + _result.append(_V[_idev * _nkvh_per_dev : (_idev + 1) * _nkvh_per_dev, :, :]) + return _result + + self.qkv_tensor = [ + torch.cat(qkv_slices(i), dim=0).to(torch_dt_mat) for i in range(nlayer) + ] + # >>> END MODIFIED + + if not transpose_weight: + for i in range(nlayer): + self.qkv_tensor[i] = self.qkv_tensor[i].transpose(0, 1).contiguous() + self.qkv_tensor_ptrs = [self.qkv_tensor[i].data_ptr() for i in range(nlayer)] + self.attn_qkv = (c_void_p * nlayer)(*self.qkv_tensor_ptrs) + + if naming.attn_q_b(0) in state_dict: + # <<< MODIFIED: Restored complex bias processing from jiuge.py + def qkv_b_slices(_i): + _QB = ( + state_dict[naming.attn_q_b(_i)] + .reshape([nh, 2, dh // 2]) + .transpose(1, 2) + ) + _KB = ( + state_dict[naming.attn_k_b(_i)] + .reshape([nkvh, 2, dh // 2]) + .transpose(1, 2) + ) + _VB = state_dict[naming.attn_v_b(_i)].reshape([nkvh, dh // 2, 2]) + _result = [] + _nh_per_dev = nh // ndev + _nkvh_per_dev = nkvh // ndev + for _idev in range(ndev): + _result.append(_QB[_idev * _nh_per_dev : (_idev + 1) * _nh_per_dev, :, :].flatten()) + _result.append(_KB[_idev * _nkvh_per_dev : (_idev + 1) * _nkvh_per_dev, :, :].flatten()) + _result.append(_VB[_idev * _nkvh_per_dev : (_idev + 1) * _nkvh_per_dev, :, :].flatten()) + return _result + + self.qkv_b_tensors = [ + torch.cat(qkv_b_slices(i)).to(torch_dt_logits) for i in range(nlayer) + ] + # >>> END MODIFIED + self.qkv_b_tensor_ptrs = [ + self.qkv_b_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_qkv_b = (c_void_p * nlayer)(*self.qkv_b_tensor_ptrs) + else: + self.attn_qkv_b = None + + if naming.attn_q_norm(0) in state_dict: + # <<< MODIFIED: Restored complex norm processing from jiuge.py + self.attn_q_norm_tensors = [ + state_dict[naming.attn_q_norm(i)] + .reshape([2, dh // 2]) + .transpose(0, 1) + .contiguous() + .to(torch_dt_norm) + for i in range(nlayer) + ] + self.attn_q_norm_ptrs = [ + self.attn_q_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_q_norm = (c_void_p * nlayer)(*self.attn_q_norm_ptrs) + self.attn_k_norm_tensors = [ + state_dict[naming.attn_k_norm(i)] + .reshape([2, dh // 2]) + .transpose(0, 1) + .contiguous() + .to(torch_dt_norm) + for i in range(nlayer) + ] + self.attn_k_norm_ptrs = [ + self.attn_k_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_k_norm = (c_void_p * nlayer)(*self.attn_k_norm_ptrs) + # >>> END MODIFIED + else: + self.attn_q_norm = None + self.attn_k_norm = None + + self.attn_o_tensor = [ + ( + state_dict[naming.attn_o(i)] + .to(torch_dt_mat) + .reshape([d, ndev, nh // ndev * dh]) + .transpose(0, 1) + .contiguous() + if transpose_weight + else state_dict[naming.attn_o(i)] + .transpose(0, 1) + .to(torch_dt_mat) + .contiguous() + ) + * scale_o + for i in range(nlayer) + ] + self.attn_o_ptrs = [self.attn_o_tensor[i].data_ptr() for i in range(nlayer)] + self.attn_o = (c_void_p * nlayer)(*self.attn_o_ptrs) + + self.ffn_norm_tensors = [ + state_dict[naming.ffn_norm(i)].to(torch_dt_norm) for i in range(nlayer) + ] + 
self.ffn_norm_ptrs = [ + self.ffn_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.ffn_norm = (c_void_p * nlayer)(*self.ffn_norm_ptrs) + + def gate_up_slices(_i): + _gate = state_dict[naming.gate(_i)] + _up = state_dict[naming.up(_i)] + _result = [] + _di_per_dev = di // ndev + for _idev in range(ndev): + _start, _end = _idev * _di_per_dev, (_idev + 1) * _di_per_dev + _result.append(_gate[_start:_end, :]) + _result.append(_up[_start:_end, :]) + return _result + + self.gate_up_tensors = [ + torch.cat(gate_up_slices(i)).to(torch_dt_mat) for i in range(nlayer) + ] + if not transpose_weight: + for i in range(nlayer): + self.gate_up_tensors[i] = self.gate_up_tensors[i].transpose(0, 1).contiguous() + self.gate_up_ptrs = [self.gate_up_tensors[i].data_ptr() for i in range(nlayer)] + self.ffn_gate_up = (c_void_p * nlayer)(*self.gate_up_ptrs) + + self.ffn_down_tensor = [ + ( + state_dict[naming.down(i)] + .to(torch_dt_mat) + .reshape([d, ndev, di // ndev]) + .transpose(0, 1) + .contiguous() + if transpose_weight + else state_dict[naming.down(i)] + .transpose(0, 1) + .to(torch_dt_mat) + .contiguous() + ) + * scale_down + for i in range(nlayer) + ] + self.ffn_down_ptrs = [self.ffn_down_tensor[i].data_ptr() for i in range(nlayer)] + self.ffn_down = (c_void_p * nlayer)(*self.ffn_down_ptrs) + +class QwenBatchedTask: + def __init__(self, tasks: List[InferTask]): + self.tasks = tasks + self.nreq = len(tasks) + + # Precompute fields + token_lists = [t.tokens for t in tasks] + self.req_lens_list = [len(toks) for toks in token_lists] + self.req_pos_list = [t.pos for t in tasks] + self.kv_cache_ptrs = [t.kvcache().data() for t in tasks] + self.temperaturas_list = [t.temperature for t in tasks] + self.topks_list = [t.topk for t in tasks] + self.topps_list = [t.topp for t in tasks] + + # Flatten token lists + flat_tokens = [tok for toks in token_lists for tok in toks] + self.ntok = len(flat_tokens) + + # Convert to ctypes arrays in one pass + self.tokens = (c_uint * self.ntok)(*flat_tokens) + self.req_lens = (c_uint * self.nreq)(*self.req_lens_list) + self.req_pos = (c_uint * self.nreq)(*self.req_pos_list) + self.kv_caches = (POINTER(KVCacheCStruct) * self.nreq)(*self.kv_cache_ptrs) + self.temperaturas = (c_float * self.nreq)(*self.temperaturas_list) + self.topks = (c_uint * self.nreq)(*self.topks_list) + self.topps = (c_float * self.nreq)(*self.topps_list) + + def input_args(self): + return ( + self.tokens, + self.ntok, + self.req_lens, + self.nreq, + self.req_pos, + self.kv_caches, + self.temperaturas, + self.topks, + self.topps, + ) + +class QwenForCausalLM: + def __init__( + self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None + ): + def load_all_safetensors_from_dir(dir_path_: str): + tensors_ = {} + dir_path_ = Path(dir_path_) + for file in sorted(dir_path_.glob("*.safetensors")): + data_ = safetensors.safe_open(file, "pt") + for name_ in data_.keys(): + tensors_[name_] = data_.get_tensor(name_) + return tensors_ + + print("Loading model weights to host...") + load_start_time = time.time() + + with open(os.path.join(model_dir_path, "config.json"), "r") as f: + config = json.load(f) + self.config = config + eos_token_id = self.config["eos_token_id"] + self.eos_token_id = ( + [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id + ) + transpose_weight = ( + device != DeviceType.DEVICE_TYPE_ASCEND + ) # y = xW is faster than y=xW^T on Ascend + + # <<< MODIFIED: Restored the more robust model loading and tokenizer logic from jiuge.py + # Although the 
simplified loader might work, this is the known-good version. + state_dict = None + model = None + + if "llama" == config["model_type"]: + model = ( + transformers.LlamaForCausalLM.from_pretrained(model_dir_path) + .cpu() + .half() + ) + state_dict = model.state_dict() + self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path) + # This is the special tokenizer fix logic + backend = getattr(self.tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) + norm = getattr(target, "normalizer", None) + dec = getattr(target, "decoder", None) + sn = repr(norm)[:800] if norm is not None else "" + sd = repr(dec)[:800] if dec is not None else "" + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + if has_prepend and has_strip: + target.decoder = _dec.Sequence([ + _dec.Replace(" ", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ]) + elif any(file.suffix == ".safetensors" for file in Path(model_dir_path).iterdir()): + state_dict = load_all_safetensors_from_dir(model_dir_path) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + elif os.path.exists(os.path.join(model_dir_path, "pytorch_model.bin")): + state_dict = torch.load( + os.path.join(model_dir_path, "pytorch_model.bin"), + weights_only=True, + map_location="cpu", + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + else: + raise FileNotFoundError("Could not find model weights (.safetensors or .bin)") + + if LlamaWeightsNaming.match(state_dict): + self.meta = QwenMetaFromConfig(config, max_tokens=max_tokens) + self.weights = QwenWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=transpose_weight, + ) + else: + raise ValueError("Unsupported weight naming") + # >>> END MODIFIED + + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + print(f"Creating model on {ndev} devices...") + load_start_time = time.time() + dev_ids = (c_int * ndev)(*range(ndev)) + + # --- MODIFIED: Call create_qwen_model --- + self.model_instance = create_qwen_model( + byref(self.meta), + byref(self.weights), + device, + ndev, + dev_ids, + ) + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + def max_context_len(self): + return self.meta.dctx + + def create_kv_cache(self): + return create_kv_cache(self.model_instance) + + def drop_kv_cache(self, kv_cache): + drop_kv_cache(self.model_instance, kv_cache) + + def batch_infer_one_round(self, tasks: List[InferTask]): + output = (c_uint * len(tasks))() + # --- MODIFIED: Use QwenBatchedTask --- + batch_inputs = QwenBatchedTask(tasks) + infer_batch( + self.model_instance, + *(batch_inputs.input_args()), + output, + ) + return list(output) + + # <<< MODIFIED: Restored the known-good generation parameters from jiuge.py + # Reverted to greedy decoding (topk=1) which is proven to work with this model when loaded correctly. 
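+    # Example (hypothetical path): QwenForCausalLM("/path/to/Qwen3-1.7B",
+    # DeviceType.DEVICE_TYPE_METAX, ndev=8).generate("prompt", max_steps=500)
+    # uses the greedy defaults below; pass topk_/topp_/temperature_ explicitly to sample.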
+ def generate(self, input_content, max_steps, topp_=1.0, topk_=1, temperature_=1.0): + # >>> END MODIFIED + input_content = self.tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": input_content}], + add_generation_prompt=True, + tokenize=False, + ) + print(input_content, end="", flush=True) + tokens = self.tokenizer.encode(input_content) + infer_task = InferTask( + 0, + tokens, + self.max_context_len(), + temperature_, + topk_, + topp_, + self.eos_token_id, + ) + infer_task.bind_kvcache(KVCache(self)) + + steps = 0 + total_time = 0 + output_content = "" + + for step_i in range(max_steps): + start_time = time.time() + output_tokens = self.batch_infer_one_round([infer_task]) + end_time = time.time() + steps += 1 + output_str = self.tokenizer.decode(output_tokens[0]) + output_content += output_str + print(output_str, end="", flush=True) + if output_tokens[0] in self.eos_token_id: + break + infer_task.next(output_tokens[0]) + + if step_i > 0: + total_time += end_time - start_time + + print("\n") + if steps > 1: + avg_time = total_time * 1000 / (steps - 1) + print(f"Time per step: {avg_time:.3f}ms") + + infer_task._kv_cache.drop(self) + return output_content, avg_time if steps > 1 else 0 + + def perplexity(self, test_sequences: List[Sequence[int]], batch_size=10): + tasks = [ + InferTask(i, [], self.max_context_len(), 1.0, 1, 1.0, self.eos_token_id) + for i in range(batch_size) + ] + kv_caches = [KVCache(self) for _ in range(batch_size)] + + nll = 0.0 + total_len = 0 + + for i in range(0, len(test_sequences), batch_size): + batch_id = 0 + true_tokens = [] + while batch_id < batch_size and batch_id + i < len(test_sequences): + input_tokens = test_sequences[i + batch_id][:-1] + true_tokens.extend(test_sequences[i + batch_id][1:]) + tasks[batch_id].tokens = input_tokens + tasks[batch_id].bind_kvcache(kv_caches[batch_id]) + batch_id += 1 + + # --- MODIFIED: Use QwenBatchedTask --- + batch_inputs = QwenBatchedTask(tasks[:batch_id]) + logits = torch.zeros( + (batch_inputs.ntok, self.meta.dvoc), dtype=self.meta.torch_dtype_logits + ) + forward_batch( + self.model_instance, + batch_inputs.tokens, + batch_inputs.ntok, + batch_inputs.req_lens, + batch_inputs.nreq, + batch_inputs.req_pos, + batch_inputs.kv_caches, + logits.data_ptr(), + ) + + logits = logits.float() + token_ids = torch.tensor(true_tokens, dtype=torch.int64) + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) + token_logprobs = log_probs[ + torch.arange(batch_inputs.ntok), token_ids + ] + + start = 0 + for l in batch_inputs.req_lens_list: + nll += -token_logprobs[start : start + l].sum().item() + start += l + total_len += token_logprobs.numel() + + for task in tasks: + task.release_kvcache() + + return math.exp(nll / total_len) + + def destroy_model_instance(self): + # --- MODIFIED: Call destroy_qwen_model --- + destroy_qwen_model(self.model_instance) + print("Model destroyed") + + +def test(): + if len(sys.argv) < 3: + print( + "Usage: python .py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore] [n_device]" + ) + sys.exit(1) + model_path = sys.argv[2] + + device_map = { + "--cpu": DeviceType.DEVICE_TYPE_CPU, + "--nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + "--cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + "--ascend": DeviceType.DEVICE_TYPE_ASCEND, + "--metax": DeviceType.DEVICE_TYPE_METAX, + "--moore": DeviceType.DEVICE_TYPE_MOORE, + "--iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + } + device_type = device_map.get(sys.argv[1]) + if device_type is None: + print(f"Invalid device type specified. 
Valid options are: {list(device_map.keys())}") + sys.exit(1) + + ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1 + model = QwenForCausalLM(model_path, device_type, ndev) + print("tokens: ") + model.generate("山东最高的山是什么?", 500) + model.destroy_model_instance() + + +if __name__ == "__main__": + test() \ No newline at end of file diff --git a/scripts/qwen_moe.py b/scripts/qwen_moe.py new file mode 100644 index 00000000..a2852e91 --- /dev/null +++ b/scripts/qwen_moe.py @@ -0,0 +1,432 @@ +from typing import List, Sequence +# 1. Import the new initialization function and necessary classes +from libinfinicore_infer import ( + QwenMoeMetaCStruct, + QwenMoeWeightsCStruct, + KVCacheCStruct, + DataType, + DeviceType, + initialize_moe_apis +) +# 2. Call the function to get the MoE APIs (real or mock) +create_qwen_moe_model, destroy_qwen_moe_model, create_moe_kv_cache, drop_moe_kv_cache, infer_moe_batch, forward_moe_batch = initialize_moe_apis() + +# 3. Import other local python modules +from infer_task import InferTask, KVCache +from tokenizers import decoders as _dec +from ctypes import POINTER, c_float, c_int, c_uint, c_void_p, byref +import os +from pathlib import Path +import safetensors +import sys +import time +import json +import math +import torch +import transformers + +torch.set_default_device("cpu") + +# LlamaWeightsNaming can be reused as the per-layer non-MLP weights are named similarly +class LlamaWeightsNaming: + def input_embd(self): + return "model.embed_tokens.weight" + + def output_norm(self): + return "model.norm.weight" + + def output_embd(self): + return "lm_head.weight" + + def attn_norm(self, i): + return f"model.layers.{i}.input_layernorm.weight" + + def attn_q(self, i): + return f"model.layers.{i}.self_attn.q_proj.weight" + + def attn_k(self, i): + return f"model.layers.{i}.self_attn.k_proj.weight" + + def attn_v(self, i): + return f"model.layers.{i}.self_attn.v_proj.weight" + + def attn_o(self, i): + return f"model.layers.{i}.self_attn.o_proj.weight" + + # MoE models typically don't have biases in attention + def attn_q_b(self, i): return f"model.layers.{i}.self_attn.q_proj.bias" + def attn_k_b(self, i): return f"model.layers.{i}.self_attn.k_proj.bias" + def attn_v_b(self, i): return f"model.layers.{i}.self_attn.v_proj.bias" + + def attn_q_norm(self, i): return f"model.layers.{i}.self_attn.q_norm.weight" + def attn_k_norm(self, i): return f"model.layers.{i}.self_attn.k_norm.weight" + + def ffn_norm(self, i): + return f"model.layers.{i}.post_attention_layernorm.weight" + + # New MoE-specific naming conventions + def moe_gate(self, i): + return f"model.layers.{i}.mlp.gate.weight" + + def moe_expert_gate(self, i, j): + return f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight" + + def moe_expert_up(self, i, j): + return f"model.layers.{i}.mlp.experts.{j}.up_proj.weight" + + def moe_expert_down(self, i, j): + return f"model.layers.{i}.mlp.experts.{j}.down_proj.weight" + + def match(state_dict): + return ( + "model.norm.weight" in state_dict + and "model.layers.0.self_attn.q_proj.weight" in state_dict + ) + +# Specialized Meta loader for MoE models +class QwenMoeMetaFromConfig(QwenMoeMetaCStruct): + def __init__(self, config, dtype=torch.bfloat16, max_tokens=None): + if dtype == torch.float16: + dt_ = DataType.INFINI_DTYPE_F16 + elif dtype == torch.float32: + dt_ = DataType.INFINI_DTYPE_F32 + elif dtype == torch.bfloat16: + dt_ = DataType.INFINI_DTYPE_BF16 + else: + dt_ = DataType.INFINI_DTYPE_BF16 + + super().__init__( + dt_logits=dt_, + nlayer=config["num_hidden_layers"], + 
d=config["hidden_size"], + nh=config["num_attention_heads"], + nkvh=config["num_key_value_heads"], + dh=config["head_dim"], + di=config["intermediate_size"], # This is for dense layers if any, can be ignored if all are sparse + dctx=( + config["max_position_embeddings"] if max_tokens is None else max_tokens + ), + dvoc=config["vocab_size"], + epsilon=config["rms_norm_eps"], + theta=config["rope_theta"], + end_token=config["eos_token_id"], + # New MoE fields + num_experts=config["num_experts"], + num_experts_per_tok=config["num_experts_per_tok"], + moe_intermediate_size=config["moe_intermediate_size"], + norm_topk_prob=1 if config.get("norm_topk_prob", False) else 0, + ) + self.torch_dtype_logits = dtype + +# Specialized and completely rewritten Weights loader for MoE models +class QwenMoeWeightsImpl(QwenMoeWeightsCStruct): + def __init__( + self, + meta, + naming, + state_dict, + torch_dt_mat=torch.bfloat16, + torch_dt_norm=torch.float32, + ndev=1, + transpose_weight=True, + ): + # Most of the initial setup is the same + nlayer = meta.nlayer + nh = meta.nh + nkvh = meta.nkvh + dh = meta.dh + d = meta.d + num_experts = meta.num_experts + + # Data type setup... + if torch_dt_mat == torch.float16: self.dt_mat = DataType.INFINI_DTYPE_F16 + elif torch_dt_mat == torch.float32: self.dt_mat = DataType.INFINI_DTYPE_F32 + elif torch_dt_mat == torch.bfloat16: self.dt_mat = DataType.INFINI_DTYPE_BF16 + else: raise ValueError("Unsupported proj weight data type") + if torch_dt_norm == torch.float16: self.dt_norm = DataType.INFINI_DTYPE_F16 + elif torch_dt_norm == torch.float32: self.dt_norm = DataType.INFINI_DTYPE_F32 + elif torch_dt_norm == torch.bfloat16: self.dt_norm = DataType.INFINI_DTYPE_BF16 + else: raise ValueError("Unsupported norm weight data type") + + self.transpose_linear_weights = 1 if transpose_weight else 0 + self.nlayer = nlayer + + # --- Global and Attention Weights (largely the same logic) --- + # NOTE: MoE model has tie_word_embeddings=False, so we must load both. + self.input_embd_tensor = state_dict[naming.input_embd()].to(meta.torch_dtype_logits) + self.input_embd = self.input_embd_tensor.data_ptr() + self.output_norm_tensor = state_dict[naming.output_norm()].to(torch_dt_norm) + self.output_norm = self.output_norm_tensor.data_ptr() + self.output_embd_tensor = state_dict[naming.output_embd()].to(torch_dt_mat) + if not transpose_weight: + self.output_embd_tensor = self.output_embd_tensor.transpose(0, 1).contiguous() + self.output_embd = self.output_embd_tensor.data_ptr() + + # Attention weights... 
(This part is complex and model-specific, reusing a simplified version) + self.attn_norm_tensors = [state_dict[naming.attn_norm(i)].to(torch_dt_norm) for i in range(nlayer)] + self.attn_norm_ptrs = [t.data_ptr() for t in self.attn_norm_tensors] + self.attn_norm = (c_void_p * nlayer)(*self.attn_norm_ptrs) + + # Simplified QKV loading for clarity + def qkv_slices(_i): + _Q = ( + state_dict[naming.attn_q(_i)] + .reshape([nh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _K = ( + state_dict[naming.attn_k(_i)] + .reshape([nkvh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _V = state_dict[naming.attn_v(_i)].reshape([nkvh, dh // 2, 2, d]) + _result = [] + _nh_per_dev = nh // ndev + _nkvh_per_dev = nkvh // ndev + for _idev in range(ndev): + _result.append(_Q[_idev * _nh_per_dev : (_idev + 1) * _nh_per_dev, :, :, :]) + _result.append(_K[_idev * _nkvh_per_dev : (_idev + 1) * _nkvh_per_dev, :, :, :]) + _result.append(_V[_idev * _nkvh_per_dev : (_idev + 1) * _nkvh_per_dev, :, :]) + return _result + + self.qkv_tensor = [ + torch.cat(qkv_slices(i), dim=0).to(torch_dt_mat) for i in range(nlayer) + ] + if not transpose_weight: + for i in range(nlayer): self.qkv_tensor[i] = self.qkv_tensor[i].transpose(0, 1).contiguous() + self.qkv_tensor_ptrs = [t.data_ptr() for t in self.qkv_tensor] + self.attn_qkv = (c_void_p * nlayer)(*self.qkv_tensor_ptrs) + + self.attn_o_tensor = [state_dict[naming.attn_o(i)].to(torch_dt_mat) for i in range(nlayer)] + if not transpose_weight: + for i in range(nlayer): self.attn_o_tensor[i] = self.attn_o_tensor[i].transpose(0, 1).contiguous() + self.attn_o_ptrs = [t.data_ptr() for t in self.attn_o_tensor] + self.attn_o = (c_void_p * nlayer)(*self.attn_o_ptrs) + + self.ffn_norm_tensors = [state_dict[naming.ffn_norm(i)].to(torch_dt_norm) for i in range(nlayer)] + self.ffn_norm_ptrs = [t.data_ptr() for t in self.ffn_norm_tensors] + self.ffn_norm = (c_void_p * nlayer)(*self.ffn_norm_ptrs) + + # --- MoE Weight Loading Logic (CORE NEW IMPLEMENTATION) --- + self.moe_gate_tensors = [] + self.moe_experts_gate_up_tensors = [] + self.moe_experts_down_tensors = [] + + print("Loading MoE weights...") + for i in range(nlayer): + # Load the gate for the current layer + gate_tensor = state_dict[naming.moe_gate(i)].to(torch_dt_mat) + self.moe_gate_tensors.append(gate_tensor) + + # Loop through all experts for the current layer + for j in range(num_experts): + gate_proj = state_dict[naming.moe_expert_gate(i, j)] + up_proj = state_dict[naming.moe_expert_up(i, j)] + down_proj = state_dict[naming.moe_expert_down(i, j)] + + # Combine gate and up projections, similar to dense FFNs + gate_up_tensor = torch.cat([gate_proj, up_proj], dim=0).to(torch_dt_mat) + + # Append to the flattened lists + self.moe_experts_gate_up_tensors.append(gate_up_tensor) + self.moe_experts_down_tensors.append(down_proj.to(torch_dt_mat)) + + print("Converting MoE weights to CTypes pointers...") + # Convert Python lists of tensors to CTypes pointer arrays + moe_gate_ptrs = [t.data_ptr() for t in self.moe_gate_tensors] + self.moe_gate = (c_void_p * nlayer)(*moe_gate_ptrs) + + total_experts = nlayer * num_experts + moe_experts_gate_up_ptrs = [t.data_ptr() for t in self.moe_experts_gate_up_tensors] + self.moe_experts_gate_up = (c_void_p * total_experts)(*moe_experts_gate_up_ptrs) + + moe_experts_down_ptrs = [t.data_ptr() for t in self.moe_experts_down_tensors] + self.moe_experts_down = (c_void_p * total_experts)(*moe_experts_down_ptrs) + print("-" * 50) + print(">>> Weight Loader Verification <<<") + print(f"Expected layers (nlayer): {nlayer}") 
+ print(f"Expected experts per layer: {num_experts}") + print(f"Total experts expected: {nlayer * num_experts}") + print("-" * 50) + print(f"Loaded gate tensors: {len(self.moe_gate_tensors)}") + print(f"Loaded expert gate_up tensors: {len(self.moe_experts_gate_up_tensors)}") + print(f"Loaded expert down tensors: {len(self.moe_experts_down_tensors)}") + print("-" * 50) + # 断言检查,如果数量不对,程序会直接报错 + assert len(self.moe_gate_tensors) == nlayer + assert len(self.moe_experts_gate_up_tensors) == nlayer * num_experts + assert len(self.moe_experts_down_tensors) == nlayer * num_experts + print(">>> Verification PASSED: Correct number of MoE weights loaded.") + print("-" * 50) + +# BatchedTask can be reused if its structure is generic +class QwenMoeBatchedTask: + def __init__(self, tasks: List[InferTask]): + self.tasks = tasks + self.nreq = len(tasks) + token_lists = [t.tokens for t in tasks] + self.req_lens_list = [len(toks) for toks in token_lists] + self.req_pos_list = [t.pos for t in tasks] + self.kv_cache_ptrs = [t.kvcache().data() for t in tasks] + self.temperaturas_list = [t.temperature for t in tasks] + self.topks_list = [t.topk for t in tasks] + self.topps_list = [t.topp for t in tasks] + flat_tokens = [tok for toks in token_lists for tok in toks] + self.ntok = len(flat_tokens) + self.tokens = (c_uint * self.ntok)(*flat_tokens) + self.req_lens = (c_uint * self.nreq)(*self.req_lens_list) + self.req_pos = (c_uint * self.nreq)(*self.req_pos_list) + self.kv_caches = (POINTER(KVCacheCStruct) * self.nreq)(*self.kv_cache_ptrs) + self.temperaturas = (c_float * self.nreq)(*self.temperaturas_list) + self.topks = (c_uint * self.nreq)(*self.topks_list) + self.topps = (c_float * self.nreq)(*self.topps_list) + + def input_args(self): + return (self.tokens, self.ntok, self.req_lens, self.nreq, self.req_pos, + self.kv_caches, self.temperaturas, self.topks, self.topps) + +# Main class for the MoE model +class QwenMoeForCausalLM: + def __init__( + self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None + ): + def load_all_safetensors_from_dir(dir_path_: str): + tensors_ = {} + dir_path_ = Path(dir_path_) + for file in sorted(dir_path_.glob("*.safetensors")): + with safetensors.safe_open(file, "pt") as f: + for name_ in f.keys(): + tensors_[name_] = f.get_tensor(name_) + return tensors_ + + print("Loading MoE model config and weights to host...") + load_start_time = time.time() + + with open(os.path.join(model_dir_path, "config.json"), "r") as f: + config = json.load(f) + self.config = config + + # Assert that we are loading the correct model type + assert "moe" in config.get("model_type", ""), "This script is for MoE models only." 
+ + state_dict = load_all_safetensors_from_dir(model_dir_path) + + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + + self.meta = QwenMoeMetaFromConfig(config, max_tokens=max_tokens) + self.weights = QwenMoeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=(device != DeviceType.DEVICE_TYPE_ASCEND), + ) + + load_end_time = time.time() + print(f"Weight loading time: {load_end_time - load_start_time:.3f}s") + + print(f"Creating MoE model on {ndev} devices...") + create_start_time = time.time() + dev_ids = (c_int * ndev)(*range(ndev)) + + self.model_instance = create_qwen_moe_model( + byref(self.meta), + byref(self.weights), + device, + ndev, + dev_ids, + ) + create_end_time = time.time() + print(f"Model creation time: {create_end_time - create_start_time:.3f}s") + + def max_context_len(self): + return self.meta.dctx + + def create_kv_cache(self): + return create_moe_kv_cache(self.model_instance) + + def drop_kv_cache(self, kv_cache): + drop_moe_kv_cache(self.model_instance, kv_cache) + + def batch_infer_one_round(self, tasks: List[InferTask]): + output = (c_uint * len(tasks))() + batch_inputs = QwenMoeBatchedTask(tasks) + infer_moe_batch( + self.model_instance, + *(batch_inputs.input_args()), + output, + ) + return list(output) + + def generate(self, input_content, max_steps, topp_=0.95, topk_=20, temperature_=0.6): + # Generation logic remains largely the same, just calling the new functions + input_content_templated = self.tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": input_content}], + add_generation_prompt=True, + tokenize=False, + ) + print(input_content_templated, end="", flush=True) + tokens = self.tokenizer.encode(input_content_templated) + + eos_token_id = self.config["eos_token_id"] + eos_token_id_list = [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id + + infer_task = InferTask( + 0, tokens, self.max_context_len(), + temperature_, topk_, topp_, eos_token_id_list + ) + infer_task.bind_kvcache(KVCache(self)) + + output_content = "" + for _ in range(max_steps): + output_tokens = self.batch_infer_one_round([infer_task]) + if output_tokens[0] in eos_token_id_list: + break + + output_str = self.tokenizer.decode(output_tokens[0]) + output_content += output_str + print(output_str, end="", flush=True) + + infer_task.next(output_tokens[0]) + + print("\n") + infer_task._kv_cache.drop(self) + return output_content + + def destroy_model_instance(self): + destroy_qwen_moe_model(self.model_instance) + print("MoE Model destroyed") + +def test(): + if len(sys.argv) < 3: + print( + "Usage: python qwen_moe.py [--cpu|--nvidia|...] [n_device]" + ) + sys.exit(1) + + model_path = sys.argv[2] + device_map = { + "--cpu": DeviceType.DEVICE_TYPE_CPU, + "--nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + "--cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + "--ascend": DeviceType.DEVICE_TYPE_ASCEND, + "--metax": DeviceType.DEVICE_TYPE_METAX, + "--moore": DeviceType.DEVICE_TYPE_MOORE, + "--iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + } + device_type = device_map.get(sys.argv[1]) + if device_type is None: + print(f"Invalid device type. 
Valid options: {list(device_map.keys())}") + sys.exit(1) + + ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1 + model = QwenMoeForCausalLM(model_path, device_type, ndev) + + model.generate("你好,请介绍一下自己。", 100) + + model.destroy_model_instance() + +if __name__ == "__main__": + test() diff --git a/scripts/jiuge_ppl.py b/scripts/qwen_ppl.py similarity index 100% rename from scripts/jiuge_ppl.py rename to scripts/qwen_ppl.py diff --git a/src/models/cache_manager.hpp b/src/models/cache_manager.hpp index 4d1b5aa7..c6819fa0 100644 --- a/src/models/cache_manager.hpp +++ b/src/models/cache_manager.hpp @@ -149,6 +149,10 @@ class CacheManager { LRUDescriptorCache causal_softmax_cache; LRUDescriptorCache swiglu_cache; LRUDescriptorCache random_sample_cache; + LRUDescriptorCache gather_cache; + LRUDescriptorCache scatter_cache; + LRUDescriptorCache topk_cache; + LRUDescriptorCache normalize_cache; public: CacheManager(size_t capacity = 100) @@ -159,7 +163,11 @@ class CacheManager { rearrange_cache(capacity, infiniopDestroyRearrangeDescriptor), causal_softmax_cache(capacity, infiniopDestroyCausalSoftmaxDescriptor), swiglu_cache(capacity, infiniopDestroySwiGLUDescriptor), - random_sample_cache(capacity, infiniopDestroyRandomSampleDescriptor) {} + random_sample_cache(capacity, infiniopDestroyRandomSampleDescriptor), + gather_cache(capacity, infiniopDestroyGatherDescriptor), + scatter_cache(capacity, infiniopDestroyScatterDescriptor), + topk_cache(capacity, infiniopDestroyTopKDescriptor), + normalize_cache(capacity, infiniopDestroyNormalizeDescriptor) {} // Add operations bool getAddDescriptor(size_t key, infiniopAddDescriptor_t &desc) { @@ -233,6 +241,41 @@ class CacheManager { random_sample_cache.put(key, desc); } + // Gather operations + bool getGatherDescriptor(size_t key, infiniopGatherDescriptor_t &desc) { + return gather_cache.get(key, desc); + } + + void putGatherDescriptor(size_t key, const infiniopGatherDescriptor_t &desc) { + gather_cache.put(key, desc); + } + + bool getScatterDescriptor(size_t key, infiniopScatterDescriptor_t &desc) { + return scatter_cache.get(key, desc); + } + + void putScatterDescriptor(size_t key, const infiniopScatterDescriptor_t &desc) { + scatter_cache.put(key, desc); + } + + // TopK operations + bool getTopKDescriptor(size_t key, infiniopTopKDescriptor_t &desc) { + return topk_cache.get(key, desc); + } + + void putTopKDescriptor(size_t key, const infiniopTopKDescriptor_t &desc) { + topk_cache.put(key, desc); + } + + // Normalize operations + bool getNormalizeDescriptor(size_t key, infiniopNormalizeDescriptor_t &desc) { + return normalize_cache.get(key, desc); + } + + void putNormalizeDescriptor(size_t key, const infiniopNormalizeDescriptor_t &desc) { + normalize_cache.put(key, desc); + } + template static size_t createDescriptorKey(Tensors... tensors) { size_t seed = 0; @@ -241,4 +284,4 @@ class CacheManager { } }; -#endif // CACHE_MANAGER_HPP +#endif // CACHE_MANAGER_HPP \ No newline at end of file diff --git a/src/models/common_structs.hpp b/src/models/common_structs.hpp new file mode 100644 index 00000000..eac0c38b --- /dev/null +++ b/src/models/common_structs.hpp @@ -0,0 +1,40 @@ +#ifndef COMMON_STRUCTS_H +#define COMMON_STRUCTS_H + +#include "../tensor.hpp" // KVCache depends on Tensor +#include +#include +#include +#include +#include +#include // For uint32_t + +// These structs are generic and can be shared between dense and MoE models. 
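+//
+// InferState implements the per-device handshake used by the launch/infer loop
+// (see the model .cpp files in this diff): the worker thread sets `loaded` and
+// notifies cv_load once its DeviceResource is ready; the host sets `proceed`
+// and notifies cv_start to run one batch; the worker clears `proceed` and
+// notifies cv_done when the batch is finished; `exit_flag` asks the worker to quit.
+//
+// KVCache is indexed as k[idev][layer] / v[idev][layer], one tensor per device
+// and per layer (shape {dctx, nkvh / ndev, dh} in the existing dense path).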
+ +struct InferState { + std::mutex mtx; + std::condition_variable cv_load, cv_start, cv_done; + bool loaded = false; + bool proceed = false; + bool exit_flag = false; +}; + +struct InferRequest { + const uint32_t *tokens; + uint32_t ntok; + const uint32_t *req_lens; + uint32_t nreq; + const uint32_t *req_pos; + struct KVCache **kv_caches; + const float *temperature; + const uint32_t *topk; + const float *topp; + uint32_t *output; + void *logits; +}; + +struct KVCache { + std::vector>> k, v; +}; + +#endif // COMMON_STRUCTS_H diff --git a/src/models/inference_context.cpp b/src/models/inference_context.cpp index fd0dea64..d8861e79 100644 --- a/src/models/inference_context.cpp +++ b/src/models/inference_context.cpp @@ -1,6 +1,8 @@ #include "inference_context.hpp" #include "../tensor.hpp" #include "../utils.hpp" +#include +#include InferenceContext::InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream) : rsrc(rsrc), cache_manager(cache_manager), stream(stream) {} @@ -231,3 +233,194 @@ void InferenceContext::linear(std::shared_ptr c, add(c, c, bias->view_as(c->shape(), strides)); } } + +void InferenceContext::gather(std::shared_ptr output, + std::shared_ptr input, + const std::vector &indices, + int dim) { + // 1. 准备索引张量:将 CPU 上的 vector 索引上传到 GPU + auto index_tensor = Tensor::buffer(INFINI_DTYPE_I32, output->shape(), rsrc->memory_pool); + RUN_INFINI(infinirtMemcpyAsync(index_tensor->data(), indices.data(), indices.size() * sizeof(uint32_t), + INFINIRT_MEMCPY_H2D, stream)); + + // 2. 创建描述符 (并利用缓存) + size_t key = CacheManager::createDescriptorKey(output, input, index_tensor); + infiniopGatherDescriptor_t desc; + if (!cache_manager->getGatherDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateGatherDescriptor( + rsrc->handle, &desc, output->desc(), input->desc(), dim, index_tensor->desc())); + cache_manager->putGatherDescriptor(key, desc); + } + + // 3. 准备工作空间 + size_t workspace_size = 0; + RUN_INFINI(infiniopGetGatherWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + // 4. 执行 Gather 操作 + RUN_INFINI(infiniopGather( + desc, workspace, workspace_size, + output->data(), input->data(), index_tensor->data(), stream)); +} + +void InferenceContext::scatter_add(std::shared_ptr target, + std::shared_ptr source, + const std::vector &indices, + int dim) { + + // 使用 Gather-Add-Scatter 模式实现 + + // 1. 准备索引张量 (与 gather 共享) + auto index_tensor = Tensor::buffer(INFINI_DTYPE_I32, source->shape(), rsrc->memory_pool); + RUN_INFINI(infinirtMemcpyAsync(index_tensor->data(), indices.data(), indices.size() * sizeof(uint32_t), + INFINIRT_MEMCPY_H2D, stream)); + + // 2. Gather: 从 target 中取出需要更新的原始值 + auto original_values = Tensor::buffer(source->dtype(), source->shape(), rsrc->memory_pool); + gather(original_values, target, indices, dim); + + // 3. Add: 将 source (新值) 和 original_values (原始值) 相加 + auto updated_values = Tensor::buffer(source->dtype(), source->shape(), rsrc->memory_pool); + add(updated_values, original_values, source); + + // 4. 
Scatter: 将相加后的结果写回 target 的原始位置 + // 创建描述符 + size_t key = CacheManager::createDescriptorKey(target, updated_values, index_tensor); + infiniopScatterDescriptor_t desc; + if (!cache_manager->getScatterDescriptor(key, desc)) { +RUN_INFINI(infiniopCreateScatterDescriptor( + rsrc->handle, &desc, target->desc(), target->desc(), updated_values->desc(), index_tensor->desc(), dim)); + cache_manager->putScatterDescriptor(key, desc); + } + + // 准备工作空间 + size_t workspace_size = 0; + RUN_INFINI(infiniopGetScatterWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + // 执行 Scatter 操作 + RUN_INFINI(infiniopScatter( + desc, workspace, workspace_size, + target->data(), updated_values->data(), index_tensor->data(), source->data(), stream)); +} + + +void InferenceContext::scale(std::shared_ptr y, + std::shared_ptr x, + float alpha) { + // 使用gemm实现标量缩放: y = alpha * x + if (y.get() != x.get()) { + size_t x_nelem = std::accumulate(x->shape().begin(), x->shape().end(), 1ULL, std::multiplies()); + RUN_INFINI(infinirtMemcpyAsync(y->data(), x->data(), + x_nelem * dsize(x->dtype()), + INFINIRT_MEMCPY_D2D, stream)); + } + + // 使用gemm实现缩放: y = alpha * y + 0 * y + auto ones = Tensor::buffer(x->dtype(), {1, 1}, rsrc->memory_pool); + float one_value = 1.0f; + RUN_INFINI(infinirtMemcpyAsync(ones->data(), &one_value, sizeof(float), INFINIRT_MEMCPY_H2D, stream)); + + size_t total_elements = std::accumulate(x->shape().begin(), x->shape().end(), 1ULL, std::multiplies()); + auto y_flat = y->view({total_elements, 1}); + gemm(y_flat, y_flat, ones, alpha, 0.0f); +} + +void InferenceContext::scale(std::shared_ptr y, + std::shared_ptr x, + const std::vector &weights) { + // 先复制数据 + if (y.get() != x.get()) { + size_t x_nelem = std::accumulate(x->shape().begin(), x->shape().end(), 1ULL, std::multiplies()); + RUN_INFINI(infinirtMemcpyAsync(y->data(), x->data(), + x_nelem * dsize(x->dtype()), + INFINIRT_MEMCPY_D2D, stream)); + } + + // 为每个token应用对应的权重 + size_t num_tokens = weights.size(); + size_t d = y->shape()[1]; // hidden dimension + + for (size_t i = 0; i < num_tokens; ++i) { + auto token_output = y->slice(0, i, 1); // 取出第i个token的输出 + auto ones = Tensor::buffer(y->dtype(), {1, 1}, rsrc->memory_pool); + float one_value = 1.0f; + RUN_INFINI(infinirtMemcpyAsync(ones->data(), &one_value, sizeof(float), INFINIRT_MEMCPY_H2D, stream)); + + auto token_flat = token_output->view({d, 1}); + gemm(token_flat, token_flat, ones, weights[i], 0.0f); + } +} + +void InferenceContext::zeros(std::shared_ptr t) { + // 暂时使用简单的临时实现,将tensor的所有值设为0 + // 创建一个同样大小的零值tensor,然后复制过去 + size_t nelem = std::accumulate(t->shape().begin(), t->shape().end(), 1ULL, std::multiplies()); + std::vector zero_data(nelem, 0.0f); + + if (t->dtype() == INFINI_DTYPE_F32) { + RUN_INFINI(infinirtMemcpyAsync(t->data(), zero_data.data(), + nelem * sizeof(float), + INFINIRT_MEMCPY_H2D, stream)); + } else { + // 对于其他数据类型,暂时跳过实现 + // 在实际使用中可能需要根据dtype进行转换 + } +} + +void InferenceContext::normalize(std::shared_ptr y, + std::shared_ptr x, + int dim, + float epsilon) { + // normalize算子是就地操作,先复制x到y + if (y.get() != x.get()) { + size_t x_nelem = std::accumulate(x->shape().begin(), x->shape().end(), 1ULL, std::multiplies()); + RUN_INFINI(infinirtMemcpyAsync(y->data(), x->data(), + x_nelem * dsize(x->dtype()), + INFINIRT_MEMCPY_D2D, stream)); + } + + size_t key = CacheManager::createDescriptorKey(y); + + infiniopNormalizeDescriptor_t desc; + if (!cache_manager->getNormalizeDescriptor(key, desc)) { + 
RUN_INFINI(infiniopCreateNormalizeDescriptor( + rsrc->handle, &desc, y->desc())); + cache_manager->putNormalizeDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetNormalizeWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopNormalize( + desc, workspace, workspace_size, + y->data(), stream)); +} + +void InferenceContext::topk_fun(std::shared_ptr values, + std::shared_ptr indices, + std::shared_ptr input, + uint32_t k, + int dim) { + size_t key = CacheManager::createDescriptorKey(values, indices, input); + + infiniopTopKDescriptor_t desc; + if (!cache_manager->getTopKDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateTopKDescriptor( + rsrc->handle, &desc, input->desc(), values->desc(), indices->desc(), k, dim, true, true)); + cache_manager->putTopKDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetTopKWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopTopK( + desc, workspace, workspace_size, + input->data(), values->data(), indices->data(), stream)); +} \ No newline at end of file diff --git a/src/models/inference_context.hpp b/src/models/inference_context.hpp index dd5f4b78..016ff99c 100644 --- a/src/models/inference_context.hpp +++ b/src/models/inference_context.hpp @@ -1,8 +1,10 @@ #pragma once #include "cache_manager.hpp" -#include "jiuge/jiuge_impl.hpp" -#include "jiuge/jiuge_weight.hpp" +#include "qwen/qwen_impl.hpp" +#include "qwen/qwen_weight.hpp" +// #include "qwen_moe/qwen_moe_impl.hpp" +// #include "qwen_moe/qwen_moe_weight.hpp" #include struct InferenceContext { @@ -49,6 +51,37 @@ struct InferenceContext { float alpha, float beta, std::shared_ptr residual, std::shared_ptr bias); + + void gather(std::shared_ptr output, + std::shared_ptr input, + const std::vector &indices, + int dim = 0); + + void scatter_add(std::shared_ptr target, + std::shared_ptr source, + const std::vector &indices, + int dim = 0); + + void scale(std::shared_ptr y, + std::shared_ptr x, + float alpha); + + void scale(std::shared_ptr y, + std::shared_ptr x, + const std::vector &weights); + + void zeros(std::shared_ptr t); + + void normalize(std::shared_ptr y, + std::shared_ptr x, + int dim, + float epsilon); + + void topk_fun(std::shared_ptr values, + std::shared_ptr indices, + std::shared_ptr input, + uint32_t k, + int dim = -1); }; namespace { @@ -107,3 +140,40 @@ inline void linear(std::shared_ptr c, std::shared_ptr a, std::shared_ptr residual, std::shared_ptr bias) { getInferenceContext().linear(c, a, b, alpha, beta, residual, bias); } + +inline void gather(std::shared_ptr output, + std::shared_ptr input, + const std::vector &indices, + int dim = 0) { + getInferenceContext().gather(output, input, indices, dim); +} + +inline void scatter_add(std::shared_ptr target, + std::shared_ptr source, + const std::vector &indices, + int dim = 0) { + getInferenceContext().scatter_add(target, source, indices, dim); +} + +inline void scale(std::shared_ptr y, std::shared_ptr x, float alpha) { + getInferenceContext().scale(y, x, alpha); +} + +inline void scale(std::shared_ptr y, std::shared_ptr x, const std::vector &weights) { + getInferenceContext().scale(y, x, weights); +} + +inline void zeros(std::shared_ptr t) { + getInferenceContext().zeros(t); +} + +inline void normalize(std::shared_ptr y, std::shared_ptr x, int dim, float epsilon) { + getInferenceContext().normalize(y, 
x, dim, epsilon); +} + +inline void topk_fun(std::shared_ptr values, std::shared_ptr indices, + std::shared_ptr input, uint32_t k, int dim = -1) { + getInferenceContext().topk_fun(values, indices, input, k, dim); +} + + diff --git a/src/models/jiuge/jiuge.cpp b/src/models/jiuge/jiuge.cpp index bafe784e..3365daaf 100644 --- a/src/models/jiuge/jiuge.cpp +++ b/src/models/jiuge/jiuge.cpp @@ -1,455 +1,455 @@ -#include "jiuge_impl.hpp" -#include "jiuge_weight.hpp" - -#include "../../tensor.hpp" -#include "../../utils.hpp" -#include "../inference_context.hpp" -#include "infinicore_infer.h" - -#include -#include -#include - -void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta, - const JiugeWeights *weights, - infiniDevice_t device, int idev, - int ndev, int dev_id, - infinicclComm_t comm) { - RUN_INFINI(infinirtSetDevice(device, dev_id)); - infiniopHandle_t handle; - infiniopCreateHandle(&handle); - infinirtStream_t stream; - infinirtStreamCreate(&stream); - - std::vector> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out, - w_ffn_norm, w_ffn_gate_up, w_ffn_down; - for (size_t layer = 0; layer < meta->nlayer; layer++) { - w_attn_norm.push_back( - getAttnNorm(meta, weights, layer)); - w_attn_qkv.push_back( - getAttnQKV(meta, weights, layer, idev, ndev)); - if (weights->attn_qkv_b != nullptr) { - b_attn_qkv.push_back( - getAttnQKVBias(meta, weights, layer, idev, ndev)); - } - w_attn_out.push_back( - getAttnO(meta, weights, layer, idev, ndev)); - w_ffn_norm.push_back( - getFFNNorm(meta, weights, layer)); - w_ffn_gate_up.push_back( - getFFNGateUp(meta, weights, layer, idev, ndev)); - w_ffn_down.push_back( - getFFNDown(meta, weights, layer, idev, ndev)); - } - - auto memory_pool = std::make_shared(128 * 1024 * 1024); - - *rsrc = DeviceResource{ - device, - dev_id, - handle, - getInEmbd(meta, weights), - getOutNorm(meta, weights), - getOutEmbd(meta, weights), - getSinTable(meta), - getCosTable(meta), - w_attn_norm, - w_attn_qkv, - b_attn_qkv, - w_attn_out, - w_ffn_norm, - w_ffn_gate_up, - w_ffn_down, - stream, - comm, - memory_pool, - }; - RUN_INFINI(infinirtDeviceSynchronize()); -} - -void releaseDeviceResource(DeviceResource &res) { - infinirtDeviceSynchronize(); - // Release individual Tensors - res.w_in_embd.reset(); - res.w_out_norm.reset(); - res.w_out_embd.reset(); - res.sin_table.reset(); - res.cos_table.reset(); - for (auto &t : res.w_attn_norm) { - t.reset(); - } - res.w_attn_norm.clear(); - for (auto &t : res.w_attn_qkv) { - t.reset(); - } - res.w_attn_qkv.clear(); - for (auto &t : res.b_attn_qkv) { - t.reset(); - } - res.b_attn_qkv.clear(); - for (auto &t : res.w_attn_out) { - t.reset(); - } - res.w_attn_out.clear(); - for (auto &t : res.w_ffn_norm) { - t.reset(); - } - res.w_ffn_norm.clear(); - for (auto &t : res.w_ffn_gate_up) { - t.reset(); - } - res.w_ffn_gate_up.clear(); - for (auto &t : res.w_ffn_down) { - t.reset(); - } - res.w_ffn_down.clear(); - infiniopDestroyHandle(res.handle); - res.handle = nullptr; - infinirtStreamDestroy(res.stream); - res.stream = nullptr; - infinicclCommDestroy(res.comm); - res.comm = nullptr; -} - -void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc, - uint32_t idev, uint32_t ndev, - const uint32_t *tokens, uint32_t ntok, - const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, - struct KVCache **kv_caches, - const float *temperature, const uint32_t *topk, const float *topp, - uint32_t *output, void *last_logits) { - auto nlayer = meta.nlayer; - auto nkvh = meta.nkvh / ndev; - auto nh = meta.nh / ndev; - auto ngroup 
= nh / nkvh; - // auto dctx = meta.dctx; - auto dh = meta.dh; - auto d = meta.d; - auto dt_logits = meta.dt_logits; - auto di = meta.di / ndev; - auto dvoc = meta.dvoc; - auto stream = rsrc.stream; - bool has_qkv_bias = rsrc.b_attn_qkv.size() > 0; - - // Allocate buffers - auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); - auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); - auto qkv_buf = Tensor::buffer(dt_logits, {ntok, (nh + nkvh * 2) * dh}, rsrc.memory_pool); - auto gate_up_buf = Tensor::buffer(dt_logits, {ntok, 2 * di}, rsrc.memory_pool); - auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool); - auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool); - auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool); - auto result_cpu = std::vector(nreq); - - auto qkv_rope = qkv_buf->view({ntok, nh + nkvh * 2, dh}); - - // Prepare inputs - auto batch_pos_ids = std::vector(ntok); - size_t req_start = 0; - for (uint32_t req = 0; req < nreq; req++) { - for (uint32_t i = 0; i < req_lens[req]; i++) { - batch_pos_ids[req_start + i] = req_pos[req] + i; - } - req_start += req_lens[req]; - } - - std::shared_ptr pos_ids_buf; - if (rsrc.device == INFINI_DEVICE_CPU) { - pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok}); - } else { - pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool); - RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok, - INFINIRT_MEMCPY_H2D, stream)); - } - for (uint32_t i = 0; i < ntok; i++) { - RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d), - rsrc.w_in_embd->data(tokens[i] * d), - dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream)); - } - - // Attention - // attention inner - size_t max_qk_size = 0; - size_t max_seq_len = 0; - - for (uint32_t req = 0; req < nreq; req++) { - auto past_len = req_pos[req]; - auto seq_len = req_lens[req]; - auto total_len = past_len + seq_len; - - max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len)); - max_seq_len = std::max(max_seq_len, size_t(seq_len)); - } - - auto qk_buf = Tensor::buffer(dt_logits, {nh, max_qk_size}, rsrc.memory_pool); - auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); - auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh}); - auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); - auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh}); - - // MLP buffers - auto gate_buf = gate_up_buf->slice(1, 0, di); - auto up_buf = gate_up_buf->slice(1, di, di); - - // Compute - for (uint32_t layer = 0; layer < nlayer; layer++) { - // 1. Attention - // rms norm - rmsnorm(logits_out, logits_in, rsrc.w_attn_norm[layer], meta.epsilon); - // qkv_proj - linear(qkv_buf, logits_out, rsrc.w_attn_qkv[layer], 1.0, 0.0, nullptr, has_qkv_bias ? 
rsrc.b_attn_qkv[layer] : nullptr); - // rope - rope(qkv_rope->slice(1, 0, nh), qkv_rope->slice(1, 0, nh), pos_ids_buf, rsrc.sin_table, rsrc.cos_table); - rope(qkv_rope->slice(1, nh, nkvh), qkv_rope->slice(1, nh, nkvh), pos_ids_buf, rsrc.sin_table, rsrc.cos_table); - - size_t token_offset = 0; - for (uint32_t req = 0; req < nreq; req++) { - auto past_len = req_pos[req]; - auto seq_len = req_lens[req]; - auto total_len = past_len + seq_len; - auto o = o_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); - auto q = qkv_rope->slice({{0, token_offset, seq_len}, {1, 0, nh}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); - auto k = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh, nkvh}}); - auto v = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}}); - - // self attention - // concat - rearrange(kv_caches[req]->k[idev][layer]->slice(0, past_len, seq_len), k); - rearrange(kv_caches[req]->v[idev][layer]->slice(0, past_len, seq_len), v); - // qk - rearrange(q_rearrange->slice(2, 0, seq_len), q); - auto qk_gemm = qk_buf->slice(1, 0, seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len}); - auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0}); - linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr); - // softmax - auto qk_softmax = qk_buf->slice(1, 0, seq_len * total_len)->view({nh, seq_len, total_len}); - causalSoftmax(qk_softmax, qk_softmax); - auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2}); - linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm, 1.f, 0.f, nullptr, nullptr); - // rearrange attn val - rearrange(o, attn_val_gemm->slice(2, 0, seq_len)); - - token_offset += seq_len; - } - - // o_proj - linear(logits_in, o_buf, rsrc.w_attn_out[layer], 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); // only rank 0 adds residual - - // All_reduce if distributed - if (rsrc.comm != nullptr) { - RUN_INFINI(infinicclAllReduce( - logits_in->data(), logits_in->data(), ntok * d, dt_logits, - INFINICCL_SUM, rsrc.comm, stream)); - RUN_INFINI(infinirtStreamSynchronize(stream)); - } - // 2. FFN - rmsnorm(logits_out, logits_in, rsrc.w_ffn_norm[layer], meta.epsilon); - linear(gate_up_buf, logits_out, rsrc.w_ffn_gate_up[layer], 1.0, 0.0, nullptr, nullptr); - swiglu(gate_buf, up_buf, gate_buf); - linear(logits_in, gate_buf, rsrc.w_ffn_down[layer], 1.0, 0.0, idev == 0 ? 
logits_in : nullptr, nullptr); // only rank 0 adds residual - - // All_reduce if distributed - if (rsrc.comm != nullptr) { - RUN_INFINI(infinicclAllReduce( - logits_in->data(), logits_in->data(), ntok * d, dt_logits, - INFINICCL_SUM, rsrc.comm, stream)); - RUN_INFINI(infinirtStreamSynchronize(stream)); - } - } - // Sample and Output - if (idev == 0) { - if (last_logits != nullptr) { - rmsnorm(logits_out, logits_in, rsrc.w_out_norm, meta.epsilon); - auto last_logits_buf = Tensor::buffer(dt_logits, {ntok, dvoc}, rsrc.memory_pool); - linear(last_logits_buf, logits_out, rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); - RUN_INFINI(infinirtStreamSynchronize(stream)); - RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dt_logits) * ntok * dvoc, INFINIRT_MEMCPY_D2H)); - } - if (output != nullptr) { - size_t token_offset = 0; - for (uint32_t req = 0; req < nreq; req++) { - auto seq_len = req_lens[req]; - token_offset += seq_len; - rmsnorm(logits_out->slice(0, req, 1), - logits_in->slice(0, token_offset - 1, 1), - rsrc.w_out_norm, - meta.epsilon); - } - linear(prob_buf, logits_out->slice(0, 0, nreq), rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); - std::random_device _rd; - std::mt19937 gen(_rd()); - token_offset = 0; - for (uint32_t req = 0; req < nreq; req++) { - auto seq_len = req_lens[req]; - float random_val = std::uniform_real_distribution(0, 1)(gen); - randomSample(result_buf->slice(0, req, 1)->view_as({}, {}), - prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}), - random_val, topp[req], topk[req], temperature[req]); - token_offset += seq_len; - } - RUN_INFINI(infinirtStreamSynchronize(stream)); - RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(), - sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H)); - for (uint32_t req = 0; req < nreq; req++) { - output[req] = uint32_t(result_cpu[req]); - } - } - } -} - -__C void -inferBatch(struct JiugeModel *model, - const uint32_t *tokens, uint32_t ntok, - const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, - struct KVCache **kv_caches, - const float *temperature, const uint32_t *topk, const float *topp, - uint32_t *output) { - model->req.tokens = tokens; - model->req.ntok = ntok; - model->req.req_lens = req_lens; - model->req.nreq = nreq; - model->req.req_pos = req_pos; - model->req.kv_caches = kv_caches; - model->req.output = output; - model->req.logits = nullptr; - model->req.temperature = temperature; - model->req.topk = topk; - model->req.topp = topp; - - for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { - std::unique_lock lock(model->states[idev].mtx); - model->states[idev].proceed = true; - lock.unlock(); - model->states[idev].cv_start.notify_one(); - } - for (size_t i = model->dev_ids.size(); i > 0; i--) { - auto idev = i - 1; - std::unique_lock lock(model->states[idev].mtx); - model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); - lock.unlock(); - } -} - -__C void -forwardBatch(struct JiugeModel *model, - const uint32_t *tokens, uint32_t ntok, - const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, - struct KVCache **kv_caches, - void *logits) { - model->req.tokens = tokens; - model->req.ntok = ntok; - model->req.req_lens = req_lens; - model->req.nreq = nreq; - model->req.req_pos = req_pos; - model->req.kv_caches = kv_caches; - model->req.output = nullptr; - model->req.logits = logits; - model->req.temperature = nullptr; - model->req.topk = nullptr; - model->req.topp = nullptr; - - for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { - 
std::unique_lock lock(model->states[idev].mtx); - model->states[idev].proceed = true; - lock.unlock(); - model->states[idev].cv_start.notify_one(); - } - for (size_t i = model->dev_ids.size(); i > 0; i--) { - auto idev = i - 1; - std::unique_lock lock(model->states[idev].mtx); - model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); - lock.unlock(); - } -} - -void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, DeviceResource *rsrc, InferState &state, InferRequest &req, - infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) { - CacheManager cache_manager(100); - InferenceContext ctx(rsrc, &cache_manager, rsrc->stream); - - // Set the inference context for this thread - setInferenceContext(&ctx); - - // Create Device Resource - createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm); - { - std::unique_lock lock(state.mtx); - state.loaded = true; - lock.unlock(); - state.cv_load.notify_one(); - } - - // Infer Loop - while (true) { - std::unique_lock lock(state.mtx); - state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; }); - // quit if exit_flag is set - if (state.exit_flag) { - break; - } - - inferDeviceBatch(meta, *rsrc, idev, ndev, req.tokens, req.ntok, - req.req_lens, req.nreq, req.req_pos, req.kv_caches, - req.temperature, req.topk, req.topp, req.output, req.logits); - - state.proceed = false; - lock.unlock(); - state.cv_done.notify_one(); - } - - // Clean-Up - releaseDeviceResource(*rsrc); - setInferenceContext(nullptr); // Clear the context when done -} - -JiugeModel::JiugeModel(const JiugeMeta *_meta, const JiugeWeights *weights, infiniDevice_t device_, std::vector device_ids) : meta(*_meta) { - int ndev = int(device_ids.size()); - device = device_; - dev_ids = device_ids; - dev_resources = std::vector(ndev); - states = std::vector(ndev); - threads.resize(ndev); - RUN_INFINI(infinirtInit()); - auto comms = std::vector(ndev, nullptr); - if (ndev > 1) { - RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data())); - } - - for (int i = 0; i < ndev; i++) { - threads[i] = std::thread(launchDevice, std::cref(meta), weights, &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]); - } - for (int i = 0; i < ndev; i++) { - std::unique_lock lock(states[i].mtx); - states[i].cv_load.wait(lock, [&] { return states[i].loaded; }); - lock.unlock(); - } -} - -__C struct JiugeModel * -createJiugeModel(const JiugeMeta *meta, - const JiugeWeights *weights, - infiniDevice_t device, - int ndev, - const int *dev_ids) { - std::vector device_ids(ndev); - std::copy(dev_ids, dev_ids + ndev, device_ids.begin()); - JiugeModel *model = new JiugeModel(meta, weights, device, device_ids); - return model; -} - -__C void destroyJiugeModel(struct JiugeModel *model) { - auto ndev = model->dev_resources.size(); - - for (size_t idev = 0; idev < ndev; idev++) { - std::unique_lock lock(model->states[idev].mtx); - model->states[idev].exit_flag = true; - lock.unlock(); - model->states[idev].cv_start.notify_one(); - } - - for (size_t idev = 0; idev < ndev; idev++) { - model->threads[idev].join(); - } - - delete model; -} +// #include "jiuge_impl.hpp" +// #include "jiuge_weight.hpp" + +// #include "../../tensor.hpp" +// #include "../../utils.hpp" +// #include "../inference_context.hpp" +// #include "infinicore_infer.h" + +// #include +// #include +// #include + +// void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta, +// const JiugeWeights 
*weights, +// infiniDevice_t device, int idev, +// int ndev, int dev_id, +// infinicclComm_t comm) { +// RUN_INFINI(infinirtSetDevice(device, dev_id)); +// infiniopHandle_t handle; +// infiniopCreateHandle(&handle); +// infinirtStream_t stream; +// infinirtStreamCreate(&stream); + +// std::vector> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out, +// w_ffn_norm, w_ffn_gate_up, w_ffn_down; +// for (size_t layer = 0; layer < meta->nlayer; layer++) { +// w_attn_norm.push_back( +// getAttnNorm(meta, weights, layer)); +// w_attn_qkv.push_back( +// getAttnQKV(meta, weights, layer, idev, ndev)); +// if (weights->attn_qkv_b != nullptr) { +// b_attn_qkv.push_back( +// getAttnQKVBias(meta, weights, layer, idev, ndev)); +// } +// w_attn_out.push_back( +// getAttnO(meta, weights, layer, idev, ndev)); +// w_ffn_norm.push_back( +// getFFNNorm(meta, weights, layer)); +// w_ffn_gate_up.push_back( +// getFFNGateUp(meta, weights, layer, idev, ndev)); +// w_ffn_down.push_back( +// getFFNDown(meta, weights, layer, idev, ndev)); +// } + +// auto memory_pool = std::make_shared(128 * 1024 * 1024); + +// *rsrc = DeviceResource{ +// device, +// dev_id, +// handle, +// getInEmbd(meta, weights), +// getOutNorm(meta, weights), +// getOutEmbd(meta, weights), +// getSinTable(meta), +// getCosTable(meta), +// w_attn_norm, +// w_attn_qkv, +// b_attn_qkv, +// w_attn_out, +// w_ffn_norm, +// w_ffn_gate_up, +// w_ffn_down, +// stream, +// comm, +// memory_pool, +// }; +// RUN_INFINI(infinirtDeviceSynchronize()); +// } + +// void releaseDeviceResource(DeviceResource &res) { +// infinirtDeviceSynchronize(); +// // Release individual Tensors +// res.w_in_embd.reset(); +// res.w_out_norm.reset(); +// res.w_out_embd.reset(); +// res.sin_table.reset(); +// res.cos_table.reset(); +// for (auto &t : res.w_attn_norm) { +// t.reset(); +// } +// res.w_attn_norm.clear(); +// for (auto &t : res.w_attn_qkv) { +// t.reset(); +// } +// res.w_attn_qkv.clear(); +// for (auto &t : res.b_attn_qkv) { +// t.reset(); +// } +// res.b_attn_qkv.clear(); +// for (auto &t : res.w_attn_out) { +// t.reset(); +// } +// res.w_attn_out.clear(); +// for (auto &t : res.w_ffn_norm) { +// t.reset(); +// } +// res.w_ffn_norm.clear(); +// for (auto &t : res.w_ffn_gate_up) { +// t.reset(); +// } +// res.w_ffn_gate_up.clear(); +// for (auto &t : res.w_ffn_down) { +// t.reset(); +// } +// res.w_ffn_down.clear(); +// infiniopDestroyHandle(res.handle); +// res.handle = nullptr; +// infinirtStreamDestroy(res.stream); +// res.stream = nullptr; +// infinicclCommDestroy(res.comm); +// res.comm = nullptr; +// } + +// void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc, +// uint32_t idev, uint32_t ndev, +// const uint32_t *tokens, uint32_t ntok, +// const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, +// struct KVCache **kv_caches, +// const float *temperature, const uint32_t *topk, const float *topp, +// uint32_t *output, void *last_logits) { +// auto nlayer = meta.nlayer; +// auto nkvh = meta.nkvh / ndev; +// auto nh = meta.nh / ndev; +// auto ngroup = nh / nkvh; +// // auto dctx = meta.dctx; +// auto dh = meta.dh; +// auto d = meta.d; +// auto dt_logits = meta.dt_logits; +// auto di = meta.di / ndev; +// auto dvoc = meta.dvoc; +// auto stream = rsrc.stream; +// bool has_qkv_bias = rsrc.b_attn_qkv.size() > 0; + +// // Allocate buffers +// auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); +// auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); +// auto qkv_buf = Tensor::buffer(dt_logits, {ntok, (nh + 
nkvh * 2) * dh}, rsrc.memory_pool); +// auto gate_up_buf = Tensor::buffer(dt_logits, {ntok, 2 * di}, rsrc.memory_pool); +// auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool); +// auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool); +// auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool); +// auto result_cpu = std::vector(nreq); + +// auto qkv_rope = qkv_buf->view({ntok, nh + nkvh * 2, dh}); + +// // Prepare inputs +// auto batch_pos_ids = std::vector(ntok); +// size_t req_start = 0; +// for (uint32_t req = 0; req < nreq; req++) { +// for (uint32_t i = 0; i < req_lens[req]; i++) { +// batch_pos_ids[req_start + i] = req_pos[req] + i; +// } +// req_start += req_lens[req]; +// } + +// std::shared_ptr pos_ids_buf; +// if (rsrc.device == INFINI_DEVICE_CPU) { +// pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok}); +// } else { +// pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool); +// RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok, +// INFINIRT_MEMCPY_H2D, stream)); +// } +// for (uint32_t i = 0; i < ntok; i++) { +// RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d), +// rsrc.w_in_embd->data(tokens[i] * d), +// dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream)); +// } + +// // Attention +// // attention inner +// size_t max_qk_size = 0; +// size_t max_seq_len = 0; + +// for (uint32_t req = 0; req < nreq; req++) { +// auto past_len = req_pos[req]; +// auto seq_len = req_lens[req]; +// auto total_len = past_len + seq_len; + +// max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len)); +// max_seq_len = std::max(max_seq_len, size_t(seq_len)); +// } + +// auto qk_buf = Tensor::buffer(dt_logits, {nh, max_qk_size}, rsrc.memory_pool); +// auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); +// auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh}); +// auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); +// auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh}); + +// // MLP buffers +// auto gate_buf = gate_up_buf->slice(1, 0, di); +// auto up_buf = gate_up_buf->slice(1, di, di); + +// // Compute +// for (uint32_t layer = 0; layer < nlayer; layer++) { +// // 1. Attention +// // rms norm +// rmsnorm(logits_out, logits_in, rsrc.w_attn_norm[layer], meta.epsilon); +// // qkv_proj +// linear(qkv_buf, logits_out, rsrc.w_attn_qkv[layer], 1.0, 0.0, nullptr, has_qkv_bias ? 
rsrc.b_attn_qkv[layer] : nullptr); +// // rope +// rope(qkv_rope->slice(1, 0, nh), qkv_rope->slice(1, 0, nh), pos_ids_buf, rsrc.sin_table, rsrc.cos_table); +// rope(qkv_rope->slice(1, nh, nkvh), qkv_rope->slice(1, nh, nkvh), pos_ids_buf, rsrc.sin_table, rsrc.cos_table); + +// size_t token_offset = 0; +// for (uint32_t req = 0; req < nreq; req++) { +// auto past_len = req_pos[req]; +// auto seq_len = req_lens[req]; +// auto total_len = past_len + seq_len; +// auto o = o_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); +// auto q = qkv_rope->slice({{0, token_offset, seq_len}, {1, 0, nh}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); +// auto k = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh, nkvh}}); +// auto v = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}}); + +// // self attention +// // concat +// rearrange(kv_caches[req]->k[idev][layer]->slice(0, past_len, seq_len), k); +// rearrange(kv_caches[req]->v[idev][layer]->slice(0, past_len, seq_len), v); +// // qk +// rearrange(q_rearrange->slice(2, 0, seq_len), q); +// auto qk_gemm = qk_buf->slice(1, 0, seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len}); +// auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0}); +// linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr); +// // softmax +// auto qk_softmax = qk_buf->slice(1, 0, seq_len * total_len)->view({nh, seq_len, total_len}); +// causalSoftmax(qk_softmax, qk_softmax); +// auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2}); +// linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm, 1.f, 0.f, nullptr, nullptr); +// // rearrange attn val +// rearrange(o, attn_val_gemm->slice(2, 0, seq_len)); + +// token_offset += seq_len; +// } + +// // o_proj +// linear(logits_in, o_buf, rsrc.w_attn_out[layer], 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); // only rank 0 adds residual + +// // All_reduce if distributed +// if (rsrc.comm != nullptr) { +// RUN_INFINI(infinicclAllReduce( +// logits_in->data(), logits_in->data(), ntok * d, dt_logits, +// INFINICCL_SUM, rsrc.comm, stream)); +// RUN_INFINI(infinirtStreamSynchronize(stream)); +// } +// // 2. FFN +// rmsnorm(logits_out, logits_in, rsrc.w_ffn_norm[layer], meta.epsilon); +// linear(gate_up_buf, logits_out, rsrc.w_ffn_gate_up[layer], 1.0, 0.0, nullptr, nullptr); +// swiglu(gate_buf, up_buf, gate_buf); +// linear(logits_in, gate_buf, rsrc.w_ffn_down[layer], 1.0, 0.0, idev == 0 ? 
logits_in : nullptr, nullptr); // only rank 0 adds residual + +// // All_reduce if distributed +// if (rsrc.comm != nullptr) { +// RUN_INFINI(infinicclAllReduce( +// logits_in->data(), logits_in->data(), ntok * d, dt_logits, +// INFINICCL_SUM, rsrc.comm, stream)); +// RUN_INFINI(infinirtStreamSynchronize(stream)); +// } +// } +// // Sample and Output +// if (idev == 0) { +// if (last_logits != nullptr) { +// rmsnorm(logits_out, logits_in, rsrc.w_out_norm, meta.epsilon); +// auto last_logits_buf = Tensor::buffer(dt_logits, {ntok, dvoc}, rsrc.memory_pool); +// linear(last_logits_buf, logits_out, rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); +// RUN_INFINI(infinirtStreamSynchronize(stream)); +// RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dt_logits) * ntok * dvoc, INFINIRT_MEMCPY_D2H)); +// } +// if (output != nullptr) { +// size_t token_offset = 0; +// for (uint32_t req = 0; req < nreq; req++) { +// auto seq_len = req_lens[req]; +// token_offset += seq_len; +// rmsnorm(logits_out->slice(0, req, 1), +// logits_in->slice(0, token_offset - 1, 1), +// rsrc.w_out_norm, +// meta.epsilon); +// } +// linear(prob_buf, logits_out->slice(0, 0, nreq), rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); +// std::random_device _rd; +// std::mt19937 gen(_rd()); +// token_offset = 0; +// for (uint32_t req = 0; req < nreq; req++) { +// auto seq_len = req_lens[req]; +// float random_val = std::uniform_real_distribution(0, 1)(gen); +// randomSample(result_buf->slice(0, req, 1)->view_as({}, {}), +// prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}), +// random_val, topp[req], topk[req], temperature[req]); +// token_offset += seq_len; +// } +// RUN_INFINI(infinirtStreamSynchronize(stream)); +// RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(), +// sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H)); +// for (uint32_t req = 0; req < nreq; req++) { +// output[req] = uint32_t(result_cpu[req]); +// } +// } +// } +// } + +// __C void +// inferBatch(struct JiugeModel *model, +// const uint32_t *tokens, uint32_t ntok, +// const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, +// struct KVCache **kv_caches, +// const float *temperature, const uint32_t *topk, const float *topp, +// uint32_t *output) { +// model->req.tokens = tokens; +// model->req.ntok = ntok; +// model->req.req_lens = req_lens; +// model->req.nreq = nreq; +// model->req.req_pos = req_pos; +// model->req.kv_caches = kv_caches; +// model->req.output = output; +// model->req.logits = nullptr; +// model->req.temperature = temperature; +// model->req.topk = topk; +// model->req.topp = topp; + +// for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { +// std::unique_lock lock(model->states[idev].mtx); +// model->states[idev].proceed = true; +// lock.unlock(); +// model->states[idev].cv_start.notify_one(); +// } +// for (size_t i = model->dev_ids.size(); i > 0; i--) { +// auto idev = i - 1; +// std::unique_lock lock(model->states[idev].mtx); +// model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); +// lock.unlock(); +// } +// } + +// __C void +// forwardBatch(struct JiugeModel *model, +// const uint32_t *tokens, uint32_t ntok, +// const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, +// struct KVCache **kv_caches, +// void *logits) { +// model->req.tokens = tokens; +// model->req.ntok = ntok; +// model->req.req_lens = req_lens; +// model->req.nreq = nreq; +// model->req.req_pos = req_pos; +// model->req.kv_caches = kv_caches; +// model->req.output = nullptr; +// 
model->req.logits = logits; +// model->req.temperature = nullptr; +// model->req.topk = nullptr; +// model->req.topp = nullptr; + +// for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { +// std::unique_lock lock(model->states[idev].mtx); +// model->states[idev].proceed = true; +// lock.unlock(); +// model->states[idev].cv_start.notify_one(); +// } +// for (size_t i = model->dev_ids.size(); i > 0; i--) { +// auto idev = i - 1; +// std::unique_lock lock(model->states[idev].mtx); +// model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); +// lock.unlock(); +// } +// } + +// void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, DeviceResource *rsrc, InferState &state, InferRequest &req, +// infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) { +// CacheManager cache_manager(100); +// InferenceContext ctx(rsrc, &cache_manager, rsrc->stream); + +// // Set the inference context for this thread +// setInferenceContext(&ctx); + +// // Create Device Resource +// createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm); +// { +// std::unique_lock lock(state.mtx); +// state.loaded = true; +// lock.unlock(); +// state.cv_load.notify_one(); +// } + +// // Infer Loop +// while (true) { +// std::unique_lock lock(state.mtx); +// state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; }); +// // quit if exit_flag is set +// if (state.exit_flag) { +// break; +// } + +// inferDeviceBatch(meta, *rsrc, idev, ndev, req.tokens, req.ntok, +// req.req_lens, req.nreq, req.req_pos, req.kv_caches, +// req.temperature, req.topk, req.topp, req.output, req.logits); + +// state.proceed = false; +// lock.unlock(); +// state.cv_done.notify_one(); +// } + +// // Clean-Up +// releaseDeviceResource(*rsrc); +// setInferenceContext(nullptr); // Clear the context when done +// } + +// JiugeModel::JiugeModel(const JiugeMeta *_meta, const JiugeWeights *weights, infiniDevice_t device_, std::vector device_ids) : meta(*_meta) { +// int ndev = int(device_ids.size()); +// device = device_; +// dev_ids = device_ids; +// dev_resources = std::vector(ndev); +// states = std::vector(ndev); +// threads.resize(ndev); +// RUN_INFINI(infinirtInit()); +// auto comms = std::vector(ndev, nullptr); +// if (ndev > 1) { +// RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data())); +// } + +// for (int i = 0; i < ndev; i++) { +// threads[i] = std::thread(launchDevice, std::cref(meta), weights, &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]); +// } +// for (int i = 0; i < ndev; i++) { +// std::unique_lock lock(states[i].mtx); +// states[i].cv_load.wait(lock, [&] { return states[i].loaded; }); +// lock.unlock(); +// } +// } + +// __C struct JiugeModel * +// createJiugeModel(const JiugeMeta *meta, +// const JiugeWeights *weights, +// infiniDevice_t device, +// int ndev, +// const int *dev_ids) { +// std::vector device_ids(ndev); +// std::copy(dev_ids, dev_ids + ndev, device_ids.begin()); +// JiugeModel *model = new JiugeModel(meta, weights, device, device_ids); +// return model; +// } + +// __C void destroyJiugeModel(struct JiugeModel *model) { +// auto ndev = model->dev_resources.size(); + +// for (size_t idev = 0; idev < ndev; idev++) { +// std::unique_lock lock(model->states[idev].mtx); +// model->states[idev].exit_flag = true; +// lock.unlock(); +// model->states[idev].cv_start.notify_one(); +// } + +// for (size_t idev = 0; idev < ndev; idev++) { +// 
model->threads[idev].join(); +// } + +// delete model; +// } diff --git a/src/models/jiuge/jiuge_impl.hpp b/src/models/jiuge/jiuge_impl.hpp index be05b0e8..d8372e3d 100644 --- a/src/models/jiuge/jiuge_impl.hpp +++ b/src/models/jiuge/jiuge_impl.hpp @@ -1,71 +1,48 @@ -#ifndef JIUGE_IMPL_H -#define JIUGE_IMPL_H - -#include "infinicore_infer.h" - -#include "../../allocator.hpp" -#include "../../tensor.hpp" - -#include -#include -#include -#include -#include - -struct DeviceResource { - // Device - infiniDevice_t device; - int device_id; - infiniopHandle_t handle; - // Weights - std::shared_ptr w_in_embd, w_out_norm, w_out_embd, sin_table, - cos_table; - std::vector> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out, - w_ffn_norm, w_ffn_gate_up, w_ffn_down; - // Streams - infinirtStream_t stream; - // Communicator - infinicclComm_t comm; - - std::shared_ptr memory_pool; -}; - -struct InferState { - std::mutex mtx; - std::condition_variable cv_load, cv_start, cv_done; - bool loaded = false; - bool proceed = false; - bool exit_flag = false; -}; - -struct InferRequest { - const uint32_t *tokens; - uint32_t ntok; - const uint32_t *req_lens; - uint32_t nreq; - const uint32_t *req_pos; - struct KVCache **kv_caches; - const float *temperature; - const uint32_t *topk; - const float *topp; - uint32_t *output; - void *logits; -}; - -struct JiugeModel { - JiugeMeta meta; - infiniDevice_t device; - std::vector dev_ids; - std::vector dev_resources; - std::vector states; - std::vector threads; - InferRequest req; - - JiugeModel(const JiugeMeta *, const JiugeWeights *, infiniDevice_t device, std::vector device_ids); -}; - -struct KVCache { - std::vector>> k, v; -}; - -#endif +// #ifndef JIUGE_IMPL_H +// #define JIUGE_IMPL_H + +// #include "infinicore_infer.h" + +// #include "../common_structs.hpp" + +// #include "../../allocator.hpp" +// #include "../../tensor.hpp" + +// #include +// #include +// #include +// #include +// #include + +// struct DeviceResourcejiuge { +// // Device +// infiniDevice_t device; +// int device_id; +// infiniopHandle_t handle; +// // Weights +// std::shared_ptr w_in_embd, w_out_norm, w_out_embd, sin_table, +// cos_table; +// std::vector> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_out, +// w_ffn_norm, w_ffn_gate_up, w_ffn_down; +// // Streams +// infinirtStream_t stream; +// // Communicator +// infinicclComm_t comm; + +// std::shared_ptr memory_pool; +// }; + +// struct JiugeModel { +// JiugeMeta meta; +// infiniDevice_t device; +// std::vector dev_ids; +// std::vector dev_resources; +// std::vector states; +// std::vector threads; +// InferRequest req; + +// JiugeModel(const JiugeMeta *, const JiugeWeights *, infiniDevice_t device, std::vector device_ids); +// }; + + +// #endif diff --git a/src/models/jiuge/jiuge_kv_cache.cpp b/src/models/jiuge/jiuge_kv_cache.cpp index db10f94e..2da87438 100644 --- a/src/models/jiuge/jiuge_kv_cache.cpp +++ b/src/models/jiuge/jiuge_kv_cache.cpp @@ -1,59 +1,59 @@ -#include "jiuge_impl.hpp" +// #include "jiuge_impl.hpp" -__C struct KVCache *createKVCache(const JiugeModel *model) { - KVCache *cache = new KVCache(); - auto ndev = model->dev_resources.size(); - auto nkvh = model->meta.nkvh / ndev; - auto max_len = model->meta.dctx; - auto dh = model->meta.dh; - auto shape = std::vector{max_len, nkvh, dh}; - for (unsigned int idev = 0; idev < ndev; idev++) { - RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); - auto kcache = std::vector>(); - auto vcache = std::vector>(); - for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { 
- kcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape))); - vcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape))); - } - cache->k.push_back(kcache); - cache->v.push_back(vcache); - } +// __C struct KVCache *createKVCache(const JiugeModel *model) { +// KVCache *cache = new KVCache(); +// auto ndev = model->dev_resources.size(); +// auto nkvh = model->meta.nkvh / ndev; +// auto max_len = model->meta.dctx; +// auto dh = model->meta.dh; +// auto shape = std::vector{max_len, nkvh, dh}; +// for (unsigned int idev = 0; idev < ndev; idev++) { +// RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); +// auto kcache = std::vector>(); +// auto vcache = std::vector>(); +// for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { +// kcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape))); +// vcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape))); +// } +// cache->k.push_back(kcache); +// cache->v.push_back(vcache); +// } - return cache; -} +// return cache; +// } -__C struct KVCache *duplicateKVCache(const JiugeModel *model, - const KVCache *kv_cache, - unsigned int seq_len) { - auto new_kv_cache = createKVCache(model); - auto ndev = model->dev_resources.size(); - auto nkvh = model->meta.nkvh / ndev; - auto dh = model->meta.dh; - auto dt_size = dsize(model->meta.dt_logits); - for (unsigned int idev = 0; idev < ndev; idev++) { - RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); - for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { - RUN_INFINI(infinirtMemcpy(new_kv_cache->k[idev][layer]->data(), - kv_cache->k[idev][layer]->data(), - seq_len * nkvh * dh * dt_size, - INFINIRT_MEMCPY_D2D)); - RUN_INFINI(infinirtMemcpy(new_kv_cache->v[idev][layer]->data(), - kv_cache->v[idev][layer]->data(), - seq_len * nkvh * dh * dt_size, - INFINIRT_MEMCPY_D2D)); - } - } - return new_kv_cache; -} +// __C struct KVCache *duplicateKVCache(const JiugeModel *model, +// const KVCache *kv_cache, +// unsigned int seq_len) { +// auto new_kv_cache = createKVCache(model); +// auto ndev = model->dev_resources.size(); +// auto nkvh = model->meta.nkvh / ndev; +// auto dh = model->meta.dh; +// auto dt_size = dsize(model->meta.dt_logits); +// for (unsigned int idev = 0; idev < ndev; idev++) { +// RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); +// for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { +// RUN_INFINI(infinirtMemcpy(new_kv_cache->k[idev][layer]->data(), +// kv_cache->k[idev][layer]->data(), +// seq_len * nkvh * dh * dt_size, +// INFINIRT_MEMCPY_D2D)); +// RUN_INFINI(infinirtMemcpy(new_kv_cache->v[idev][layer]->data(), +// kv_cache->v[idev][layer]->data(), +// seq_len * nkvh * dh * dt_size, +// INFINIRT_MEMCPY_D2D)); +// } +// } +// return new_kv_cache; +// } -__C void dropKVCache(JiugeModel const *model, KVCache *kv_cache) { - auto ndev = model->dev_resources.size(); - for (unsigned int idev = 0; idev < ndev; idev++) { - RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); - for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { - kv_cache->k[idev][layer].reset(); - kv_cache->v[idev][layer].reset(); - } - } - delete kv_cache; -} +// __C void dropKVCache(JiugeModel const *model, KVCache *kv_cache) { +// auto ndev = model->dev_resources.size(); +// for (unsigned int idev = 0; idev < ndev; idev++) { +// RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); +// for (unsigned int layer = 0; layer < model->meta.nlayer; 
layer++) { +// kv_cache->k[idev][layer].reset(); +// kv_cache->v[idev][layer].reset(); +// } +// } +// delete kv_cache; +// } diff --git a/src/models/jiuge/jiuge_weight.hpp b/src/models/jiuge/jiuge_weight.hpp index 6e8bc33e..3ba14938 100644 --- a/src/models/jiuge/jiuge_weight.hpp +++ b/src/models/jiuge/jiuge_weight.hpp @@ -1,188 +1,188 @@ -#ifndef JIUGE_WEIGHT_HPP -#define JIUGE_WEIGHT_HPP - -#include "jiuge_impl.hpp" - -#include -inline std::shared_ptr getInEmbd( - JiugeMeta const *meta, - JiugeWeights const *w) { - auto shape = std::vector({meta->dvoc, meta->d}); - return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape); -} - -inline std::shared_ptr getOutNorm( - JiugeMeta const *meta, - JiugeWeights const *w) { - auto shape = std::vector({meta->d}); - return Tensor::weight((char *)w->output_norm, w->dt_norm, shape); -} - -inline std::shared_ptr getOutEmbd( - JiugeMeta const *meta, - JiugeWeights const *w) { - if (w->transpose_linear_weights != 0) { - auto shape = std::vector({meta->dvoc, meta->d}); - return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape) - ->permute({1, 0}); - } else { - auto shape = std::vector({meta->d, meta->dvoc}); - return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape); - } -} - -inline std::shared_ptr getAttnNorm( - JiugeMeta const *meta, - JiugeWeights const *w, - size_t layer) { - auto shape = std::vector({meta->d}); - return Tensor::weight((char *)(w->attn_norm[layer]), w->dt_norm, shape); -} - -inline std::shared_ptr getAttnQKV( - JiugeMeta const *meta, - JiugeWeights const *w, - size_t layer, size_t idev, size_t ndev) { - auto nkvh = meta->nkvh; - auto nh = meta->nh; - auto dh = meta->dh; - auto d = meta->d; - size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * d * dsize(w->dt_mat); - if (w->transpose_linear_weights != 0) { - auto shape = std::vector({(nh + 2 * nkvh) / ndev * dh, d}); - return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape) - ->permute({1, 0}); - } else { - auto shape = std::vector({d, (nh + 2 * nkvh) / ndev * dh}); - return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape); - } -} - -inline std::shared_ptr getAttnQKVBias( - JiugeMeta const *meta, - JiugeWeights const *w, - size_t layer, size_t idev, size_t ndev) { - auto nkvh = meta->nkvh; - auto nh = meta->nh; - auto dh = meta->dh; - size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * dsize(w->dt_mat); - auto shape = std::vector({(nh + 2 * nkvh) / ndev * dh}); - return Tensor::weight((char *)(w->attn_qkv_b[layer]) + offset, w->dt_mat, shape); -} - -inline std::shared_ptr getAttnO(JiugeMeta const *meta, - JiugeWeights const *w, size_t layer, - size_t idev, size_t ndev) { - auto nh = meta->nh; - auto dh = meta->dh; - auto d = meta->d; - size_t offset = idev * d * (nh / ndev * dh) * dsize(w->dt_mat); - if (w->transpose_linear_weights != 0) { - auto shape = std::vector({d, nh / ndev * dh}); - return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape) - ->permute({1, 0}); - } else { - auto shape = std::vector({nh / ndev * dh, d}); - return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape); - } -} - -inline std::shared_ptr getFFNNorm( - JiugeMeta const *meta, - JiugeWeights const *w, - size_t layer) { - auto shape = std::vector({meta->d}); - return Tensor::weight((char *)(w->ffn_norm[layer]), w->dt_norm, shape); -} - -inline std::shared_ptr getFFNGateUp( - JiugeMeta const *meta, - JiugeWeights const *w, - size_t layer, size_t idev, size_t ndev) { - auto di = 
meta->di; - auto d = meta->d; - size_t offset = idev * (2 * di / ndev) * d * dsize(w->dt_mat); - if (w->transpose_linear_weights != 0) { - auto shape = std::vector({2 * di / ndev, d}); - return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset, - w->dt_mat, shape) - ->permute({1, 0}); - } else { - auto shape = std::vector({d, 2 * di / ndev}); - return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset, - w->dt_mat, shape); - } -} - -inline std::shared_ptr getFFNDown( - JiugeMeta const *meta, - JiugeWeights const *w, - size_t layer, size_t idev, size_t ndev) { - auto di = meta->di; - auto d = meta->d; - size_t offset = idev * d * (di / ndev) * dsize(w->dt_mat); - if (w->transpose_linear_weights != 0) { - auto shape = std::vector({d, di / ndev}); - return Tensor::weight((char *)(w->ffn_down[layer]) + offset, w->dt_mat, shape) - ->permute({1, 0}); - } else { - auto shape = std::vector({di / ndev, d}); - return Tensor::weight((char *)(w->ffn_down[layer]) + offset, w->dt_mat, shape); - } -} - -inline std::shared_ptr getSinTable(JiugeMeta const *meta) { - auto half_dh = meta->dh / 2; - auto unit = dsize(meta->dt_logits); - void *table = std::malloc(meta->dctx * half_dh * unit); - - for (size_t i = 0; i < meta->dctx; i++) { - for (size_t j = 0; j < half_dh; j++) { - float _sin = std::sin( - static_cast(i) / std::pow(meta->theta, static_cast(j) / half_dh)); - if (meta->dt_logits == INFINI_DTYPE_F16) { - ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin); - } else if (meta->dt_logits == INFINI_DTYPE_BF16) { - ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_sin); - } else if (meta->dt_logits == INFINI_DTYPE_F32) { - ((float *)table)[i * half_dh + j] = _sin; - } else { - std::cout << "unsupported data type" << std::endl; - exit(1); - } - } - } - auto shape = std::vector({meta->dctx, half_dh}); - auto tensor = Tensor::weight(table, meta->dt_logits, shape); - std::free(table); - return tensor; -} - -inline std::shared_ptr getCosTable(JiugeMeta const *meta) { - auto half_dh = meta->dh / 2; - auto unit = dsize(meta->dt_logits); - void *table = std::malloc(meta->dctx * half_dh * unit); - - for (size_t i = 0; i < meta->dctx; i++) { - for (size_t j = 0; j < half_dh; j++) { - float _cos = std::cos( - static_cast(i) / std::pow(meta->theta, static_cast(j) / half_dh)); - if (meta->dt_logits == INFINI_DTYPE_F16) { - ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos); - } else if (meta->dt_logits == INFINI_DTYPE_BF16) { - ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_cos); - } else if (meta->dt_logits == INFINI_DTYPE_F32) { - ((float *)table)[i * half_dh + j] = _cos; - } else { - std::cout << "unsupported data type" << std::endl; - exit(1); - } - } - } - auto shape = std::vector({meta->dctx, half_dh}); - auto tensor = Tensor::weight(table, meta->dt_logits, shape); - std::free(table); - return tensor; -} - -#endif +// #ifndef JIUGE_WEIGHT_HPP +// #define JIUGE_WEIGHT_HPP + +// #include "jiuge_impl.hpp" + +// #include +// inline std::shared_ptr getInEmbd( +// JiugeMeta const *meta, +// JiugeWeights const *w) { +// auto shape = std::vector({meta->dvoc, meta->d}); +// return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape); +// } + +// inline std::shared_ptr getOutNorm( +// JiugeMeta const *meta, +// JiugeWeights const *w) { +// auto shape = std::vector({meta->d}); +// return Tensor::weight((char *)w->output_norm, w->dt_norm, shape); +// } + +// inline std::shared_ptr getOutEmbd( +// JiugeMeta const *meta, +// JiugeWeights const *w) { +// if (w->transpose_linear_weights != 
0) { +// auto shape = std::vector({meta->dvoc, meta->d}); +// return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape) +// ->permute({1, 0}); +// } else { +// auto shape = std::vector({meta->d, meta->dvoc}); +// return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape); +// } +// } + +// inline std::shared_ptr getAttnNorm( +// JiugeMeta const *meta, +// JiugeWeights const *w, +// size_t layer) { +// auto shape = std::vector({meta->d}); +// return Tensor::weight((char *)(w->attn_norm[layer]), w->dt_norm, shape); +// } + +// inline std::shared_ptr getAttnQKV( +// JiugeMeta const *meta, +// JiugeWeights const *w, +// size_t layer, size_t idev, size_t ndev) { +// auto nkvh = meta->nkvh; +// auto nh = meta->nh; +// auto dh = meta->dh; +// auto d = meta->d; +// size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * d * dsize(w->dt_mat); +// if (w->transpose_linear_weights != 0) { +// auto shape = std::vector({(nh + 2 * nkvh) / ndev * dh, d}); +// return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape) +// ->permute({1, 0}); +// } else { +// auto shape = std::vector({d, (nh + 2 * nkvh) / ndev * dh}); +// return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape); +// } +// } + +// inline std::shared_ptr getAttnQKVBias( +// JiugeMeta const *meta, +// JiugeWeights const *w, +// size_t layer, size_t idev, size_t ndev) { +// auto nkvh = meta->nkvh; +// auto nh = meta->nh; +// auto dh = meta->dh; +// size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * dsize(w->dt_mat); +// auto shape = std::vector({(nh + 2 * nkvh) / ndev * dh}); +// return Tensor::weight((char *)(w->attn_qkv_b[layer]) + offset, w->dt_mat, shape); +// } + +// inline std::shared_ptr getAttnO(JiugeMeta const *meta, +// JiugeWeights const *w, size_t layer, +// size_t idev, size_t ndev) { +// auto nh = meta->nh; +// auto dh = meta->dh; +// auto d = meta->d; +// size_t offset = idev * d * (nh / ndev * dh) * dsize(w->dt_mat); +// if (w->transpose_linear_weights != 0) { +// auto shape = std::vector({d, nh / ndev * dh}); +// return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape) +// ->permute({1, 0}); +// } else { +// auto shape = std::vector({nh / ndev * dh, d}); +// return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape); +// } +// } + +// inline std::shared_ptr getFFNNorm( +// JiugeMeta const *meta, +// JiugeWeights const *w, +// size_t layer) { +// auto shape = std::vector({meta->d}); +// return Tensor::weight((char *)(w->ffn_norm[layer]), w->dt_norm, shape); +// } + +// inline std::shared_ptr getFFNGateUp( +// JiugeMeta const *meta, +// JiugeWeights const *w, +// size_t layer, size_t idev, size_t ndev) { +// auto di = meta->di; +// auto d = meta->d; +// size_t offset = idev * (2 * di / ndev) * d * dsize(w->dt_mat); +// if (w->transpose_linear_weights != 0) { +// auto shape = std::vector({2 * di / ndev, d}); +// return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset, +// w->dt_mat, shape) +// ->permute({1, 0}); +// } else { +// auto shape = std::vector({d, 2 * di / ndev}); +// return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset, +// w->dt_mat, shape); +// } +// } + +// inline std::shared_ptr getFFNDown( +// JiugeMeta const *meta, +// JiugeWeights const *w, +// size_t layer, size_t idev, size_t ndev) { +// auto di = meta->di; +// auto d = meta->d; +// size_t offset = idev * d * (di / ndev) * dsize(w->dt_mat); +// if (w->transpose_linear_weights != 0) { +// auto shape = std::vector({d, di / ndev}); +// return 
Tensor::weight((char *)(w->ffn_down[layer]) + offset, w->dt_mat, shape) +// ->permute({1, 0}); +// } else { +// auto shape = std::vector({di / ndev, d}); +// return Tensor::weight((char *)(w->ffn_down[layer]) + offset, w->dt_mat, shape); +// } +// } + +// inline std::shared_ptr getSinTable(JiugeMeta const *meta) { +// auto half_dh = meta->dh / 2; +// auto unit = dsize(meta->dt_logits); +// void *table = std::malloc(meta->dctx * half_dh * unit); + +// for (size_t i = 0; i < meta->dctx; i++) { +// for (size_t j = 0; j < half_dh; j++) { +// float _sin = std::sin( +// static_cast(i) / std::pow(meta->theta, static_cast(j) / half_dh)); +// if (meta->dt_logits == INFINI_DTYPE_F16) { +// ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin); +// } else if (meta->dt_logits == INFINI_DTYPE_BF16) { +// ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_sin); +// } else if (meta->dt_logits == INFINI_DTYPE_F32) { +// ((float *)table)[i * half_dh + j] = _sin; +// } else { +// std::cout << "unsupported data type" << std::endl; +// exit(1); +// } +// } +// } +// auto shape = std::vector({meta->dctx, half_dh}); +// auto tensor = Tensor::weight(table, meta->dt_logits, shape); +// std::free(table); +// return tensor; +// } + +// inline std::shared_ptr getCosTable(JiugeMeta const *meta) { +// auto half_dh = meta->dh / 2; +// auto unit = dsize(meta->dt_logits); +// void *table = std::malloc(meta->dctx * half_dh * unit); + +// for (size_t i = 0; i < meta->dctx; i++) { +// for (size_t j = 0; j < half_dh; j++) { +// float _cos = std::cos( +// static_cast(i) / std::pow(meta->theta, static_cast(j) / half_dh)); +// if (meta->dt_logits == INFINI_DTYPE_F16) { +// ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos); +// } else if (meta->dt_logits == INFINI_DTYPE_BF16) { +// ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_cos); +// } else if (meta->dt_logits == INFINI_DTYPE_F32) { +// ((float *)table)[i * half_dh + j] = _cos; +// } else { +// std::cout << "unsupported data type" << std::endl; +// exit(1); +// } +// } +// } +// auto shape = std::vector({meta->dctx, half_dh}); +// auto tensor = Tensor::weight(table, meta->dt_logits, shape); +// std::free(table); +// return tensor; +// } + +// #endif diff --git a/src/models/qwen/qwen.cpp b/src/models/qwen/qwen.cpp new file mode 100644 index 00000000..287e476b --- /dev/null +++ b/src/models/qwen/qwen.cpp @@ -0,0 +1,444 @@ +#include "qwen_impl.hpp" +#include "qwen_weight.hpp" + +#include "../../tensor.hpp" +#include "../../utils.hpp" +#include "../inference_context.hpp" +#include "infinicore_infer.h" + +#include +#include +#include +void createDeviceResource(DeviceResource *rsrc, const QwenMeta *meta, + const QwenWeights *weights, + infiniDevice_t device, int idev, + int ndev, int dev_id, + infinicclComm_t comm) { + RUN_INFINI(infinirtSetDevice(device, dev_id)); + infiniopHandle_t handle; + infiniopCreateHandle(&handle); + infinirtStream_t stream; + infinirtStreamCreate(&stream); + + std::vector> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_q_norm, w_attn_k_norm, w_attn_out, + w_ffn_norm, w_ffn_gate_up, w_ffn_down; + for (size_t layer = 0; layer < meta->nlayer; layer++) { + w_attn_norm.push_back( + qwen::getAttnNorm(meta, weights, layer)); + w_attn_qkv.push_back( + qwen::getAttnQKV(meta, weights, layer, idev, ndev)); + if (weights->attn_qkv_b != nullptr) { + b_attn_qkv.push_back( + qwen::getAttnQKVBias(meta, weights, layer, idev, ndev)); + } + if (weights->attn_q_norm != nullptr) { + w_attn_q_norm.push_back( + qwen::getAttnQNorm(meta, weights, layer)); + 
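+            // Optional QK-Norm weights: a per-head RMSNorm (shape {dh}) applied to the
+            // Q and K projections before RoPE. They are only loaded when the checkpoint
+            // actually provides attn_q_norm / attn_k_norm (e.g. Qwen3-style models);
+            // checkpoints without QK-Norm leave these pointers null and the inference
+            // path skips the extra rmsnorm calls.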
w_attn_k_norm.push_back( + qwen::getAttnKNorm(meta, weights, layer)); + } + w_attn_out.push_back( + qwen::getAttnO(meta, weights, layer, idev, ndev)); + w_ffn_norm.push_back( + qwen::getFFNNorm(meta, weights, layer)); + w_ffn_gate_up.push_back( + qwen::getFFNGateUp(meta, weights, layer, idev, ndev)); + w_ffn_down.push_back( + qwen::getFFNDown(meta, weights, layer, idev, ndev)); + } + + auto memory_pool = std::make_shared(128 * 1024 * 1024); + + *rsrc = DeviceResource{ + device, + dev_id, + handle, + qwen::getInEmbd(meta, weights), + qwen::getOutNorm(meta, weights), + qwen::getOutEmbd(meta, weights), + qwen::getSinTable(meta), + qwen::getCosTable(meta), + w_attn_norm, + w_attn_qkv, + b_attn_qkv, + w_attn_q_norm, + w_attn_k_norm, + w_attn_out, + w_ffn_norm, + w_ffn_gate_up, + w_ffn_down, + stream, + comm, + memory_pool, + }; + RUN_INFINI(infinirtDeviceSynchronize()); +} + +void releaseDeviceResource(DeviceResource &rsrc) { + rsrc.w_in_embd.reset(); + rsrc.w_out_norm.reset(); + rsrc.w_out_embd.reset(); + rsrc.sin_table.reset(); + rsrc.cos_table.reset(); + rsrc.w_attn_norm.clear(); + rsrc.w_attn_qkv.clear(); + rsrc.b_attn_qkv.clear(); + rsrc.w_attn_q_norm.clear(); + rsrc.w_attn_k_norm.clear(); + rsrc.w_attn_out.clear(); + rsrc.w_ffn_norm.clear(); + rsrc.w_ffn_gate_up.clear(); + rsrc.w_ffn_down.clear(); + RUN_INFINI(infinirtStreamDestroy(rsrc.stream)); + RUN_INFINI(infiniopDestroyHandle(rsrc.handle)); +} + +void inferDeviceBatch(const QwenMeta &meta, DeviceResource &rsrc, + int idev, int ndev, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct KVCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output, void *last_logits) { + auto nlayer = meta.nlayer; + auto nkvh = meta.nkvh / ndev; + auto nh = meta.nh / ndev; + auto ngroup = nh / nkvh; + // auto dctx = meta.dctx; + auto dh = meta.dh; + auto d = meta.d; + auto dt_logits = meta.dt_logits; + auto di = meta.di / ndev; + auto dvoc = meta.dvoc; + auto stream = rsrc.stream; + bool has_qkv_bias = rsrc.b_attn_qkv.size() > 0; + bool has_qk_norm = rsrc.w_attn_q_norm.size() > 0 && rsrc.w_attn_k_norm.size() > 0; + + // Allocate buffers + auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); + auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); + auto qkv_buf = Tensor::buffer(dt_logits, {ntok, (nh + nkvh * 2) * dh}, rsrc.memory_pool); + auto gate_up_buf = Tensor::buffer(dt_logits, {ntok, 2 * di}, rsrc.memory_pool); + auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool); + auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool); + auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool); + auto result_cpu = std::vector(nreq); + + auto qkv_buf_view = qkv_buf->view({ntok, nh + nkvh * 2, dh}); + auto q_buf = qkv_buf_view->slice(1, 0, nh); + auto k_buf = qkv_buf_view->slice(1, nh, nkvh); + + // Prepare inputs + auto batch_pos_ids = std::vector(ntok); + size_t req_start = 0; + for (uint32_t req = 0; req < nreq; req++) { + for (uint32_t i = 0; i < req_lens[req]; i++) { + batch_pos_ids[req_start + i] = req_pos[req] + i; + } + req_start += req_lens[req]; + } + + std::shared_ptr pos_ids_buf; + if (rsrc.device == INFINI_DEVICE_CPU) { + pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok}); + } else { + pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool); + RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), 
batch_pos_ids.data(), sizeof(uint32_t) * ntok, + INFINIRT_MEMCPY_H2D, stream)); + } + for (uint32_t i = 0; i < ntok; i++) { + RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d), + rsrc.w_in_embd->data(tokens[i] * d), + dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream)); + } + + // Attention + // attention inner + size_t max_qk_size = 0; + size_t max_seq_len = 0; + + for (uint32_t req = 0; req < nreq; req++) { + auto past_len = req_pos[req]; + auto seq_len = req_lens[req]; + auto total_len = past_len + seq_len; + + max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len)); + max_seq_len = std::max(max_seq_len, size_t(seq_len)); + } + + auto qk_buf = Tensor::buffer(dt_logits, {nh, max_qk_size}, rsrc.memory_pool); + auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); + auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh}); + auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); + auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh}); + + // MLP buffers + auto gate_buf = gate_up_buf->slice(1, 0, di); + auto up_buf = gate_up_buf->slice(1, di, di); + + // Compute + for (uint32_t layer = 0; layer < nlayer; layer++) { + // 1. Attention + // rms norm + rmsnorm(logits_out, logits_in, rsrc.w_attn_norm[layer], meta.epsilon); + // qkv_proj + linear(qkv_buf, logits_out, rsrc.w_attn_qkv[layer], 1.0, 0.0, nullptr, has_qkv_bias ? rsrc.b_attn_qkv[layer] : nullptr); + if (has_qk_norm) { + rmsnorm(q_buf, q_buf, rsrc.w_attn_q_norm[layer], meta.epsilon); + rmsnorm(k_buf, k_buf, rsrc.w_attn_k_norm[layer], meta.epsilon); + } + // rope + rope(q_buf, q_buf, pos_ids_buf, rsrc.sin_table, rsrc.cos_table); + rope(k_buf, k_buf, pos_ids_buf, rsrc.sin_table, rsrc.cos_table); + + size_t token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto past_len = req_pos[req]; + auto seq_len = req_lens[req]; + auto total_len = past_len + seq_len; + auto o = o_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); + auto q = qkv_buf_view->slice({{0, token_offset, seq_len}, {1, 0, nh}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); + auto k = qkv_buf_view->slice({{0, token_offset, seq_len}, {1, nh, nkvh}}); + auto v = qkv_buf_view->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}}); + + // self attention + // concat + rearrange(kv_caches[req]->k[idev][layer]->slice(0, past_len, seq_len), k); + rearrange(kv_caches[req]->v[idev][layer]->slice(0, past_len, seq_len), v); + // qk + rearrange(q_rearrange->slice(2, 0, seq_len), q); + auto qk_gemm = qk_buf->slice(1, 0, seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len}); + auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0}); + linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr); + // softmax + auto qk_softmax = qk_buf->slice(1, 0, seq_len * total_len)->view({nh, seq_len, total_len}); + causalSoftmax(qk_softmax, qk_softmax); + auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2}); + linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm, 1.f, 0.f, nullptr, nullptr); + // rearrange attn val + rearrange(o, attn_val_gemm->slice(2, 0, seq_len)); + + token_offset += seq_len; + } + + // o_proj + linear(logits_in, o_buf, rsrc.w_attn_out[layer], 1.0, 0.0, idev == 0 ? 
logits_in : nullptr, nullptr); // only rank 0 adds residual + + // All_reduce if distributed + if (rsrc.comm != nullptr) { + RUN_INFINI(infinicclAllReduce( + logits_in->data(), logits_in->data(), ntok * d, dt_logits, + INFINICCL_SUM, rsrc.comm, stream)); + RUN_INFINI(infinirtStreamSynchronize(stream)); + } + // 2. FFN + rmsnorm(logits_out, logits_in, rsrc.w_ffn_norm[layer], meta.epsilon); + linear(gate_up_buf, logits_out, rsrc.w_ffn_gate_up[layer], 1.0, 0.0, nullptr, nullptr); + swiglu(gate_buf, up_buf, gate_buf); + linear(logits_in, gate_buf, rsrc.w_ffn_down[layer], 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); // only rank 0 adds residual + + // All_reduce if distributed + if (rsrc.comm != nullptr) { + RUN_INFINI(infinicclAllReduce( + logits_in->data(), logits_in->data(), ntok * d, dt_logits, + INFINICCL_SUM, rsrc.comm, stream)); + RUN_INFINI(infinirtStreamSynchronize(stream)); + } + } + // Sample and Output + if (idev == 0) { + if (last_logits != nullptr) { + rmsnorm(logits_out, logits_in, rsrc.w_out_norm, meta.epsilon); + auto last_logits_buf = Tensor::buffer(dt_logits, {ntok, dvoc}, rsrc.memory_pool); + linear(last_logits_buf, logits_out, rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dt_logits) * ntok * dvoc, INFINIRT_MEMCPY_D2H)); + } + if (output != nullptr) { + size_t token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + token_offset += seq_len; + rmsnorm(logits_out->slice(0, req, 1), + logits_in->slice(0, token_offset - 1, 1), + rsrc.w_out_norm, + meta.epsilon); + } + linear(prob_buf, logits_out->slice(0, 0, nreq), rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); + std::random_device _rd; + std::mt19937 gen(_rd()); + token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + float random_val = std::uniform_real_distribution(0, 1)(gen); + randomSample(result_buf->slice(0, req, 1)->view_as({}, {}), + prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}), + random_val, topp[req], topk[req], temperature[req]); + token_offset += seq_len; + } + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(), + sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H)); + for (uint32_t req = 0; req < nreq; req++) { + output[req] = uint32_t(result_cpu[req]); + } + } + } +} + +__C void +inferBatch(struct QwenModel *model, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct KVCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.kv_caches = kv_caches; + model->req.output = output; + model->req.logits = nullptr; + model->req.temperature = temperature; + model->req.topk = topk; + model->req.topp = topp; + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); + lock.unlock(); + } +} + +__C void 
+forwardBatch(struct QwenModel *model, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct KVCache **kv_caches, + void *logits) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.kv_caches = kv_caches; + model->req.output = nullptr; + model->req.logits = logits; + model->req.temperature = nullptr; + model->req.topk = nullptr; + model->req.topp = nullptr; + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); + lock.unlock(); + } +} + +void launchDevice(const QwenMeta &meta, const QwenWeights *weights, DeviceResource *rsrc, InferState &state, InferRequest &req, + infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) { + CacheManager cache_manager(100); + InferenceContext ctx(rsrc, &cache_manager, rsrc->stream); + + // Set the inference context for this thread + setInferenceContext(&ctx); + + // Create Device Resource + createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm); + { + std::unique_lock lock(state.mtx); + state.loaded = true; + lock.unlock(); + state.cv_load.notify_one(); + } + + // Infer Loop + while (true) { + std::unique_lock lock(state.mtx); + state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; }); + // quit if exit_flag is set + if (state.exit_flag) { + break; + } + + inferDeviceBatch(meta, *rsrc, idev, ndev, req.tokens, req.ntok, + req.req_lens, req.nreq, req.req_pos, req.kv_caches, + req.temperature, req.topk, req.topp, req.output, req.logits); + + state.proceed = false; + lock.unlock(); + state.cv_done.notify_one(); + } + + // Clean-Up + releaseDeviceResource(*rsrc); + setInferenceContext(nullptr); // Clear the context when done +} + +QwenModel::QwenModel(const QwenMeta *_meta, const QwenWeights *weights, infiniDevice_t device_, std::vector device_ids) : meta(*_meta) { + int ndev = int(device_ids.size()); + device = device_; + dev_ids = device_ids; + dev_resources = std::vector(ndev); + states = std::vector(ndev); + threads.resize(ndev); + RUN_INFINI(infinirtInit()); + auto comms = std::vector(ndev, nullptr); + if (ndev > 1) { + RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data())); + } + + for (int i = 0; i < ndev; i++) { + threads[i] = std::thread(launchDevice, std::cref(meta), weights, &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]); + } + for (int i = 0; i < ndev; i++) { + std::unique_lock lock(states[i].mtx); + states[i].cv_load.wait(lock, [&] { return states[i].loaded; }); + lock.unlock(); + } +} + +__C struct QwenModel * +createQwenModel(const QwenMeta *meta, + const QwenWeights *weights, + infiniDevice_t device, + int ndev, + const int *dev_ids) { + std::vector device_ids(ndev); + std::copy(dev_ids, dev_ids + ndev, device_ids.begin()); + QwenModel *model = new QwenModel(meta, weights, device, device_ids); + return model; +} + +__C void destroyQwenModel(struct QwenModel *model) { + auto ndev = model->dev_resources.size(); + + for (size_t idev = 0; idev < ndev; idev++) { + 
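+        // Shutdown handshake: set exit_flag under this worker's mutex and wake it
+        // through cv_start; launchDevice() observes exit_flag, leaves its infer loop
+        // and releases its device resources, after which the second loop below joins
+        // the thread. Without this handshake the workers would stay blocked in
+        // cv_start.wait().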
std::unique_lock lock(model->states[idev].mtx); + model->states[idev].exit_flag = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + + for (size_t idev = 0; idev < ndev; idev++) { + model->threads[idev].join(); + } + + delete model; +} diff --git a/src/models/qwen/qwen_impl.hpp b/src/models/qwen/qwen_impl.hpp new file mode 100644 index 00000000..28910ef5 --- /dev/null +++ b/src/models/qwen/qwen_impl.hpp @@ -0,0 +1,48 @@ +#ifndef QWEN_IMPL_H +#define QWEN_IMPL_H + +#include "infinicore_infer.h" + +#include "../common_structs.hpp" + +#include "../../allocator.hpp" +#include "../../tensor.hpp" + +#include +#include +#include +#include +#include + +struct DeviceResource { + // Device + infiniDevice_t device; + int device_id; + infiniopHandle_t handle; + // Weights + std::shared_ptr w_in_embd, w_out_norm, w_out_embd, sin_table, + cos_table; + std::vector> w_attn_norm, w_attn_qkv, b_attn_qkv, w_attn_q_norm, w_attn_k_norm, w_attn_out, + w_ffn_norm, w_ffn_gate_up, w_ffn_down; + // Streams + infinirtStream_t stream; + // Communicator + infinicclComm_t comm; + + std::shared_ptr memory_pool; +}; + +struct QwenModel { + QwenMeta meta; + infiniDevice_t device; + std::vector dev_ids; + std::vector dev_resources; + std::vector states; + std::vector threads; + InferRequest req; + + QwenModel(const QwenMeta *, const QwenWeights *, infiniDevice_t device, std::vector device_ids); +}; + + +#endif \ No newline at end of file diff --git a/src/models/qwen/qwen_kv_cache.cpp b/src/models/qwen/qwen_kv_cache.cpp new file mode 100644 index 00000000..dc6622df --- /dev/null +++ b/src/models/qwen/qwen_kv_cache.cpp @@ -0,0 +1,59 @@ +#include "qwen_impl.hpp" + +__C struct KVCache *createKVCache(const QwenModel *model) { + KVCache *cache = new KVCache(); + auto ndev = model->dev_resources.size(); + auto nkvh = model->meta.nkvh / ndev; + auto max_len = model->meta.dctx; + auto dh = model->meta.dh; + auto shape = std::vector{max_len, nkvh, dh}; + for (unsigned int idev = 0; idev < ndev; idev++) { + RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); + auto kcache = std::vector>(); + auto vcache = std::vector>(); + for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { + kcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape))); + vcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape))); + } + cache->k.push_back(kcache); + cache->v.push_back(vcache); + } + + return cache; +} + +__C struct KVCache *duplicateKVCache(const QwenModel *model, + const KVCache *kv_cache, + unsigned int seq_len) { + auto new_kv_cache = createKVCache(model); + auto ndev = model->dev_resources.size(); + auto nkvh = model->meta.nkvh / ndev; + auto dh = model->meta.dh; + auto dt_size = dsize(model->meta.dt_logits); + for (unsigned int idev = 0; idev < ndev; idev++) { + RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev])); + for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { + RUN_INFINI(infinirtMemcpy(new_kv_cache->k[idev][layer]->data(), + kv_cache->k[idev][layer]->data(), + seq_len * nkvh * dh * dt_size, + INFINIRT_MEMCPY_D2D)); + RUN_INFINI(infinirtMemcpy(new_kv_cache->v[idev][layer]->data(), + kv_cache->v[idev][layer]->data(), + seq_len * nkvh * dh * dt_size, + INFINIRT_MEMCPY_D2D)); + } + } + return new_kv_cache; +} + +__C void dropKVCache(QwenModel const *model, KVCache *kv_cache) { + auto ndev = model->dev_resources.size(); + for (unsigned int idev = 0; idev < ndev; idev++) { + RUN_INFINI(infinirtSetDevice(model->device, 
model->dev_ids[idev])); + for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) { + kv_cache->k[idev][layer].reset(); + kv_cache->v[idev][layer].reset(); + } + } + delete kv_cache; +} \ No newline at end of file diff --git a/src/models/qwen/qwen_weight.cpp b/src/models/qwen/qwen_weight.cpp new file mode 100644 index 00000000..de2183d5 --- /dev/null +++ b/src/models/qwen/qwen_weight.cpp @@ -0,0 +1,174 @@ +#include "qwen_weight.hpp" +#include +#include +#include + +// 所有函数的实现(定义)都放在这个 .cpp 文件中 +// 并被包裹在 qwen 命名空间内 +namespace qwen { + +// 之前缺失的 getInEmbd 的定义 +std::shared_ptr getInEmbd(const QwenMeta *meta, const QwenWeights *w) { + if (w->transpose_linear_weights != 0) { + auto shape = std::vector({meta->dvoc, meta->d}); + return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape) + ->permute({1, 0}); + } else { + auto shape = std::vector({meta->d, meta->dvoc}); + return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape); + } +} + +std::shared_ptr getOutNorm(QwenMeta const *meta, QwenWeights const *w) { + auto shape = std::vector({meta->d}); + return Tensor::weight((char *)w->output_norm, w->dt_norm, shape); +} + +std::shared_ptr getOutEmbd(QwenMeta const *meta, QwenWeights const *w) { + if (w->transpose_linear_weights != 0) { + auto shape = std::vector({meta->dvoc, meta->d}); + return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape) + ->permute({1, 0}); + } else { + auto shape = std::vector({meta->d, meta->dvoc}); + return Tensor::weight((char *)w->output_embd, meta->dt_logits, shape); + } +} + +std::shared_ptr getAttnNorm(QwenMeta const *meta, QwenWeights const *w, size_t layer) { + auto shape = std::vector({meta->d}); + return Tensor::weight((char *)(w->attn_norm[layer]), w->dt_norm, shape); +} + +std::shared_ptr getAttnQKV(QwenMeta const *meta, QwenWeights const *w, size_t layer, int idev, int ndev) { + auto nkvh = meta->nkvh; + auto nh = meta->nh; + auto dh = meta->dh; + auto d = meta->d; + size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * d * dsize(w->dt_mat); + if (w->transpose_linear_weights != 0) { + auto shape = std::vector({(nh + 2 * nkvh) / ndev * dh, d}); + return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape) + ->permute({1, 0}); + } else { + auto shape = std::vector({d, (nh + 2 * nkvh) / ndev * dh}); + return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape); + } +} + +std::shared_ptr getAttnQKVBias(QwenMeta const *meta, QwenWeights const *w, size_t layer, int idev, int ndev) { + auto nkvh = meta->nkvh; + auto nh = meta->nh; + auto dh = meta->dh; + size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * dsize(w->dt_mat); + auto shape = std::vector({(nh + 2 * nkvh) / ndev * dh}); + return Tensor::weight((char *)(w->attn_qkv_b[layer]) + offset, w->dt_mat, shape); +} + +std::shared_ptr getAttnQNorm(QwenMeta const *meta, QwenWeights const *w, size_t layer) { + auto shape = std::vector({meta->dh}); + return Tensor::weight((char *)(w->attn_q_norm[layer]), w->dt_norm, shape); +} + +std::shared_ptr getAttnKNorm(QwenMeta const *meta, QwenWeights const *w, size_t layer) { + auto shape = std::vector({meta->dh}); + return Tensor::weight((char *)(w->attn_k_norm[layer]), w->dt_norm, shape); +} + +std::shared_ptr getAttnO(QwenMeta const *meta, QwenWeights const *w, size_t layer, int idev, int ndev) { + auto nh = meta->nh; + auto dh = meta->dh; + auto d = meta->d; + size_t offset = idev * d * (nh / ndev * dh) * dsize(w->dt_mat); + if (w->transpose_linear_weights != 0) { + auto shape = 
std::vector({d, nh / ndev * dh}); + return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape) + ->permute({1, 0}); + } else { + auto shape = std::vector({nh / ndev * dh, d}); + return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape); + } +} + +std::shared_ptr getFFNNorm(QwenMeta const *meta, QwenWeights const *w, size_t layer) { + auto shape = std::vector({meta->d}); + return Tensor::weight((char *)(w->ffn_norm[layer]), w->dt_norm, shape); +} + +std::shared_ptr getFFNGateUp(QwenMeta const *meta, QwenWeights const *w, size_t layer, int idev, int ndev) { + auto di = meta->di; + auto d = meta->d; + size_t offset = idev * (2 * di / ndev) * d * dsize(w->dt_mat); + if (w->transpose_linear_weights != 0) { + auto shape = std::vector({2 * di / ndev, d}); + return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset, w->dt_mat, shape) + ->permute({1, 0}); + } else { + auto shape = std::vector({d, 2 * di / ndev}); + return Tensor::weight((char *)(w->ffn_gate_up[layer]) + offset, w->dt_mat, shape); + } +} + +std::shared_ptr getFFNDown(QwenMeta const *meta, QwenWeights const *w, size_t layer, int idev, int ndev) { + auto di = meta->di; + auto d = meta->d; + size_t offset = idev * d * (di / ndev) * dsize(w->dt_mat); + if (w->transpose_linear_weights != 0) { + auto shape = std::vector({d, di / ndev}); + return Tensor::weight((char *)(w->ffn_down[layer]) + offset, w->dt_mat, shape) + ->permute({1, 0}); + } else { + auto shape = std::vector({di / ndev, d}); + return Tensor::weight((char *)(w->ffn_down[layer]) + offset, w->dt_mat, shape); + } +} + +// 注意:这些函数依赖于 f32_to_f16 和 f32_to_bf16,它们需要被定义 +// 假设它们在 "qwen_impl.hpp" 或其他包含的头文件中 +std::shared_ptr getSinTable(QwenMeta const *meta) { + auto half_dh = meta->dh / 2; + auto unit = dsize(meta->dt_logits); + void *table = std::malloc(meta->dctx * half_dh * unit); + + for (size_t i = 0; i < meta->dctx; i++) { + for (size_t j = 0; j < half_dh; j++) { + float val = std::sin(static_cast(i) / std::pow(meta->theta, static_cast(2*j) / meta->dh)); + if (meta->dt_logits == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(val); + } else if (meta->dt_logits == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(val); + } else if (meta->dt_logits == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = val; + } + } + } + auto shape = std::vector({meta->dctx, half_dh}); + auto tensor = Tensor::weight(table, meta->dt_logits, shape); + std::free(table); + return tensor; +} + +std::shared_ptr getCosTable(QwenMeta const *meta) { + auto half_dh = meta->dh / 2; + auto unit = dsize(meta->dt_logits); + void *table = std::malloc(meta->dctx * half_dh * unit); + + for (size_t i = 0; i < meta->dctx; i++) { + for (size_t j = 0; j < half_dh; j++) { + float val = std::cos(static_cast(i) / std::pow(meta->theta, static_cast(2*j) / meta->dh)); + if (meta->dt_logits == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(val); + } else if (meta->dt_logits == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(val); + } else if (meta->dt_logits == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = val; + } + } + } + auto shape = std::vector({meta->dctx, half_dh}); + auto tensor = Tensor::weight(table, meta->dt_logits, shape); + std::free(table); + return tensor; +} + +} // namespace qwen \ No newline at end of file diff --git a/src/models/qwen/qwen_weight.hpp b/src/models/qwen/qwen_weight.hpp new file mode 100644 index 00000000..055c106b --- /dev/null +++ 
b/src/models/qwen/qwen_weight.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "qwen_impl.hpp"
+#include "infinicore_infer.h"
+#include <memory>
+
+// The header keeps only the function declarations.
+namespace qwen {
+
+std::shared_ptr<Tensor> getInEmbd(const QwenMeta *meta, const QwenWeights *weights);
+std::shared_ptr<Tensor> getOutNorm(const QwenMeta *meta, const QwenWeights *weights);
+std::shared_ptr<Tensor> getOutEmbd(const QwenMeta *meta, const QwenWeights *weights);
+std::shared_ptr<Tensor> getSinTable(const QwenMeta *meta);
+std::shared_ptr<Tensor> getCosTable(const QwenMeta *meta);
+std::shared_ptr<Tensor> getAttnNorm(const QwenMeta *meta, const QwenWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getAttnQKV(const QwenMeta *meta, const QwenWeights *weights, size_t layer, int idev, int ndev);
+std::shared_ptr<Tensor> getAttnQKVBias(const QwenMeta *meta, const QwenWeights *weights, size_t layer, int idev, int ndev);
+std::shared_ptr<Tensor> getAttnQNorm(const QwenMeta *meta, const QwenWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getAttnKNorm(const QwenMeta *meta, const QwenWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getAttnO(const QwenMeta *meta, const QwenWeights *weights, size_t layer, int idev, int ndev);
+std::shared_ptr<Tensor> getFFNNorm(const QwenMeta *meta, const QwenWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getFFNGateUp(const QwenMeta *meta, const QwenWeights *weights, size_t layer, int idev, int ndev);
+std::shared_ptr<Tensor> getFFNDown(const QwenMeta *meta, const QwenWeights *weights, size_t layer, int idev, int ndev);
+
+} // namespace qwen
\ No newline at end of file
diff --git a/src/models/qwen_moe/qwen_moe.cpp b/src/models/qwen_moe/qwen_moe.cpp
new file mode 100644
index 00000000..afaa7d66
--- /dev/null
+++ b/src/models/qwen_moe/qwen_moe.cpp
@@ -0,0 +1,512 @@
+#include "qwen_moe_impl.hpp"
+#include "qwen_moe_weight.hpp" // note: this file still needs to be created, modeled on qwen_weight.hpp
+
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+#include "../inference_context.hpp"
+#include "infinicore_infer.h" // should point at the umbrella header that includes qwen_moe.h
+
+#include
+#include
+#include
+#include
+#include <numeric>    // needed for std::accumulate
+#include <functional> // needed for std::multiplies
+
+// Create and load the resources for a single device.
+void createDeviceResourceMoe(DeviceResourceMoe *rsrc, const QwenMoeMeta *meta,
+                             const QwenMoeWeights *weights,
+                             infiniDevice_t device, int idev,
+                             int ndev, int dev_id,
+                             infinicclComm_t comm) {
+    RUN_INFINI(infinirtSetDevice(device, dev_id));
+    infiniopHandle_t handle;
+    infiniopCreateHandle(&handle);
+    infinirtStream_t stream;
+    infinirtStreamCreate(&stream);
+
+    std::vector<std::shared_ptr<Tensor>> w_attn_norm, w_attn_qkv, b_attn_qkv,
+        w_attn_q_norm, w_attn_k_norm, w_attn_out; // <-- updated
+    for (size_t layer = 0; layer < meta->nlayer; layer++) {
+        w_attn_norm.push_back(qwen_moe::getAttnNorm(meta, weights, layer));
+        w_attn_qkv.push_back(qwen_moe::getAttnQKV(meta, weights, layer, idev, ndev));
+        if (weights->attn_qkv_b != nullptr) {
+            b_attn_qkv.push_back(qwen_moe::getAttnQKVBias(meta, weights, layer, idev, ndev));
+        }
+        // --- added: load the optional QK-Norm weights ---
+        if (weights->attn_q_norm != nullptr) {
+            w_attn_q_norm.push_back(qwen_moe::getAttnQNorm(meta, weights, layer));
+            w_attn_k_norm.push_back(qwen_moe::getAttnKNorm(meta, weights, layer));
+        }
+        // ---------------------------------
+        w_attn_out.push_back(qwen_moe::getAttnO(meta, weights, layer, idev, ndev));
+    }
+
+    // ... (MoE weight loading remains the same) ...
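+    // The MoE expert weights below are kept in flat vectors: the gate_up / down
+    // tensors for (layer l, expert e) live at index l * num_experts + e, which is
+    // the same lookup inferDeviceBatchMoe uses later
+    // (weight_idx = layer * num_experts + expert_idx). A minimal sketch of the
+    // lookup, with hypothetical local names:
+    //
+    //     size_t weight_idx = layer * meta->num_experts + expert;
+    //     auto w_up   = w_moe_experts_gate_up[weight_idx];
+    //     auto w_down = w_moe_experts_down[weight_idx];
+    //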
+ std::vector> w_ffn_norm, w_moe_gate; + std::vector> w_moe_experts_gate_up, w_moe_experts_down; + for (size_t layer = 0; layer < meta->nlayer; layer++) { + w_ffn_norm.push_back(qwen_moe::getFFNNorm(meta, weights, layer)); + w_moe_gate.push_back(qwen_moe::getMoeGate(meta, weights, layer, idev, ndev)); + for (size_t expert = 0; expert < meta->num_experts; expert++) { + w_moe_experts_gate_up.push_back(qwen_moe::getMoeExpertGateUp(meta, weights, layer, expert, idev, ndev)); + w_moe_experts_down.push_back(qwen_moe::getMoeExpertDown(meta, weights, layer, expert, idev, ndev)); + } + } + + auto memory_pool = std::make_shared(128 * 1024 * 1024); + + rsrc->device = device; + rsrc->device_id = dev_id; + rsrc->handle = handle; + rsrc->w_in_embd = qwen_moe::getInEmbd(meta, weights); + rsrc->w_out_norm = qwen_moe::getOutNorm(meta, weights); + rsrc->w_out_embd = qwen_moe::getOutEmbd(meta, weights); + rsrc->sin_table = qwen_moe::getSinTable(meta); + rsrc->cos_table = qwen_moe::getCosTable(meta); + rsrc->w_attn_norm = std::move(w_attn_norm); + rsrc->w_attn_qkv = std::move(w_attn_qkv); + rsrc->b_attn_qkv = std::move(b_attn_qkv); + rsrc->w_attn_q_norm = std::move(w_attn_q_norm); // <-- 已添加 + rsrc->w_attn_k_norm = std::move(w_attn_k_norm); // <-- 已添加 + rsrc->w_attn_out = std::move(w_attn_out); + rsrc->w_ffn_norm = std::move(w_ffn_norm); + rsrc->w_moe_gate = std::move(w_moe_gate); + rsrc->w_moe_experts_gate_up = std::move(w_moe_experts_gate_up); + rsrc->w_moe_experts_down = std::move(w_moe_experts_down); + rsrc->stream = stream; + rsrc->comm = comm; + rsrc->memory_pool = memory_pool; + + RUN_INFINI(infinirtDeviceSynchronize()); +} + +// 释放单个设备的资源 +void releaseDeviceResourceMoe(DeviceResourceMoe &rsrc) { + rsrc.w_in_embd.reset(); + rsrc.w_out_norm.reset(); + rsrc.w_out_embd.reset(); + rsrc.sin_table.reset(); + rsrc.cos_table.reset(); + rsrc.w_attn_norm.clear(); + rsrc.w_attn_qkv.clear(); + rsrc.b_attn_qkv.clear(); + rsrc.w_attn_q_norm.clear(); + rsrc.w_attn_k_norm.clear(); + rsrc.w_attn_out.clear(); + rsrc.w_ffn_norm.clear(); + rsrc.w_moe_gate.clear(); + rsrc.w_moe_experts_gate_up.clear(); + rsrc.w_moe_experts_down.clear(); + RUN_INFINI(infinirtStreamDestroy(rsrc.stream)); + RUN_INFINI(infiniopDestroyHandle(rsrc.handle)); +} + + +void inferDeviceBatchMoe(const QwenMoeMeta &meta, DeviceResourceMoe &rsrc, + int idev, int ndev, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct KVCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output, void *last_logits) { + // --- MoE --- 获取 MoE 特定参数 + auto nlayer = meta.nlayer; + auto nkvh = meta.nkvh / ndev; + auto nh = meta.nh / ndev; + auto ngroup = nh / nkvh; + auto dh = meta.dh; + auto d = meta.d; + auto dt_logits = meta.dt_logits; + auto dvoc = meta.dvoc; + auto stream = rsrc.stream; + bool has_qkv_bias = !rsrc.b_attn_qkv.empty() && rsrc.b_attn_qkv[0] != nullptr; + bool has_qk_norm = !rsrc.w_attn_q_norm.empty() && rsrc.w_attn_q_norm[0] != nullptr; + auto num_experts = meta.num_experts; + auto num_experts_per_tok = meta.num_experts_per_tok; + auto moe_di = meta.moe_intermediate_size / ndev; + + // --- MoE --- 分配缓冲区 (为 MoE 更新) + auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); + auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); + auto qkv_buf = Tensor::buffer(dt_logits, {ntok, (nh + nkvh * 2) * dh}, rsrc.memory_pool); + auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool); + + // --- MoE 
--- MoE 模块的缓冲区 + auto moe_gate_logits = Tensor::buffer(dt_logits, {ntok, num_experts}, rsrc.memory_pool); + auto moe_gate_probs = Tensor::buffer(dt_logits, {ntok, num_experts}, rsrc.memory_pool); + auto topk_weights = Tensor::buffer(dt_logits, {ntok, num_experts_per_tok}, rsrc.memory_pool); + auto topk_indices = Tensor::buffer(INFINI_DTYPE_I32, {ntok, num_experts_per_tok}, rsrc.memory_pool); + auto expert_outputs = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool); + auto gate_up_buf = Tensor::buffer(dt_logits, {ntok, 2 * moe_di}, rsrc.memory_pool); + auto gate_buf = gate_up_buf->slice(1, 0, moe_di); + auto up_buf = gate_up_buf->slice(1, moe_di, moe_di); + + // --- MoE --- 最终采样缓冲区 (与密集模型相同) + auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool); + auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool); + auto result_cpu = std::vector(nreq); + + auto qkv_buf_view = qkv_buf->view({ntok, nh + nkvh * 2, dh}); + auto q_buf = qkv_buf_view->slice(1, 0, nh); + auto k_buf = qkv_buf_view->slice(1, nh, nkvh); + + size_t max_qk_size = 0; + size_t max_seq_len = 0; + + for (uint32_t req = 0; req < nreq; req++) { + auto past_len = req_pos[req]; + auto seq_len = req_lens[req]; + auto total_len = past_len + seq_len; + + max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len)); + max_seq_len = std::max(max_seq_len, size_t(seq_len)); + } + + auto qk_buf = Tensor::buffer(dt_logits, {nh, max_qk_size}, rsrc.memory_pool); + auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); + auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh}); + auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); + auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh}); + + + // --- 输入准备 (与密集模型相同) --- + auto batch_pos_ids = std::vector(ntok); + size_t req_start = 0; + for (uint32_t req = 0; req < nreq; req++) { + for (uint32_t i = 0; i < req_lens[req]; i++) { + batch_pos_ids[req_start + i] = req_pos[req] + i; + } + req_start += req_lens[req]; + } + + std::shared_ptr pos_ids_buf; + if (rsrc.device == INFINI_DEVICE_CPU) { + pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok}); + } else { + pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool); + RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok, + INFINIRT_MEMCPY_H2D, stream)); + } + for (uint32_t i = 0; i < ntok; i++) { + RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d), + rsrc.w_in_embd->data(tokens[i] * d), + dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream)); + } + + // --- 主要计算循环 --- + for (uint32_t layer = 0; layer < nlayer; layer++) { + // 1. 注意力模块 (此模块与密集模型完全相同) + rmsnorm(logits_out, logits_in, rsrc.w_attn_norm[layer], meta.epsilon); + linear(qkv_buf, logits_out, rsrc.w_attn_qkv[layer], 1.0, 0.0, nullptr, has_qkv_bias ? rsrc.b_attn_qkv[layer] : nullptr); + if (has_qk_norm) { + rmsnorm(q_buf, q_buf, rsrc.w_attn_q_norm[layer], meta.epsilon); + rmsnorm(k_buf, k_buf, rsrc.w_attn_k_norm[layer], meta.epsilon); + } + rope(q_buf, q_buf, pos_ids_buf, rsrc.sin_table, rsrc.cos_table); + rope(k_buf, k_buf, pos_ids_buf, rsrc.sin_table, rsrc.cos_table); + + linear(logits_in, o_buf, rsrc.w_attn_out[layer], 1.0, 0.0, idev == 0 ? 
logits_in : nullptr, nullptr); // only rank 0 adds the residual
+        if (rsrc.comm != nullptr) {
+            RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d, dt_logits, INFINICCL_SUM, rsrc.comm, stream));
+            RUN_INFINI(infinirtStreamSynchronize(stream));
+        }
+
+        // --- MoE --- 2. MoE block (replaces the dense FFN block)
+        rmsnorm(logits_out, logits_in, rsrc.w_ffn_norm[layer], meta.epsilon);
+
+        // a. Gating: compute the expert scores for every token
+        linear(moe_gate_logits, logits_out, rsrc.w_moe_gate[layer], 1.0, 0.0, nullptr, nullptr);
+
+        // b. Routing: apply softmax and pick the top-k experts
+        causalSoftmax(moe_gate_probs, moe_gate_logits);
+        topk_fun(topk_weights, topk_indices, moe_gate_probs, num_experts_per_tok);
+
+        // Qwen-specific: renormalize the top-k routing weights
+        if (meta.norm_topk_prob) {
+            normalize(topk_weights, topk_weights, 1, 1e-6);
+        }
+
+        // c. Expert computation: use the more efficient "expert-parallel" pattern
+        zeros(expert_outputs); // clear the final output buffer
+
+        // Copy the routing results to the CPU to build the dispatch plan
+        std::vector<int32_t> topk_indices_cpu(ntok * num_experts_per_tok);
+        std::vector<float> topk_weights_cpu(ntok * num_experts_per_tok);
+        // total number of elements in topk_indices
+        size_t topk_indices_nelem = std::accumulate(topk_indices->shape().begin(), topk_indices->shape().end(), 1ULL, std::multiplies<size_t>());
+        // total number of elements in topk_weights
+        size_t topk_weights_nelem = std::accumulate(topk_weights->shape().begin(), topk_weights->shape().end(), 1ULL, std::multiplies<size_t>());
+
+        // Compute the byte counts as "number of elements * element size"
+        RUN_INFINI(infinirtMemcpy(topk_indices_cpu.data(), topk_indices->data(), topk_indices_nelem * dsize(topk_indices->dtype()), INFINIRT_MEMCPY_D2H));
+        RUN_INFINI(infinirtMemcpy(topk_weights_cpu.data(), topk_weights->data(), topk_weights_nelem * dsize(topk_weights->dtype()), INFINIRT_MEMCPY_D2H));
+        RUN_INFINI(infinirtStreamSynchronize(stream));
+
+        for (uint32_t expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
+            std::vector<uint32_t> token_indices_for_expert;
+            std::vector<float> weights_for_expert;
+
+            // Build the dispatch list on the CPU
+            for (uint32_t token_i = 0; token_i < ntok; ++token_i) {
+                for (uint32_t k = 0; k < num_experts_per_tok; ++k) {
+                    if (static_cast<uint32_t>(topk_indices_cpu[token_i * num_experts_per_tok + k]) == expert_idx) {
+                        token_indices_for_expert.push_back(token_i);
+                        weights_for_expert.push_back(topk_weights_cpu[token_i * num_experts_per_tok + k]);
+                    }
+                }
+            }
+
+            if (token_indices_for_expert.empty()) {
+                continue; // no token was routed to this expert, skip it
+            }
+
+            size_t num_tokens_for_expert = token_indices_for_expert.size();
+            size_t weight_idx = layer * num_experts + expert_idx;
+
+            // Gather: collect the hidden states of every token routed to this expert
+            auto expert_input_states = Tensor::buffer(dt_logits, {num_tokens_for_expert, d}, rsrc.memory_pool);
+            gather(expert_input_states, logits_out, token_indices_for_expert);
+
+            // Compute: run the expert FFN on this mini-batch
+            auto expert_gate_up = Tensor::buffer(dt_logits, {num_tokens_for_expert, 2 * moe_di}, rsrc.memory_pool);
+            auto expert_gate = expert_gate_up->slice(1, 0, moe_di);
+            auto expert_up = expert_gate_up->slice(1, moe_di, moe_di);
+
+            linear(expert_gate_up, expert_input_states, rsrc.w_moe_experts_gate_up[weight_idx], 1.0, 0.0, nullptr, nullptr);
+            swiglu(expert_gate, expert_up, expert_gate);
+
+            auto single_expert_output = Tensor::buffer(dt_logits, {num_tokens_for_expert, d}, rsrc.memory_pool);
+            linear(single_expert_output, expert_gate, rsrc.w_moe_experts_down[weight_idx], 1.0, 0.0, nullptr, nullptr);
+
+            // Weighting: multiply the expert output by its routing weight
+            // scale(single_expert_output, single_expert_output, weights_for_expert);
+
+            // Scatter-add: accumulate the weighted result back into the matching rows of the output buffer
+            scatter_add(expert_outputs, single_expert_output, token_indices_for_expert);
+        }
+
+        // Add the residual connection
+        add(logits_in, logits_in, expert_outputs);
+
+        if
(rsrc.comm != nullptr) { + RUN_INFINI(infinicclAllReduce(logits_in->data(), logits_in->data(), ntok * d, dt_logits, INFINICCL_SUM, rsrc.comm, stream)); + RUN_INFINI(infinirtStreamSynchronize(stream)); + } + } + + // --- 最终采样和输出 (此模块与密集模型完全相同) --- + if (idev == 0) { + if (last_logits != nullptr) { + rmsnorm(logits_out, logits_in, rsrc.w_out_norm, meta.epsilon); + auto last_logits_buf = Tensor::buffer(dt_logits, {ntok, dvoc}, rsrc.memory_pool); + linear(last_logits_buf, logits_out, rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dt_logits) * ntok * dvoc, INFINIRT_MEMCPY_D2H)); + } + if (output != nullptr) { + size_t token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + token_offset += seq_len; + rmsnorm(logits_out->slice(0, req, 1), + logits_in->slice(0, token_offset - 1, 1), + rsrc.w_out_norm, + meta.epsilon); + } + linear(prob_buf, logits_out->slice(0, 0, nreq), rsrc.w_out_embd, 1.0, 0.0, nullptr, nullptr); + std::random_device _rd; + std::mt19937 gen(_rd()); + token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + float random_val = std::uniform_real_distribution(0, 1)(gen); + randomSample(result_buf->slice(0, req, 1)->view_as({}, {}), + prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}), + random_val, topp[req], topk[req], temperature[req]); + token_offset += seq_len; + } + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(), + sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H)); + for (uint32_t req = 0; req < nreq; req++) { + output[req] = uint32_t(result_cpu[req]); + } + } + } +} + + + + +// 每个设备的 worker 线程函数 +void launchDeviceMoe(const QwenMoeMeta &meta, const QwenMoeWeights *weights, DeviceResourceMoe *rsrc, InferState &state, InferRequest &req, + infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) { + + CacheManager cache_manager(100); + InferenceContext ctx(nullptr, &cache_manager, rsrc->stream); + setInferenceContext(&ctx); + + createDeviceResourceMoe(rsrc, &meta, weights, device, idev, ndev, dev_id, comm); + { + std::unique_lock lock(state.mtx); + state.loaded = true; + lock.unlock(); + state.cv_load.notify_one(); + } + + while (true) { + std::unique_lock lock(state.mtx); + state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; }); + if (state.exit_flag) break; + + inferDeviceBatchMoe(meta, *rsrc, idev, ndev, req.tokens, req.ntok, + req.req_lens, req.nreq, req.req_pos, req.kv_caches, + req.temperature, req.topk, req.topp, req.output, req.logits); + + state.proceed = false; + lock.unlock(); + state.cv_done.notify_one(); + } + + releaseDeviceResourceMoe(*rsrc); + setInferenceContext(nullptr); +} + +// 主模型类的构造函数 +QwenMoeModel::QwenMoeModel(const QwenMoeMeta *_meta, const QwenMoeWeights *weights, infiniDevice_t device_, std::vector device_ids) : meta(*_meta) { + int ndev = int(device_ids.size()); + device = device_; + dev_ids = device_ids; + dev_resources = std::vector(ndev); + states = std::vector(ndev); + threads.resize(ndev); + RUN_INFINI(infinirtInit()); + auto comms = std::vector(ndev, nullptr); + if (ndev > 1) { + RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data())); + } + + for (int i = 0; i < ndev; i++) { + threads[i] = std::thread(launchDeviceMoe, std::cref(meta), weights, &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], 
comms[i]); + } + for (int i = 0; i < ndev; i++) { + std::unique_lock lock(states[i].mtx); + states[i].cv_load.wait(lock, [&] { return states[i].loaded; }); + lock.unlock(); + } +} + +// =================================================================== +// 公共 C API 实现 +// =================================================================== + +extern "C" { + +__C __export struct QwenMoeModel * +createQwenMoeModel(const QwenMoeMeta *meta, + const QwenMoeWeights *weights, + infiniDevice_t device, + int ndev, + const int *dev_ids) { + std::cout << "C++: createQwenMoeModel called." << std::endl; + std::vector device_ids(ndev); + std::copy(dev_ids, dev_ids + ndev, device_ids.begin()); + QwenMoeModel *model = new QwenMoeModel(meta, weights, device, device_ids); + return model; +} + +__C __export void +destroyQwenMoeModel(struct QwenMoeModel *model) { + std::cout << "C++: destroyQwenMoeModel called." << std::endl; + auto ndev = model->dev_resources.size(); + for (size_t idev = 0; idev < ndev; idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].exit_flag = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t idev = 0; idev < ndev; idev++) { + model->threads[idev].join(); + } + delete model; +} + +// __C __export struct KVCache * +// createQwenMoeKVCache(const struct QwenMoeModel * model) { +// // TODO: 实现 KVCache 的创建逻辑 +// return nullptr; +// } + +// __C __export void +// dropQwenMoeKVCache(const struct QwenMoeModel * model, struct KVCache * cache) { +// // TODO: 实现 KVCache 的销毁逻辑 +// } + +__C __export void +inferQwenMoeBatch(struct QwenMoeModel *model, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct KVCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.kv_caches = kv_caches; + model->req.output = output; + model->req.logits = nullptr; + model->req.temperature = temperature; + model->req.topk = topk; + model->req.topp = topp; + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); + lock.unlock(); + } +} + +__C __export void +forwardQwenMoeBatch(struct QwenMoeModel *model, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct KVCache **kv_caches, + void *logits) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.kv_caches = kv_caches; + model->req.output = nullptr; + model->req.logits = logits; + model->req.temperature = nullptr; + model->req.topk = nullptr; + model->req.topp = nullptr; + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock 
+__C __export void
+forwardQwenMoeBatch(struct QwenMoeModel *model,
+                    const uint32_t *tokens, uint32_t ntok,
+                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
+                    struct KVCache **kv_caches,
+                    void *logits) {
+    model->req.tokens = tokens;
+    model->req.ntok = ntok;
+    model->req.req_lens = req_lens;
+    model->req.nreq = nreq;
+    model->req.req_pos = req_pos;
+    model->req.kv_caches = kv_caches;
+    model->req.output = nullptr;
+    model->req.logits = logits;
+    model->req.temperature = nullptr;
+    model->req.topk = nullptr;
+    model->req.topp = nullptr;
+
+    for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
+        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
+        model->states[idev].proceed = true;
+        lock.unlock();
+        model->states[idev].cv_start.notify_one();
+    }
+    for (size_t i = model->dev_ids.size(); i > 0; i--) {
+        auto idev = i - 1;
+        std::unique_lock<std::mutex> lock(model->states[idev].mtx);
+        model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
+        lock.unlock();
+    }
+}
+
+} // extern "C"
diff --git a/src/models/qwen_moe/qwen_moe_impl.hpp b/src/models/qwen_moe/qwen_moe_impl.hpp
new file mode 100644
index 00000000..2caecc13
--- /dev/null
+++ b/src/models/qwen_moe/qwen_moe_impl.hpp
@@ -0,0 +1,63 @@
+#ifndef QWEN_MOE_IMPL_H
+#define QWEN_MOE_IMPL_H
+
+#include "infinicore_infer/models/qwen_moe.h"
+
+// Shared header for the structs common to all models
+#include "../common_structs.hpp"
+
+#include "../../allocator.hpp"
+#include "../../tensor.hpp"
+
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+// Per-device resources (mirrors the dense model's DeviceResource)
+struct DeviceResourceMoe {
+    // Device
+    infiniDevice_t device;
+    int device_id;
+    infiniopHandle_t handle;
+
+    // Global weights
+    std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table, cos_table;
+
+    // Attention weights
+    std::vector<std::shared_ptr<Tensor>> w_attn_norm, w_attn_qkv, b_attn_qkv,
+        w_attn_q_norm, w_attn_k_norm, w_attn_out;
+
+    // MoE-specific weights
+    std::vector<std::shared_ptr<Tensor>> w_ffn_norm;
+    std::vector<std::shared_ptr<Tensor>> w_moe_gate;
+    std::vector<std::shared_ptr<Tensor>> w_moe_experts_gate_up;
+    std::vector<std::shared_ptr<Tensor>> w_moe_experts_down;
+
+    // Streams & communicator
+    infinirtStream_t stream;
+    infinicclComm_t comm;
+
+    std::shared_ptr<MemoryPool> memory_pool;
+};
+
+// NOTE: InferState, InferRequest, and KVCache have been moved to common_structs.hpp
+
+// The main class for the MoE model instance
+struct QwenMoeModel {
+    QwenMoeMeta meta;
+
+    infiniDevice_t device;
+    std::vector<int> dev_ids;
+    std::vector<DeviceResourceMoe> dev_resources;
+    std::vector<InferState> states;
+    std::vector<std::thread> threads;
+    InferRequest req;
+
+    QwenMoeModel(const QwenMoeMeta *, const QwenMoeWeights *, infiniDevice_t device, std::vector<int> device_ids);
+};
+
+#endif // QWEN_MOE_IMPL_H
diff --git a/src/models/qwen_moe/qwen_moe_kv_cache.cpp b/src/models/qwen_moe/qwen_moe_kv_cache.cpp
new file mode 100644
index 00000000..a5fd3cb4
--- /dev/null
+++ b/src/models/qwen_moe/qwen_moe_kv_cache.cpp
@@ -0,0 +1,59 @@
+#include "qwen_moe_impl.hpp"
+
+__C struct KVCache *createQwenMoeKVCache(const QwenMoeModel *model) {
+    KVCache *cache = new KVCache();
+    auto ndev = model->dev_resources.size();
+    auto nkvh = model->meta.nkvh / ndev;
+    auto max_len = model->meta.dctx;
+    auto dh = model->meta.dh;
+    auto shape = std::vector<size_t>{max_len, nkvh, dh};
+    for (unsigned int idev = 0; idev < ndev; idev++) {
+        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
+        auto kcache = std::vector<std::shared_ptr<Tensor>>();
+        auto vcache = std::vector<std::shared_ptr<Tensor>>();
+        for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
+            kcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape)));
+            vcache.push_back(std::move(Tensor::buffer(model->meta.dt_logits, shape)));
+        }
+        cache->k.push_back(kcache);
+        cache->v.push_back(vcache);
+    }
+
+    return cache;
+}
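+
+// Copy the first seq_len positions of every K/V tensor into a freshly allocated
+// cache (per device, per layer); entries beyond seq_len are left uninitialized.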
+__C struct KVCache *duplicateQwenMoeKVCache(const QwenMoeModel *model,
+                                            const KVCache *kv_cache,
+                                            unsigned int seq_len) {
+    auto new_kv_cache = createQwenMoeKVCache(model);
+    auto ndev = model->dev_resources.size();
+    auto nkvh = model->meta.nkvh / ndev;
+    auto dh = model->meta.dh;
+    auto dt_size = dsize(model->meta.dt_logits);
+    for (unsigned int idev = 0; idev < ndev; idev++) {
+        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
+        for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
+            RUN_INFINI(infinirtMemcpy(new_kv_cache->k[idev][layer]->data(),
+                                      kv_cache->k[idev][layer]->data(),
+                                      seq_len * nkvh * dh * dt_size,
+                                      INFINIRT_MEMCPY_D2D));
+            RUN_INFINI(infinirtMemcpy(new_kv_cache->v[idev][layer]->data(),
+                                      kv_cache->v[idev][layer]->data(),
+                                      seq_len * nkvh * dh * dt_size,
+                                      INFINIRT_MEMCPY_D2D));
+        }
+    }
+    return new_kv_cache;
+}
+
+__C void dropQwenMoeKVCache(QwenMoeModel const *model, KVCache *kv_cache) {
+    auto ndev = model->dev_resources.size();
+    for (unsigned int idev = 0; idev < ndev; idev++) {
+        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
+        for (unsigned int layer = 0; layer < model->meta.nlayer; layer++) {
+            kv_cache->k[idev][layer].reset();
+            kv_cache->v[idev][layer].reset();
+        }
+    }
+    delete kv_cache;
+}
\ No newline at end of file
diff --git a/src/models/qwen_moe/qwen_moe_weight.cpp b/src/models/qwen_moe/qwen_moe_weight.cpp
new file mode 100644
index 00000000..8efe8f1f
--- /dev/null
+++ b/src/models/qwen_moe/qwen_moe_weight.cpp
@@ -0,0 +1,172 @@
+#include "qwen_moe_weight.hpp"
+#include <cmath>
+#include <cstdlib>
+
+// These data-type conversion helpers are assumed to be available in the project;
+// include the header that defines them if necessary.
+extern uint16_t f32_to_f16(float f);
+extern uint16_t f32_to_bf16(float f);
+
+namespace qwen_moe {
+
+// --- Global weight loading ---
+
+std::shared_ptr<Tensor> getInEmbd(const QwenMoeMeta *meta, const QwenMoeWeights *w) {
+    // Note: MoE checkpoints usually do not transpose the input embedding,
+    // but the logic is kept for consistency with the dense model.
+    if (w->transpose_linear_weights != 0) {
+        auto shape = std::vector<size_t>({meta->dvoc, meta->d});
+        return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape)->permute({1, 0});
+    } else {
+        auto shape = std::vector<size_t>({meta->d, meta->dvoc});
+        return Tensor::weight((char *)w->input_embd, meta->dt_logits, shape);
+    }
+}
+
+std::shared_ptr<Tensor> getOutNorm(const QwenMoeMeta *meta, const QwenMoeWeights *w) {
+    auto shape = std::vector<size_t>({meta->d});
+    return Tensor::weight((char *)w->output_norm, w->dt_norm, shape);
+}
+
+std::shared_ptr<Tensor> getOutEmbd(const QwenMoeMeta *meta, const QwenMoeWeights *w) {
+    // tie_word_embeddings is false for the MoE model, so this is an independent weight.
+    if (w->transpose_linear_weights != 0) {
+        auto shape = std::vector<size_t>({meta->dvoc, meta->d});
+        return Tensor::weight((char *)w->output_embd, w->dt_mat, shape)->permute({1, 0});
+    } else {
+        auto shape = std::vector<size_t>({meta->d, meta->dvoc});
+        return Tensor::weight((char *)w->output_embd, w->dt_mat, shape);
+    }
+}
+
+// --- Attention weight loading (similar to the dense model) ---
+
+std::shared_ptr<Tensor> getAttnNorm(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer) {
+    auto shape = std::vector<size_t>({meta->d});
+    return Tensor::weight((char *)(w->attn_norm[layer]), w->dt_norm, shape);
+}
+
+std::shared_ptr<Tensor> getAttnQKV(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer, int idev, int ndev) {
+    auto nkvh = meta->nkvh;
+    auto nh = meta->nh;
+    auto dh = meta->dh;
+    auto d = meta->d;
+    // Each device owns a contiguous slice of (nh + 2*nkvh)/ndev heads of the fused QKV projection.
+    size_t offset = idev * ((nh + 2 * nkvh) / ndev * dh) * d * dsize(w->dt_mat);
+    if (w->transpose_linear_weights != 0) {
+        auto shape = std::vector<size_t>({(nh + 2 * nkvh) / ndev * dh, d});
+        return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape)->permute({1, 0});
+    } else {
+        auto shape = std::vector<size_t>({d, (nh + 2 * nkvh) / ndev * dh});
+        return Tensor::weight((char *)(w->attn_qkv[layer]) + offset, w->dt_mat, shape);
+    }
+}
+
+std::shared_ptr<Tensor> getAttnQKVBias(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer, int idev, int ndev) {
+    auto nkvh = meta->nkvh;
+    auto nh = meta->nh;
+    auto dh = meta->dh;
+    size_t offset = idev * ((nh + 2 * nkvh) / ndev * dh) * dsize(w->dt_mat);
+    auto shape = std::vector<size_t>({(nh + 2 * nkvh) / ndev * dh});
+    return Tensor::weight((char *)(w->attn_qkv_b[layer]) + offset, w->dt_mat, shape);
+}
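+
+// Q/K head-norm weights: one RMSNorm vector of shape {dh} per layer, shared across heads.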
+std::shared_ptr<Tensor> getAttnQNorm(QwenMoeMeta const *meta, QwenMoeWeights const *w, size_t layer) {
+    auto shape = std::vector<size_t>({meta->dh});
+    return Tensor::weight((char *)(w->attn_q_norm[layer]), w->dt_norm, shape);
+}
+
+std::shared_ptr<Tensor> getAttnKNorm(QwenMoeMeta const *meta, QwenMoeWeights const *w, size_t layer) {
+    auto shape = std::vector<size_t>({meta->dh});
+    return Tensor::weight((char *)(w->attn_k_norm[layer]), w->dt_norm, shape);
+}
+
+std::shared_ptr<Tensor> getAttnO(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer, int idev, int ndev) {
+    auto nh = meta->nh;
+    auto dh = meta->dh;
+    auto d = meta->d;
+    size_t offset = idev * d * (nh / ndev * dh) * dsize(w->dt_mat);
+    if (w->transpose_linear_weights != 0) {
+        auto shape = std::vector<size_t>({d, nh / ndev * dh});
+        return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape)->permute({1, 0});
+    } else {
+        auto shape = std::vector<size_t>({nh / ndev * dh, d});
+        return Tensor::weight((char *)(w->attn_o[layer]) + offset, w->dt_mat, shape);
+    }
+}
+
+// --- MoE-specific weight loading ---
+
+std::shared_ptr<Tensor> getFFNNorm(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer) {
+    auto shape = std::vector<size_t>({meta->d});
+    return Tensor::weight((char *)(w->ffn_norm[layer]), w->dt_norm, shape);
+}
+
+std::shared_ptr<Tensor> getMoeGate(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer, int idev, int ndev) {
+    // The router (gate) weight is loaded in full on every device rather than sharded.
+    auto shape = std::vector<size_t>({meta->num_experts, meta->d});
+    return Tensor::weight((char *)(w->moe_gate[layer]), w->dt_mat, shape);
+}
+
+std::shared_ptr<Tensor> getMoeExpertGateUp(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer, size_t expert_idx, int idev, int ndev) {
+    // Expert weights are stored flat: index = layer * num_experts + expert_idx.
+    size_t index = layer * meta->num_experts + expert_idx;
+    auto d = meta->d;
+    auto moe_di = meta->moe_intermediate_size;
+    if (w->transpose_linear_weights != 0) {
+        auto shape = std::vector<size_t>({2 * moe_di, d});
+        return Tensor::weight((char *)(w->moe_experts_gate_up[index]), w->dt_mat, shape)->permute({1, 0});
+    } else {
+        auto shape = std::vector<size_t>({d, 2 * moe_di});
+        return Tensor::weight((char *)(w->moe_experts_gate_up[index]), w->dt_mat, shape);
+    }
+}
+
+std::shared_ptr<Tensor> getMoeExpertDown(const QwenMoeMeta *meta, const QwenMoeWeights *w, size_t layer, size_t expert_idx, int idev, int ndev) {
+    size_t index = layer * meta->num_experts + expert_idx;
+    auto d = meta->d;
+    auto moe_di = meta->moe_intermediate_size;
+    if (w->transpose_linear_weights != 0) {
+        auto shape = std::vector<size_t>({d, moe_di});
+        return Tensor::weight((char *)(w->moe_experts_down[index]), w->dt_mat, shape)->permute({1, 0});
+    } else {
+        auto shape = std::vector<size_t>({moe_di, d});
+        return Tensor::weight((char *)(w->moe_experts_down[index]), w->dt_mat, shape);
+    }
+}
+
+// --- RoPE table generation (same logic as the dense model, driven by the MoE meta) ---
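+// Both tables have shape {dctx, dh/2}; entry (i, j) stores
+// sin(i / theta^(2j/dh)) or cos(i / theta^(2j/dh)), converted to dt_logits.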
+
+std::shared_ptr<Tensor> getSinTable(const QwenMoeMeta *meta) {
+    auto half_dh = meta->dh / 2;
+    auto unit = dsize(meta->dt_logits);
+    void *table = std::malloc(meta->dctx * half_dh * unit);
+    for (size_t i = 0; i < meta->dctx; i++) {
+        for (size_t j = 0; j < half_dh; j++) {
+            float val = std::sin(static_cast<float>(i) / std::pow(meta->theta, static_cast<float>(2 * j) / meta->dh));
+            if (meta->dt_logits == INFINI_DTYPE_F16) { ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(val); }
+            else if (meta->dt_logits == INFINI_DTYPE_BF16) { ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(val); }
+            else if (meta->dt_logits == INFINI_DTYPE_F32) { ((float *)table)[i * half_dh + j] = val; }
+        }
+    }
+    auto shape = std::vector<size_t>({meta->dctx, half_dh});
+    auto tensor = Tensor::weight(table, meta->dt_logits, shape);
+    std::free(table);
+    return tensor;
+}
+
+std::shared_ptr<Tensor> getCosTable(const QwenMoeMeta *meta) {
+    auto half_dh = meta->dh / 2;
+    auto unit = dsize(meta->dt_logits);
+    void *table = std::malloc(meta->dctx * half_dh * unit);
+    for (size_t i = 0; i < meta->dctx; i++) {
+        for (size_t j = 0; j < half_dh; j++) {
+            float val = std::cos(static_cast<float>(i) / std::pow(meta->theta, static_cast<float>(2 * j) / meta->dh));
+            if (meta->dt_logits == INFINI_DTYPE_F16) { ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(val); }
+            else if (meta->dt_logits == INFINI_DTYPE_BF16) { ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(val); }
+            else if (meta->dt_logits == INFINI_DTYPE_F32) { ((float *)table)[i * half_dh + j] = val; }
+        }
+    }
+    auto shape = std::vector<size_t>({meta->dctx, half_dh});
+    auto tensor = Tensor::weight(table, meta->dt_logits, shape);
+    std::free(table);
+    return tensor;
+}
+
+} // namespace qwen_moe
diff --git a/src/models/qwen_moe/qwen_moe_weight.hpp b/src/models/qwen_moe/qwen_moe_weight.hpp
new file mode 100644
index 00000000..64acd10a
--- /dev/null
+++ b/src/models/qwen_moe/qwen_moe_weight.hpp
@@ -0,0 +1,36 @@
+#ifndef QWEN_MOE_WEIGHT_H
+#define QWEN_MOE_WEIGHT_H
+
+#include "infinicore_infer/models/qwen_moe.h" // MoE public header
+#include "../../tensor.hpp"
+#include <memory>
+
+// A dedicated namespace so these loaders do not clash with the dense model.
+namespace qwen_moe {
+
+// --- Declarations of the MoE weight-loading helpers ---
+
+// Global weights
+std::shared_ptr<Tensor> getInEmbd(const QwenMoeMeta *meta, const QwenMoeWeights *weights);
+std::shared_ptr<Tensor> getOutNorm(const QwenMoeMeta *meta, const QwenMoeWeights *weights);
+std::shared_ptr<Tensor> getOutEmbd(const QwenMoeMeta *meta, const QwenMoeWeights *weights);
+std::shared_ptr<Tensor> getSinTable(const QwenMoeMeta *meta);
+std::shared_ptr<Tensor> getCosTable(const QwenMoeMeta *meta);
+
+// Per-layer attention weights
+std::shared_ptr<Tensor> getAttnNorm(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getAttnQKV(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer, int idev, int ndev);
+std::shared_ptr<Tensor> getAttnQKVBias(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer, int idev, int ndev);
+std::shared_ptr<Tensor> getAttnQNorm(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getAttnKNorm(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getAttnO(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer, int idev, int ndev);
+
+// Per-layer MoE-specific weights
+std::shared_ptr<Tensor> getFFNNorm(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer);
+std::shared_ptr<Tensor> getMoeGate(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer, int idev, int ndev);
+std::shared_ptr<Tensor> getMoeExpertGateUp(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer, size_t expert_idx, int idev, int ndev);
+std::shared_ptr<Tensor> getMoeExpertDown(const QwenMoeMeta *meta, const QwenMoeWeights *weights, size_t layer, size_t expert_idx, int idev, int ndev);
+
+} // namespace qwen_moe
+
+#endif // QWEN_MOE_WEIGHT_H
diff --git a/xmake.lua b/xmake.lua
index 4eee405f..a95c6653 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -12,6 +12,8 @@ target("infinicore_infer")
     set_languages("cxx17")
     set_warnings("all", "error")
 
+    add_cxflags("-g")
+
    add_files("src/models/*.cpp")
    add_files("src/models/*/*.cpp")
    add_files("src/tensor/*.cpp")
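
For reference, below is a minimal host-side sketch of how the new QwenMoe C API is meant to be driven. It assumes QwenMoeMeta and QwenMoeWeights have already been populated from a converted checkpoint, that createQwenMoeKVCache/dropQwenMoeKVCache are exported as implemented in qwen_moe_kv_cache.cpp, and that INFINI_DEVICE_CPU is a valid infiniDevice_t value; the token IDs and sampling parameters are illustrative placeholders, not part of this patch.

    #include "infinicore_infer.h"
    #include <cstdint>
    #include <vector>

    int main() {
        QwenMoeMeta meta{};        // to be populated by a checkpoint loader (not shown here)
        QwenMoeWeights weights{};  // weight pointers must stay valid for the model's lifetime

        int dev_ids[] = {0};
        QwenMoeModel *model = createQwenMoeModel(&meta, &weights, INFINI_DEVICE_CPU, 1, dev_ids);
        KVCache *cache = createQwenMoeKVCache(model);

        // One request with four prompt tokens; topk = 1 means greedy decoding.
        std::vector<uint32_t> tokens = {1, 2, 3, 4};
        uint32_t req_len = 4, req_pos = 0, next_token = 0;
        float temperature = 1.0f, topp = 1.0f;
        uint32_t topk = 1;
        inferQwenMoeBatch(model, tokens.data(), 4, &req_len, 1, &req_pos,
                          &cache, &temperature, &topk, &topp, &next_token);

        dropQwenMoeKVCache(model, cache);
        destroyQwenMoeModel(model);
        return 0;
    }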