Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,7 @@ cache/
*.txt

*.http
.DS_Store
**/.DS_Store
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ xmake && xmake install
- 运行模型推理测试

```bash
python scripts/jiuge.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] path/to/model_dir [n_device]
python scripts/qwen.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] path/to/model_dir [n_device]
```

- 部署模型推理服务
Expand All @@ -34,4 +34,4 @@ python scripts/test_perf.py

```bash
python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
```
```
6 changes: 4 additions & 2 deletions include/infinicore_infer.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#ifndef INFINICORE_INFER_H
#define INFINICORE_INFER_H

// 为了解决函数命名冲突,我们需要调整包含顺序并添加命名空间
#include "infinicore_infer/models/qwen.h"
#include "infinicore_infer/models/qwen_moe.h"
#include "infinicore_infer/models/jiuge.h"



#endif /* INFINICORE_INFER_H */
#endif /* INFINICORE_INFER_H */
24 changes: 24 additions & 0 deletions include/infinicore_infer/imodel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// imodel.h
#pragma once

// Forward declaration only — this interface does not need KVCache's layout.
struct KVCache;

// Common interface that every model implementation must satisfy.
class IModel {
public:
// An interface class must have a virtual destructor so that deleting a
// derived model through an IModel* runs the derived destructor.
virtual ~IModel() = default;

// All operations below are pure virtual (= 0): any class inheriting
// from IModel must provide its own implementation of each of them.

// Create a KVCache structure suitable for this model.
// NOTE(review): the returned cache appears to be owned by the caller and
// released via dropKVCache() — confirm against the implementations.
virtual KVCache* createKVCache() const = 0;

// Duplicate a KVCache (e.g. for beam search). seq_len presumably bounds
// how many cached positions are copied — TODO confirm.
virtual KVCache* duplicateKVCache(const KVCache* cache, unsigned int seq_len) const = 0;

// Destroy a KVCache previously obtained from this model.
virtual void dropKVCache(KVCache* cache) const = 0;
};
81 changes: 30 additions & 51 deletions include/infinicore_infer/models/jiuge.h
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
// model_jiuge.h

#ifndef MODEL_JIUGE_H
#define MODEL_JIUGE_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stdint.h>

// --- 修改:添加 extern "C" 以保证 C 链接性,保持风格统一 ---
#ifdef __cplusplus
extern "C" {
#endif

// Forward declarations
struct JiugeModel;
struct KVCache;

// Meta and Weights structs remain the same...
typedef struct
{
infiniDtype_t dt_logits;
Expand Down Expand Up @@ -45,68 +54,38 @@ typedef struct
const void *const *ffn_down;
} JiugeWeights;

//////////////////// APIs ///////////////////////

/// @brief 创建模型
/// @param device 协处理器种类
/// @param ndev 协处理器数量
/// @param dev_ids 协处理器编号,长度为 ndev
__C __export struct JiugeModel *
createJiugeModel(const JiugeMeta *,
const JiugeWeights *,
infiniDevice_t device,
int ndev,
const int *dev_ids);
__C __export struct JiugeModel*
createJiugeModel(const JiugeMeta*, const JiugeWeights*, infiniDevice_t device, int ndev, const int* dev_ids);

/// @brief 销毁模型
__C __export void
destroyJiugeModel(struct JiugeModel *);
destroyJiugeModel(struct JiugeModel*);

/// @brief 创建 KV Cache
__C __export struct KVCache *
createKVCache(const struct JiugeModel *);
__C __export struct KVCache*
createJiugeKVCache(const struct JiugeModel*);

/// @brief 复制 KV Cache
__C __export struct KVCache *
duplicateKVCache(const struct JiugeModel *,
const struct KVCache *, uint32_t seq_len);
__C __export struct KVCache*
duplicateJiugeKVCache(const struct JiugeModel*, const struct KVCache*, uint32_t seq_len);

/// @brief 销毁 KV Cache
__C __export void
dropKVCache(const struct JiugeModel *,
struct KVCache *);

/// @brief 批次推理一轮,并采样出新的 token
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param temperature 采样温度(0. 表示贪心采样)
/// @param topk 采样 topk(1 表示贪心采样)
/// @param topp 采样 topp
/// @param output 输出 token 数组,每个请求一个输出,长度至少为nreq
__C __export void
inferBatch(struct JiugeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output);
dropJiugeKVCache(const struct JiugeModel*, struct KVCache*);

/// @brief 批次推理一轮,输出 output embedding 后的 logits
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param logits 输出 token 数组,每个请求一个输出,长度至少为nreq
/// @brief 批次推理一轮
__C __export void
forwardBatch(struct JiugeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
void *logits);
inferJiugeBatch(struct JiugeModel*,
const uint32_t* tokens, uint32_t ntok,
const uint32_t* req_lens, uint32_t nreq, const uint32_t* req_pos,
struct KVCache** kv_caches,
const float* temperature, const uint32_t* topk, const float* topp,
uint32_t* output);

#ifdef __cplusplus
}
#endif

#endif // MODEL_JIUGE_H
96 changes: 96 additions & 0 deletions include/infinicore_infer/models/qwen.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#ifndef MODEL_QWEN_H
#define MODEL_QWEN_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Forward declarations; definitions live in the implementation.
struct QwenModel;
struct KVCache; // NOTE(review): assumed to be a generic, model-agnostic cache struct — confirm

// Model hyperparameters / metadata.
typedef struct
{
infiniDtype_t dt_logits; // data type of the output logits
// nlayer: layer count, d: hidden size, nh: attention heads, nkvh: KV heads,
// dh: head dim, di: FFN intermediate size, dctx: max context, dvoc: vocab size
size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
float epsilon, theta; // presumably norm epsilon and RoPE base — confirm
uint32_t end_token;   // end-of-sequence token id
} QwenMeta;

// Weight pointers. Shapes are given per field; pointers reference memory
// owned by the caller (this header transfers no ownership).
typedef struct
{
size_t nlayer;
infiniDtype_t dt_norm, dt_mat;
// 0 if linear weights are passed as W, any other value if passed as W^T (default format in pytorch)
int transpose_linear_weights;
// [dvoc, d]
const void *input_embd;
// [d]
const void *output_norm;
// [dvoc, d]
const void *output_embd;
// nlayer * [d]
const void *const *attn_norm;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d]
const void *const *attn_qkv;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh]
const void *const *attn_qkv_b;
// nlayer * [dh]
const void *const *attn_q_norm;
// nlayer * [dh]
const void *const *attn_k_norm;
// nlayer * [ndev, d, nkvh / ndev * dh]
const void *const *attn_o;
// nlayer * [d]
const void *const *ffn_norm;
// nlayer * [ndev, 2 * di / ndev, d]
const void *const *ffn_gate_up;
// nlayer * [ndev, d, di / ndev]
const void *const *ffn_down;
} QwenWeights;


//////////////////// APIs ///////////////////////
/// @brief Create a Qwen model.
/// @param device accelerator kind
/// @param ndev   number of devices
/// @param dev_ids device ids, length ndev
__C __export struct QwenModel*
createQwenModel(const QwenMeta*, const QwenWeights*, infiniDevice_t device, int ndev, const int* dev_ids);

/// @brief Destroy a model created by createQwenModel().
__C __export void
destroyQwenModel(struct QwenModel*);

/// @brief Create a KV cache sized for this model.
__C __export struct KVCache*
createQwenKVCache(const struct QwenModel*);

/// @brief Duplicate a KV cache (e.g. for beam search); seq_len presumably
///        bounds how many cached positions are copied — TODO confirm.
__C __export struct KVCache*
duplicateQwenKVCache(const struct QwenModel*, const struct KVCache*, uint32_t seq_len);

/// @brief Destroy a KV cache created for this model.
__C __export void
dropQwenKVCache(const struct QwenModel*, struct KVCache*);

/// @brief Run one batched inference step and sample one new token per request.
/// @param tokens      flattened input tokens of all requests
/// @param ntok        total number of input tokens
/// @param req_lens    per-request token counts, length nreq
/// @param nreq        number of requests
/// @param req_pos     per-request start positions
/// @param kv_caches   per-request KV caches, length nreq
/// @param temperature per-request sampling temperature (0. = greedy)
/// @param topk        per-request top-k (1 = greedy)
/// @param topp        per-request top-p
/// @param output      output tokens, one per request, length >= nreq
__C __export void
inferQwenBatch(struct QwenModel*,
const uint32_t* tokens, uint32_t ntok,
const uint32_t* req_lens, uint32_t nreq, const uint32_t* req_pos,
struct KVCache** kv_caches,
const float* temperature, const uint32_t* topk, const float* topp,
uint32_t* output);

#ifdef __cplusplus
}
#endif

#endif // MODEL_QWEN_H
112 changes: 112 additions & 0 deletions include/infinicore_infer/models/qwen_moe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#ifndef MODEL_QWEN_MOE_H
#define MODEL_QWEN_MOE_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Forward declaration for the new MoE model handle
struct QwenMoeModel;
// KVCache struct can be reused if its definition is generic enough,
// otherwise it should also be specialized. Assuming it's generic for now.
struct KVCache;


// Renamed and specialized Meta struct for MoE
typedef struct
{
// --- Standard Fields (same as dense model) ---
infiniDtype_t dt_logits;
size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
float epsilon, theta;
uint32_t end_token;

// --- New MoE-Specific Fields ---
size_t num_experts; // Total number of experts per layer
size_t num_experts_per_tok; // Number of active experts per token
size_t moe_intermediate_size; // Intermediate size of a single expert's FFN
int norm_topk_prob; // Flag (0 or 1) for routing logic

} QwenMoeMeta;

// Renamed and redesigned Weights struct for MoE
typedef struct
{
// --- Standard Fields (same as dense model) ---
size_t nlayer;
infiniDtype_t dt_norm, dt_mat;
int transpose_linear_weights;
const void *input_embd; // [dvoc, d]
const void *output_norm; // [d]
const void *output_embd; // [dvoc, d]

// --- Attention Block (same as dense model) ---
const void *const *attn_norm; // nlayer * [d]
const void *const *attn_qkv; // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d]
const void *const *attn_qkv_b; // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh]
const void *const *attn_q_norm; // nlayer * [dh]
const void *const *attn_k_norm; // nlayer * [dh]
const void *const *attn_o; // nlayer * [ndev, d, nkvh / ndev * dh]

// --- MoE Block (replaces dense FFN) ---
const void *const *ffn_norm; // Still needed: nlayer * [d] (post_attention_layernorm)

// Pointers for the Gating Network in each layer
const void *const *moe_gate; // nlayer * [num_experts, d]

// Pointers for the Experts. These point to flattened arrays of pointers.
// The total length of each array is (nlayer * num_experts).
// Access in C++ via: array[layer_idx * num_experts + expert_idx]
const void *const *moe_experts_gate_up; // Flat array of pointers to each expert's gate_up/swiglu weights
const void *const *moe_experts_down; // Flat array of pointers to each expert's down_proj weights

} QwenMoeWeights;


//////////////////// New MoE APIs ///////////////////////
/// @brief Create a MoE model.
/// @param device accelerator kind
/// @param ndev   number of devices
/// @param dev_ids device ids, length ndev
__C __export struct QwenMoeModel *
createQwenMoeModel(const QwenMoeMeta *,
const QwenMoeWeights *,
infiniDevice_t device,
int ndev,
const int *dev_ids);

/// @brief Destroy a model created by createQwenMoeModel().
__C __export void
destroyQwenMoeModel(struct QwenMoeModel *);

/// @brief Create a KV cache sized for this MoE model.
__C __export struct KVCache *
createQwenMoeKVCache(const struct QwenMoeModel *);

/// @brief Duplicate a KV cache for this MoE model; seq_len presumably
///        bounds how many cached positions are copied — TODO confirm.
__C __export struct KVCache *
duplicateQwenMoeKVCache(const struct QwenMoeModel *,
const struct KVCache *, uint32_t seq_len);

/// @brief Destroy a KV cache created for this MoE model.
__C __export void
dropQwenMoeKVCache(const struct QwenMoeModel *,
struct KVCache *);

/// @brief Run one batched MoE inference step and sample one new token per request.
/// @param tokens      flattened input tokens of all requests
/// @param ntok        total number of input tokens
/// @param req_lens    per-request token counts, length nreq
/// @param nreq        number of requests
/// @param req_pos     per-request start positions
/// @param kv_caches   per-request KV caches, length nreq
/// @param temperature per-request sampling temperature (0. = greedy)
/// @param topk        per-request top-k (1 = greedy)
/// @param topp        per-request top-p
/// @param output      output tokens, one per request, length >= nreq
__C __export void
inferQwenMoeBatch(struct QwenMoeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output);

#ifdef __cplusplus
}
#endif

#endif // MODEL_QWEN_MOE_H
Loading