Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,7 @@ cache/
*.txt

*.http
.DS_Store
**/.DS_Store
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ xmake && xmake install
- 运行模型推理测试

```bash
python scripts/jiuge.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] path/to/model_dir [n_device]
python scripts/qwen.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore] path/to/model_dir [n_device]
```

- 部署模型推理服务
Expand All @@ -34,4 +34,4 @@ python scripts/test_perf.py

```bash
python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
```
```
6 changes: 4 additions & 2 deletions include/infinicore_infer.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#ifndef INFINICORE_INFER_H
#define INFINICORE_INFER_H

// 为了解决函数命名冲突,我们需要调整包含顺序并添加命名空间
#include "infinicore_infer/models/qwen.h"
#include "infinicore_infer/models/qwen_moe.h"
#include "infinicore_infer/models/jiuge.h"



#endif /* INFINICORE_INFER_H */
#endif /* INFINICORE_INFER_H */
24 changes: 24 additions & 0 deletions include/infinicore_infer/imodel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// imodel.h
#pragma once

// Forward declaration only — this interface does not need KVCache's layout.
struct KVCache;

// Common interface that every model implementation must satisfy.
class IModel {
public:
// An interface class must have a virtual destructor so that deleting a
// derived model through an IModel* runs the derived destructor.
virtual ~IModel() = default;

// All operations below are pure virtual (= 0): any class inheriting
// from IModel must provide its own implementation of each of them.

// Create a KVCache structure suitable for this model.
// NOTE(review): the returned cache appears to be owned by the caller and
// released via dropKVCache() — confirm against the implementations.
virtual KVCache* createKVCache() const = 0;

// Duplicate a KVCache (e.g. for beam search). seq_len presumably bounds
// how many cached positions are copied — TODO confirm.
virtual KVCache* duplicateKVCache(const KVCache* cache, unsigned int seq_len) const = 0;

// Destroy a KVCache previously obtained from this model.
virtual void dropKVCache(KVCache* cache) const = 0;
};
81 changes: 30 additions & 51 deletions include/infinicore_infer/models/jiuge.h
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
// model_jiuge.h

#ifndef MODEL_JIUGE_H
#define MODEL_JIUGE_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stdint.h>

// --- 修改:添加 extern "C" 以保证 C 链接性,保持风格统一 ---
#ifdef __cplusplus
extern "C" {
#endif

// Forward declarations
struct JiugeModel;
struct KVCache;

// Meta and Weights structs remain the same...
typedef struct
{
infiniDtype_t dt_logits;
Expand Down Expand Up @@ -45,68 +54,38 @@ typedef struct
const void *const *ffn_down;
} JiugeWeights;

//////////////////// APIs ///////////////////////

/// @brief 创建模型
/// @param device 协处理器种类
/// @param ndev 协处理器数量
/// @param dev_ids 协处理器编号,长度为 ndev
__C __export struct JiugeModel *
createJiugeModel(const JiugeMeta *,
const JiugeWeights *,
infiniDevice_t device,
int ndev,
const int *dev_ids);
__C __export struct JiugeModel*
createJiugeModel(const JiugeMeta*, const JiugeWeights*, infiniDevice_t device, int ndev, const int* dev_ids);

/// @brief 销毁模型
__C __export void
destroyJiugeModel(struct JiugeModel *);
destroyJiugeModel(struct JiugeModel*);

/// @brief 创建 KV Cache
__C __export struct KVCache *
createKVCache(const struct JiugeModel *);
__C __export struct KVCache*
createJiugeKVCache(const struct JiugeModel*);

/// @brief 复制 KV Cache
__C __export struct KVCache *
duplicateKVCache(const struct JiugeModel *,
const struct KVCache *, uint32_t seq_len);
__C __export struct KVCache*
duplicateJiugeKVCache(const struct JiugeModel*, const struct KVCache*, uint32_t seq_len);

/// @brief 销毁 KV Cache
__C __export void
dropKVCache(const struct JiugeModel *,
struct KVCache *);

/// @brief 批次推理一轮,并采样出新的 token
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param temperature 采样温度(0. 表示贪心采样)
/// @param topk 采样 topk(1 表示贪心采样)
/// @param topp 采样 topp
/// @param output 输出 token 数组,每个请求一个输出,长度至少为nreq
__C __export void
inferBatch(struct JiugeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output);
dropJiugeKVCache(const struct JiugeModel*, struct KVCache*);

/// @brief 批次推理一轮,输出 output embedding 后的 logits
/// @param tokens 输入 token 地址
/// @param ntok 输入 token 数量
/// @param nreq 请求数量
/// @param req_lens 每个请求的 token 数量
/// @param req_pos 每个请求的起始位置
/// @param kv_caches 每个请求的 KV Cache
/// @param logits 输出 token 数组,每个请求一个输出,长度至少为nreq
/// @brief 批次推理一轮
__C __export void
forwardBatch(struct JiugeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
void *logits);
inferJiugeBatch(struct JiugeModel*,
const uint32_t* tokens, uint32_t ntok,
const uint32_t* req_lens, uint32_t nreq, const uint32_t* req_pos,
struct KVCache** kv_caches,
const float* temperature, const uint32_t* topk, const float* topp,
uint32_t* output);

#ifdef __cplusplus
}
#endif

#endif // MODEL_JIUGE_H
96 changes: 96 additions & 0 deletions include/infinicore_infer/models/qwen.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#ifndef MODEL_QWEN_H
#define MODEL_QWEN_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Forward declarations; definitions live in the implementation.
struct QwenModel;
struct KVCache; // NOTE(review): assumed to be a generic, model-agnostic cache struct — confirm

// Model hyperparameters / metadata.
typedef struct
{
infiniDtype_t dt_logits; // data type of the output logits
// nlayer: layer count, d: hidden size, nh: attention heads, nkvh: KV heads,
// dh: head dim, di: FFN intermediate size, dctx: max context, dvoc: vocab size
size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
float epsilon, theta; // presumably norm epsilon and RoPE base — confirm
uint32_t end_token;   // end-of-sequence token id
} QwenMeta;

// Weight pointers. Shapes are given per field; pointers reference memory
// owned by the caller (this header transfers no ownership).
typedef struct
{
size_t nlayer;
infiniDtype_t dt_norm, dt_mat;
// 0 if linear weights are passed as W, any other value if passed as W^T (default format in pytorch)
int transpose_linear_weights;
// [dvoc, d]
const void *input_embd;
// [d]
const void *output_norm;
// [dvoc, d]
const void *output_embd;
// nlayer * [d]
const void *const *attn_norm;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d]
const void *const *attn_qkv;
// nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh]
const void *const *attn_qkv_b;
// nlayer * [dh]
const void *const *attn_q_norm;
// nlayer * [dh]
const void *const *attn_k_norm;
// nlayer * [ndev, d, nkvh / ndev * dh]
const void *const *attn_o;
// nlayer * [d]
const void *const *ffn_norm;
// nlayer * [ndev, 2 * di / ndev, d]
const void *const *ffn_gate_up;
// nlayer * [ndev, d, di / ndev]
const void *const *ffn_down;
} QwenWeights;


//////////////////// APIs ///////////////////////
/// @brief Create a Qwen model.
/// @param device accelerator kind
/// @param ndev   number of devices
/// @param dev_ids device ids, length ndev
__C __export struct QwenModel*
createQwenModel(const QwenMeta*, const QwenWeights*, infiniDevice_t device, int ndev, const int* dev_ids);

/// @brief Destroy a model created by createQwenModel().
__C __export void
destroyQwenModel(struct QwenModel*);

/// @brief Create a KV cache sized for this model.
__C __export struct KVCache*
createQwenKVCache(const struct QwenModel*);

/// @brief Duplicate a KV cache (e.g. for beam search); seq_len presumably
///        bounds how many cached positions are copied — TODO confirm.
__C __export struct KVCache*
duplicateQwenKVCache(const struct QwenModel*, const struct KVCache*, uint32_t seq_len);

/// @brief Destroy a KV cache created for this model.
__C __export void
dropQwenKVCache(const struct QwenModel*, struct KVCache*);

/// @brief Run one batched inference step and sample one new token per request.
/// @param tokens      flattened input tokens of all requests
/// @param ntok        total number of input tokens
/// @param req_lens    per-request token counts, length nreq
/// @param nreq        number of requests
/// @param req_pos     per-request start positions
/// @param kv_caches   per-request KV caches, length nreq
/// @param temperature per-request sampling temperature (0. = greedy)
/// @param topk        per-request top-k (1 = greedy)
/// @param topp        per-request top-p
/// @param output      output tokens, one per request, length >= nreq
__C __export void
inferQwenBatch(struct QwenModel*,
const uint32_t* tokens, uint32_t ntok,
const uint32_t* req_lens, uint32_t nreq, const uint32_t* req_pos,
struct KVCache** kv_caches,
const float* temperature, const uint32_t* topk, const float* topp,
uint32_t* output);

#ifdef __cplusplus
}
#endif

#endif // MODEL_QWEN_H
112 changes: 112 additions & 0 deletions include/infinicore_infer/models/qwen_moe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#ifndef MODEL_QWEN_MOE_H
#define MODEL_QWEN_MOE_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Forward declaration for the new MoE model handle
struct QwenMoeModel;
// KVCache struct can be reused if its definition is generic enough,
// otherwise it should also be specialized. Assuming it's generic for now.
struct KVCache;


// Renamed and specialized Meta struct for MoE
typedef struct
{
// --- Standard Fields (same as dense model) ---
infiniDtype_t dt_logits;
size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
float epsilon, theta;
uint32_t end_token;

// --- New MoE-Specific Fields ---
size_t num_experts; // Total number of experts per layer
size_t num_experts_per_tok; // Number of active experts per token
size_t moe_intermediate_size; // Intermediate size of a single expert's FFN
int norm_topk_prob; // Flag (0 or 1) for routing logic

} QwenMoeMeta;

// Renamed and redesigned Weights struct for MoE
typedef struct
{
// --- Standard Fields (same as dense model) ---
size_t nlayer;
infiniDtype_t dt_norm, dt_mat;
int transpose_linear_weights;
const void *input_embd; // [dvoc, d]
const void *output_norm; // [d]
const void *output_embd; // [dvoc, d]

// --- Attention Block (same as dense model) ---
const void *const *attn_norm; // nlayer * [d]
const void *const *attn_qkv; // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh, d]
const void *const *attn_qkv_b; // nlayer * [ndev, (nh + 2 * nkvh) / ndev * dh]
const void *const *attn_q_norm; // nlayer * [dh]
const void *const *attn_k_norm; // nlayer * [dh]
const void *const *attn_o; // nlayer * [ndev, d, nkvh / ndev * dh]

// --- MoE Block (replaces dense FFN) ---
const void *const *ffn_norm; // Still needed: nlayer * [d] (post_attention_layernorm)

// Pointers for the Gating Network in each layer
const void *const *moe_gate; // nlayer * [num_experts, d]

// Pointers for the Experts. These point to flattened arrays of pointers.
// The total length of each array is (nlayer * num_experts).
// Access in C++ via: array[layer_idx * num_experts + expert_idx]
const void *const *moe_experts_gate_up; // Flat array of pointers to each expert's gate_up/swiglu weights
const void *const *moe_experts_down; // Flat array of pointers to each expert's down_proj weights

} QwenMoeWeights;


//////////////////// New MoE APIs ///////////////////////
/// @brief Create a MoE model.
/// @param device accelerator kind
/// @param ndev   number of devices
/// @param dev_ids device ids, length ndev
__C __export struct QwenMoeModel *
createQwenMoeModel(const QwenMoeMeta *,
const QwenMoeWeights *,
infiniDevice_t device,
int ndev,
const int *dev_ids);

/// @brief Destroy a model created by createQwenMoeModel().
__C __export void
destroyQwenMoeModel(struct QwenMoeModel *);

/// @brief Create a KV cache sized for this MoE model.
__C __export struct KVCache *
createQwenMoeKVCache(const struct QwenMoeModel *);

/// @brief Duplicate a KV cache for this MoE model; seq_len presumably
///        bounds how many cached positions are copied — TODO confirm.
__C __export struct KVCache *
duplicateQwenMoeKVCache(const struct QwenMoeModel *,
const struct KVCache *, uint32_t seq_len);

/// @brief Destroy a KV cache created for this MoE model.
__C __export void
dropQwenMoeKVCache(const struct QwenMoeModel *,
struct KVCache *);

/// @brief Run one batched MoE inference step and sample one new token per request.
/// @param tokens      flattened input tokens of all requests
/// @param ntok        total number of input tokens
/// @param req_lens    per-request token counts, length nreq
/// @param nreq        number of requests
/// @param req_pos     per-request start positions
/// @param kv_caches   per-request KV caches, length nreq
/// @param temperature per-request sampling temperature (0. = greedy)
/// @param topk        per-request top-k (1 = greedy)
/// @param topp        per-request top-p
/// @param output      output tokens, one per request, length >= nreq
__C __export void
inferQwenMoeBatch(struct QwenMoeModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output);

#ifdef __cplusplus
}
#endif

#endif // MODEL_QWEN_MOE_H
Loading