Closed
36 commits
e581b7d
fix kernel output extract (#5212)
freeliuzc Nov 25, 2025
a11d17c
[Speculative Decoding][Cherry Pick]Update extract_mtp_weight script a…
freeliuzc Nov 25, 2025
e0c7ebf
[BugFix][Cherry Pick] fix ds type bug (#5220)
kevincheng2 Nov 25, 2025
49be443
[Cherry-Pick][CI] Add check trigger and logic(#5191) (#5227)
EmmonsCurse Nov 26, 2025
7107533
[Cherry-Pick] Fix eplb noaux(#5239) (#5240)
xiaoxiaohehe001 Nov 26, 2025
bdcc952
fix pd-split first step bug (#5246)
freeliuzc Nov 26, 2025
3d74a4b
[Cherry-Pick] MTP split draft_tokens into standalone post-processing …
sunlei1024 Nov 27, 2025
69b4d05
cp_fix_bug (#5253)
kevincheng2 Nov 27, 2025
9b0c65b
Add method to disable sequence parallel MoE if needed (#5268)
yuanlehome Nov 27, 2025
fd1313c
[Cherry-Pick][Feature] support flash_mask_attention backend(#5134) (…
lizhenyun01 Nov 28, 2025
89ed1a9
[Cherry-pick][XPU][CI] Set pip index URL to Tsinghua mirror (#5277) …
plusNew001 Nov 28, 2025
b990644
Update load_weight_utils.py (#5285)
yuanlehome Nov 28, 2025
f1e1f5d
fix mm to_dict bug (#5299)
kevincheng2 Nov 29, 2025
04b2c43
[Optimization] 1.fix tp+ep moe_forward; 2.set max_prefill_batch=env.M…
carryyu Dec 2, 2025
cae2c1c
supports mtp split_kv_attn (#5344)
carryyu Dec 3, 2025
9b5b08c
[Cherry-Pick][BugFix] Fix async download(#5349) (#5347)
kevincheng2 Dec 5, 2025
f08fb25
[Others] Maintain the mtp branch temporarily. (#5447)
carryyu Dec 9, 2025
c5973c2
fix limit_thinking bug (#5477)
yuanlehome Dec 10, 2025
6715196
fix attention bug in spec decoding (#5480)
freeliuzc Dec 10, 2025
f133ce5
[CI] disable test_cuda_graph_dynamic_subgraph.py in unit_test
EmmonsCurse Dec 11, 2025
4c76171
[Optimize][Cherry-pick] Robust stabilty for PD deployment #5338 (#5395)
rainyfly Dec 15, 2025
77ff0cb
[Cherry-Pick][Quantization][BugFix] Support w4afp8 dynamic quant(#528…
Sunny-bot1 Dec 16, 2025
531b96a
[Cherry-Pick][CI] Adape unit_test due to Paddle update(#5576) (#5589)
EmmonsCurse Dec 16, 2025
e65000a
[Cherry-Pick][BugFix] fix speculate_limit_thinking_content_length #55…
yuanlehome Dec 17, 2025
52280be
[Speculative Decoding]Support multi-step mtp with cudagraph (#5624) (…
freeliuzc Dec 23, 2025
cfddec7
[Quantization][Cherry-Pick] Support w4afp8 moe weight offline permute…
Sunny-bot1 Dec 23, 2025
1b74540
fix eplb weight updating (#5529) (#5661)
RichardWooSJTU Dec 23, 2025
db774a6
[Cherry-Pick][CI] Revert adapt vl_model baseline changes due to Paddl…
EmmonsCurse Dec 24, 2025
b018c49
[Cherry-Pick][CI]Fix multistep MTP in splitewise-prefill mode (#5723)…
freeliuzc Dec 24, 2025
d054cf6
commit (#5791)
zhoutianzi666 Dec 26, 2025
19a625a
[Speculative Decoding] Fix attn_mask_offset for multi-step MTP in mix…
freeliuzc Dec 26, 2025
9807f2b
[CI] Remove useless cases in 1131 and fix XPU (#5801)
EmmonsCurse Dec 26, 2025
77fa137
[Feature] two chunk overlap (#5754)
zhoutianzi666 Dec 30, 2025
44cbf2e
fix quant (#5837)
Sunny-bot1 Dec 31, 2025
7aea651
add del to decrease peak memory (#5862)
zhoutianzi666 Jan 5, 2026
20ef041
[Cherry-Pick] Support redundant expert for eplb
xiaoxiaohehe001 Jan 7, 2026
2 changes: 1 addition & 1 deletion .github/workflows/CheckPRTemplate.yml
@@ -4,7 +4,7 @@ on:
pull_request:
branches:
- develop
- 'release/*'
- 'release/**'

jobs:
check:
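Note: in GitHub Actions branch filters, '*' matches zero or more characters but not '/', so 'release/*' only covers single-segment branch names such as release/2.3, while 'release/**' also matches nested names like release/2.3/fix. The same widening is applied to Codestyle-Check.yml, approve.yml, and ci_xpu.yml below.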
2 changes: 1 addition & 1 deletion .github/workflows/Codestyle-Check.yml
@@ -4,7 +4,7 @@ on:
pull_request:
branches:
- develop
- 'release/*'
- 'release/**'

jobs:
pre-commit:
2 changes: 1 addition & 1 deletion .github/workflows/approve.yml
@@ -4,7 +4,7 @@ on:
pull_request:
branches:
- develop
- 'release/*'
- 'release/**'

env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
6 changes: 3 additions & 3 deletions .github/workflows/ci_xpu.yml
@@ -4,7 +4,7 @@ on:
pull_request:
branches:
- develop
- 'release/*'
- 'release/**'
paths-ignore:
- '**.md'
- '**.txt'
@@ -28,7 +28,7 @@ jobs:

- name: Code Checkout
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.2.0
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
run: |
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
@@ -59,7 +59,7 @@ jobs:

- name: Run CI unittest
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.2.0
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
2 changes: 0 additions & 2 deletions custom_ops/gpu_ops/append_attn/append_attention_func.cuh
@@ -2470,8 +2470,6 @@ __global__ void merge_multi_chunks_v2_kernel(
const int num_chunks_this_seq = div_up(seq_len_kv, chunk_size);
if (num_chunks_this_seq <= 1) {
continue;
} else if (!ENABLE_PREFILL) {
continue;
}

using LoadT = AlignedVector<T, vec_size>;
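Note: with the '!ENABLE_PREFILL' early-continue removed, merge_multi_chunks_v2_kernel now merges partial attention results whenever a sequence spans more than one chunk, regardless of the ENABLE_PREFILL template flag; this appears to go together with the chunk-size changes further down, where the multi-chunk split is always used.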
143 changes: 110 additions & 33 deletions custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu
@@ -24,6 +24,8 @@ __global__ void GQAVariableLengthRotarySplitKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const float *q_norm_weight,
const float *k_norm_weight,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
@@ -38,37 +40,46 @@ __global__ void GQAVariableLengthRotarySplitKernel(
const int kv_num_head,
const int seq_len,
const int last_dim,
const bool rope_3d) {
const bool rope_3d,
const float rms_norm_eps) {
using LoadT = AlignedVector<T, VecSize>;
constexpr int HalfVecSize = VecSize / 2;
using LoadEmbT = AlignedVector<float, HalfVecSize>;
using LoadFloat = AlignedVector<float, VecSize>;
LoadT src_vec;
LoadEmbT cos_emb_vec;
LoadEmbT sin_emb_vec;
int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
LoadFloat tmp_vec;
LoadFloat q_norm_vec, k_norm_vec;
int64_t global_warp_idx = blockDim.y * blockIdx.x + threadIdx.y;
int64_t all_warp_num = gridDim.x * blockDim.y;
const int half_lastdim = last_dim / 2;
const int offset = (q_num_head + kv_num_head * 2) * last_dim;
for (int64_t linear_index = global_thread_idx * VecSize,
step = gridDim.x * blockDim.x * VecSize;
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int offset =
(q_num_head + kv_num_head * 2) * last_dim; // for all q,k,v
const int all_head_num = elem_cnt / last_dim;
for (int gloabl_hi = global_warp_idx; gloabl_hi < all_head_num;
gloabl_hi += all_warp_num) {
int64_t linear_index =
gloabl_hi * last_dim + threadIdx.x * VecSize;  // global element index
const int token_idx =
linear_index / offset;  // token index (which token, before the q/k/v split)
const int ori_bi = batch_id_per_token[token_idx];  // which batch this token belongs to
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;

const int ori_seq_id =
(token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id;

const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
int64_t new_emb_idx =
rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
(token_idx - cu_seqlens_q[ori_bi]) +
seq_lens_decoder[ori_bi];  // position within the current sequence (valid when sequences are concatenated into one batch)
const int64_t emb_idx =
ori_seq_id * half_lastdim + h_bias / 2;  // rotary embedding index
const int64_t base_idx =
token_idx * (q_num_head + 2 * kv_num_head) * last_dim + hi * last_dim +
h_bias;
Load<T, VecSize>(&qkv[base_idx], &src_vec);
const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id;
int64_t base_split_idx;
T *out_p = nullptr;
if (hi < q_num_head) {
@@ -84,21 +95,67 @@
base_split_idx = kv_write_idx * kv_num_head * last_dim +
(hi - q_num_head - kv_num_head) * last_dim + h_bias;
}
Load<T, VecSize>(&qkv[base_idx], &src_vec);
// do rope
if (hi < q_num_head + kv_num_head) {
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);

// TODO: check whether this is correct
int64_t new_emb_idx =
rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
float thread_m2 = 0.0f;
float warp_m2 = 0.0f;

if (q_norm_weight && k_norm_weight) {
if (hi < q_num_head + kv_num_head) { // only q and k need rope
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
const float input_left = static_cast<float>(src_vec[2 * i]);
const float input_right = static_cast<float>(src_vec[2 * i + 1]);
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
float tmp1 = input_left * cos_tmp - input_right * sin_tmp;
float tmp2 = input_right * cos_tmp + input_left * sin_tmp;
tmp_vec[2 * i] = tmp1;
tmp_vec[2 * i + 1] = tmp2;
thread_m2 += tmp1 * tmp1 + tmp2 * tmp2;
}
}
WelfordWarpAllReduce<float, 32>(thread_m2, &warp_m2);  // warp-reduce this head's sum of squares (for RMSNorm)

if (hi < q_num_head + kv_num_head) { // only q and k need norm
float row_variance = max(warp_m2 / last_dim, 0.0f);
float row_inv_var = Rsqrt(row_variance + rms_norm_eps);
if (hi < q_num_head) {
Load<float, VecSize>(&q_norm_weight[threadIdx.x * VecSize],
&q_norm_vec);
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
const float input_left = static_cast<float>(src_vec[2 * i]);
const float input_right = static_cast<float>(src_vec[2 * i + 1]);
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
src_vec[2 * i] =
static_cast<T>(input_left * cos_tmp - input_right * sin_tmp);
src_vec[2 * i + 1] =
static_cast<T>(input_right * cos_tmp + input_left * sin_tmp);
for (int i = 0; i < VecSize; i++) {
src_vec[i] =
static_cast<T>(tmp_vec[i] * row_inv_var * q_norm_vec[i]);
}
} else {
Load<float, VecSize>(&k_norm_weight[threadIdx.x * VecSize],
&k_norm_vec);
for (int i = 0; i < VecSize; i++) {
src_vec[i] =
static_cast<T>(tmp_vec[i] * row_inv_var * k_norm_vec[i]);
}
}
}
} else {
if (hi < q_num_head + kv_num_head) {
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
const float input_left = static_cast<float>(src_vec[2 * i]);
const float input_right = static_cast<float>(src_vec[2 * i + 1]);
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
src_vec[2 * i] =
static_cast<T>(input_left * cos_tmp - input_right * sin_tmp);
src_vec[2 * i + 1] =
static_cast<T>(input_right * cos_tmp + input_left * sin_tmp);
}
}
}
Store<T, VecSize>(src_vec, &qkv_out[base_idx]);
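The kernel changes above fuse an optional per-head RMSNorm (QK-norm) into the RoPE/split path: q and k heads are rotated first, the warp accumulates the sum of squares of the rotated head, and the head is then scaled by rsqrt(mean-square + rms_norm_eps) times q_norm_weight or k_norm_weight. A minimal host-side C++ sketch of that per-head math follows; the function name rope_then_rmsnorm and its vector-based signature are illustrative, not part of the source.

#include <cmath>
#include <vector>

// Reference (non-CUDA) version of the fused path: rotate one head with RoPE,
// then RMS-normalize it and scale by the per-channel norm weight, mirroring
// the q/k branch of GQAVariableLengthRotarySplitKernel above.
std::vector<float> rope_then_rmsnorm(const std::vector<float>& head,         // [head_dim]
                                     const std::vector<float>& cos_emb,      // [head_dim / 2]
                                     const std::vector<float>& sin_emb,      // [head_dim / 2]
                                     const std::vector<float>& norm_weight,  // [head_dim]
                                     float rms_norm_eps) {
  const size_t head_dim = head.size();
  std::vector<float> out(head_dim);
  float sum_sq = 0.0f;  // plays the role of thread_m2 / warp_m2 in the kernel
  for (size_t i = 0; i < head_dim / 2; ++i) {
    const float x0 = head[2 * i];
    const float x1 = head[2 * i + 1];
    out[2 * i] = x0 * cos_emb[i] - x1 * sin_emb[i];
    out[2 * i + 1] = x1 * cos_emb[i] + x0 * sin_emb[i];
    sum_sq += out[2 * i] * out[2 * i] + out[2 * i + 1] * out[2 * i + 1];
  }
  // RMSNorm over the whole head: y = x * rsqrt(mean(x^2) + eps) * weight
  const float inv_rms = 1.0f / std::sqrt(sum_sq / head_dim + rms_norm_eps);
  for (size_t i = 0; i < head_dim; ++i) {
    out[i] *= inv_rms * norm_weight[i];
  }
  return out;
}

When q_norm_weight/k_norm_weight are not passed, the else branch keeps the pre-change behaviour (plain RoPE for q and k); v heads are only split and written out, untouched by either step.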
@@ -114,6 +171,8 @@ void gqa_rotary_qk_split_variable(
T *v,
const T *qkv_input,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const float *q_norm_weight,
const float *k_norm_weight,
const int *batch_id_per_token,
const int *seq_lens_encoder,
const int *seq_lens_decoder,
@@ -126,24 +185,31 @@ void gqa_rotary_qk_split_variable(
const int input_output_len,
const int dim_head,
const bool rope_3d,
const float rms_norm_eps,
const cudaStream_t &stream) {
assert(dim_head == 128 && "dim_head must be 128");
int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * dim_head;
constexpr int PackSize = 16 / sizeof(T);

constexpr int HEAD_DIM = 128;
constexpr int PackSize = HEAD_DIM / kWarpSize;
const int pack_num = elem_nums / PackSize;
const int blocksize = 128;
int grid_size = 1;
GetNumBlocks<128>(pack_num, &grid_size);
dim3 block_size(kWarpSize, blocksize / kWarpSize);

const float *cos_emb = rotary_emb;
const float *sin_emb = rotary_emb + input_output_len * dim_head / 2;
launchWithPdlWhenEnabled(GQAVariableLengthRotarySplitKernel<T, PackSize>,
grid_size,
blocksize,
block_size,
0,
stream,
qkv_input,
cos_emb,
sin_emb,
q_norm_weight,
k_norm_weight,
batch_id_per_token,
cu_seqlens_q,
seq_lens_encoder,
Expand All @@ -158,7 +224,8 @@ void gqa_rotary_qk_split_variable(
kv_num_heads,
seq_len,
dim_head,
rope_3d);
rope_3d,
rms_norm_eps);
}

template <typename T,
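Note on the launch configuration above: PackSize changes from 16 / sizeof(T) to HEAD_DIM / kWarpSize = 128 / 32 = 4, and the block is reshaped to dim3(kWarpSize, blocksize / kWarpSize), i.e. 4 warps of 32 lanes per 128-thread block. Each warp then owns one whole head (32 lanes x 4 elements = 128), which is what lets the kernel's WelfordWarpAllReduce reduce a full head's sum of squares within a single warp; the added assert pins dim_head to 128 for this layout.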
@@ -1054,6 +1121,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::Tensor &cache_batch_ids,
const paddle::Tensor &cache_tile_ids,
const paddle::Tensor &cache_num_blocks,
const paddle::optional<paddle::Tensor> &q_norm_weight,
const paddle::optional<paddle::Tensor> &k_norm_weight,
const paddle::optional<paddle::Tensor> &cache_k_quant_scales,
const paddle::optional<paddle::Tensor> &cache_v_quant_scales,
const paddle::optional<paddle::Tensor> &cache_k_dequant_scales,
@@ -1063,6 +1132,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::optional<paddle::Tensor> &kv_signal_data,
const int kv_token_num,
const int max_seq_len,
const float rms_norm_eps,
const std::string &cache_quant_type,
const bool rope_3d) {
typedef PDTraits<paddle::DataType::BFLOAT16> traits_;
@@ -1113,6 +1183,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
v.data<data_t>(),
qkv.data<data_t>(),
rotary_embs.data<float>(),
q_norm_weight ? q_norm_weight.get().data<float>() : nullptr,
k_norm_weight ? k_norm_weight.get().data<float>() : nullptr,
batch_id_per_token.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
@@ -1125,6 +1197,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
rope_3d ? rotary_embs.dims()[3] : rotary_embs.dims()[2],
head_dim,
rope_3d,
rms_norm_eps,
stream);

if (token_num < kv_token_num) {
Expand Down Expand Up @@ -1259,6 +1332,8 @@ PD_BUILD_STATIC_OP(gqa_rope_write_cache)
"cache_batch_ids",
"cache_tile_ids_per_batch",
"cache_num_blocks",
paddle::Optional("q_norm_weight"),
paddle::Optional("k_norm_weight"),
paddle::Optional("cache_k_quant_scales"),
paddle::Optional("cache_v_quant_scales"),
paddle::Optional("cache_k_dequant_scales"),
@@ -1271,5 +1346,7 @@
{"value_cache", "value_cache_out"}})
.Attrs({"kv_token_num: int",
"max_seq_len: int",
"cache_quant_type: std::string"})
"rms_norm_eps: float",
"cache_quant_type: std::string",
"rope_3d: bool"})
.SetKernelFn(PD_KERNEL(GQARopeWriteCacheKernel));
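Taken together, the gqa_rope_write_cache op gains two optional inputs (q_norm_weight, k_norm_weight) and a new rms_norm_eps attribute, and rope_3d, which was already a parameter of GQARopeWriteCacheKernel, is now listed explicitly in the op's attribute declaration; call sites that build this op presumably need to pass the additional attribute values in this order.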
@@ -918,10 +918,7 @@ void MultiQueryAppendAttention(
int sm_count;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
uint32_t chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
const int num_chunks = div_up(max_dec_len, chunk_size);
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
dim3 blocks(32, num_warps);
@@ -1173,9 +1170,6 @@
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}

uint32_t attn_mask_len;
if (attn_mask) {
@@ -1108,10 +1108,7 @@ void MultiQueryAppendC4Attention(
const float ratio = static_cast<float>(num_blocks_need) /
static_cast<float>(num_blocks_per_wave);

uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
uint32_t chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
const int num_chunks = div_up(max_dec_len, chunk_size);

dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
@@ -1390,9 +1387,6 @@
static_cast<float>(num_blocks_per_wave);

uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}

const int num_chunks = div_up(max_seq_len, chunk_size);
uint32_t attn_mask_len;
@@ -1218,10 +1218,7 @@ void MultiQueryAppendC8Attention(
const int dev_id = 0;
int sm_count;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
uint32_t chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
const int num_chunks = div_up(max_dec_len, chunk_size);
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
dim3 blocks(32, num_warps);
@@ -1525,9 +1522,6 @@
int sm_count;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}

const int num_chunks = div_up(max_seq_len, chunk_size);
uint32_t attn_mask_len;
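The hunks above in MultiQueryAppendAttention, MultiQueryAppendC4Attention, and MultiQueryAppendC8Attention make the same simplification: the is_decoder branch around chunk_size is dropped, so the first call site in each file now always uses encoder_max_partition_size and the second always uses max_partition_size, with num_chunks still computed as div_up(max_dec_len or max_seq_len, chunk_size), presumably because these multi-chunk paths are no longer taken by the decoder.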