@@ -22,14 +22,17 @@ __global__ void quant_symm_row(
 #if TURBOMIND_ARCH_SM90
     static_assert(group_size % vec_size == 0);
     constexpr int threads = group_size / vec_size;
+    const int dim1 = round_up(dim, WARP_SIZE * vec_size);
     for (int ti = blockIdx.x; ti < num; ti += gridDim.x) {
-        for (int di = threadIdx.x * vec_size; di < dim; di += blockDim.x * vec_size) {
-            Array<T, vec_size> vec;
-            Ldg(vec, src + ti * src_ld + di);
+        for (int di = threadIdx.x * vec_size; di < dim1; di += blockDim.x * vec_size) {
+            Array<T, vec_size> vec{};
+            if (di < dim) {
+                Ldg(vec, src + ti * src_ld + di);
+            }
             auto absmax = fmaxf(static_cast<Tscale>(find_absmax<threads>(vec)), 1e-8f);
             const Tscale scale = absmax / qmax;
             const Tscale inv_scale = qmax / absmax;
-            if (threadIdx.x % threads == 0) {
+            if (threadIdx.x % threads == 0 && di < dim) {
                 // column-major
                 scales[(di / group_size) * scales_ld + ti] = scale;
             }
@@ -38,7 +41,9 @@ __global__ void quant_symm_row(
             for (int c = 0; c < vec_size; ++c) {
                 tmp[c] = Tout(static_cast<Tscale>(vec[c]) * inv_scale);
             }
-            Store(out + ti * out_ld + di, tmp);
+            if (di < dim) {
+                Store(out + ti * out_ld + di, tmp);
+            }
         }
     }
 #endif
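The two hunks above change the per-row loop bound from `dim` to `dim1 = round_up(dim, WARP_SIZE * vec_size)`, so every lane of a quantization group still reaches the warp-level `find_absmax` reduction when `dim` is not a multiple of the warp tile; out-of-range lanes contribute a zero-initialized `vec` and skip the actual load, scale write, and store via the `di < dim` guards. A minimal standalone sketch of the same pattern, with hypothetical names and plain float loads instead of TurboMind's `Ldg`/`Array` helpers (assuming `dim % vec_size == 0` and a block size that is a multiple of 32):

    #include <cstdint>
    #include <cuda_runtime.h>

    // Sketch only: per-row, group-wise absmax with a padded loop bound.
    template<int vec_size, int group_size>
    __global__ void rowwise_absmax_sketch(float* scales, const float* src, int num, int dim)
    {
        constexpr int threads = group_size / vec_size;  // lanes cooperating on one group
        static_assert(threads > 0 && 32 % threads == 0, "group must fit in one warp");
        const int step = blockDim.x * vec_size;
        const int dim1 = (dim + 32 * vec_size - 1) / (32 * vec_size) * (32 * vec_size);
        for (int ti = blockIdx.x; ti < num; ti += gridDim.x) {
            for (int di = threadIdx.x * vec_size; di < dim1; di += step) {
                float absmax = 0.f;
                if (di < dim) {  // guard the load; padded lanes keep absmax == 0
                    for (int c = 0; c < vec_size; ++c) {
                        absmax = fmaxf(absmax, fabsf(src[(int64_t)ti * dim + di + c]));
                    }
                }
                // every lane reaches the group reduction, so the shuffles stay converged
                for (int mask = threads / 2; mask > 0; mask /= 2) {
                    absmax = fmaxf(absmax, __shfl_xor_sync(0xffffffffu, absmax, mask));
                }
                if (threadIdx.x % threads == 0 && di < dim) {  // guard the store
                    const int groups = (dim + group_size - 1) / group_size;
                    scales[(int64_t)ti * groups + di / group_size] = fmaxf(absmax, 1e-8f);
                }
            }
        }
    }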
@@ -69,11 +74,13 @@ void QuantizeSymm(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st

     const int aligned_num = round_up<int>(num, alignment);

+    const int s_dim = cdiv<ssize_t>(dim, group_size);
+
     if (!scale) {
-        scale = Tensor_<Tscale>({{dim / group_size, num}, {aligned_num, 1}}, kDEVICE);
+        scale = Tensor_<Tscale>({{s_dim, num}, {aligned_num, 1}}, kDEVICE);
     }
     else {
-        TM_CHECK(std::make_tuple(dim / group_size, num) == scale.shapes(0, 1));
+        TM_CHECK(std::make_tuple(s_dim, num) == scale.shapes(0, 1));
         TM_CHECK(scale.stride(1) == 1);
         TM_CHECK(scale.stride(0) % alignment == 0);
     }
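Switching from `dim / group_size` to `cdiv(dim, group_size)` only matters when `dim` is not a multiple of `group_size`: floor division would under-allocate the scale tensor, and the kernel's guarded tail writes to `scales[(di / group_size) * scales_ld + ti]` would otherwise land outside it. A small host-side illustration of the difference (plain C++, not the TurboMind `cdiv` itself):

    #include <cstdio>

    int main()
    {
        const int dim = 300, group_size = 128;
        const int floor_groups = dim / group_size;                     // 2: columns 256..299 get no scale slot
        const int ceil_groups  = (dim + group_size - 1) / group_size;  // 3: trailing partial group covered
        std::printf("floor = %d, ceil = %d\n", floor_groups, ceil_groups);
        return 0;
    }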
@@ -159,17 +166,17 @@ __global__ void quant_symm_block(Tout* out, Tscale* scales, const T* src, Tscale
     __shared__ typename BlockReduce::TempStorage temp_storage;
     __shared__ T shared_inv_scale;

-    const int ti = blockIdx.x * block_size;
-    const int di = blockIdx.y * block_size;
-    const int col = threadIdx.x % threads;
     const int row = threadIdx.x / threads;
+    const int col = threadIdx.x % threads;
+    const int ti = blockIdx.x * block_size;
+    const int di = blockIdx.y * block_size + col * vec_size;

     T absmax{};
     Array<T, vec_size> xs[S]{};
     PRAGMA_UNROLL
     for (int s = 0; s < S; ++s) {
-        if (auto r = ti + s * rows + row; r < num) {
-            Ldg(xs[s], src + (int64_t)r * dim + di + col * vec_size);
+        if (auto r = ti + s * rows + row; r < num && di < dim) {
+            Ldg(xs[s], src + (int64_t)r * dim + di);
         }
         PRAGMA_UNROLL
         for (int i = 0; i < vec_size; ++i) {
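The reordered index math folds `col * vec_size` into `di`, so a single `di < dim` comparison guards both the `Ldg` here and the `Store` further down. The guard only matters on the last column of blocks when `dim` is not a multiple of `block_size`: for example, with `block_size = 128` and `vec_size = 8`, a hypothetical `dim = 200` gives the blocks at `blockIdx.y = 1` offsets `di = 128 + col * 8` up to 255, so lanes with `di >= 200` must skip the memory accesses while still contributing their zero-initialized `xs[s]` to the block-wide absmax reduction.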
@@ -193,14 +200,14 @@ __global__ void quant_symm_block(Tout* out, Tscale* scales, const T* src, Tscale
         for (int i = 0; i < vec_size; ++i) {
             ys[s][i] = Tout(static_cast<Tscale>(xs[s][i]) * inv_scale);
         }
-        if (auto r = ti + s * rows + row; r < num) {
-            Store(out + (int64_t)r * dim + di + col * vec_size, ys[s]);
+        if (auto r = ti + s * rows + row; r < num && di < dim) {
+            Store(out + (int64_t)r * dim + di, ys[s]);
         }
     }
 #endif
 }

-void QuantizeSymmBlock(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st)
+void QuantizeSymmBlock(Ref<Tensor> out_, Ref<Tensor> scale_, const Tensor& src, cudaStream_t st)
 {
     TM_CHECK(src.is_contiguous());
     TM_CHECK_EQ(src.ndim(), 2);
@@ -220,6 +227,9 @@ void QuantizeSymmBlock(Tensor& out, Tensor& scale, const Tensor& src, cudaStream
     constexpr int cta_size = 1024;
     const dim3 grid(bnum, bdim);

+    auto& out = out_.get();
+    auto& scale = scale_.get();
+
     if (!out) {
         out = Tensor_<Tout>{src.layout(), kDEVICE};
     }
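Switching the output parameters from `Tensor&` to `Ref<Tensor>` keeps the allocate-if-empty behaviour (`if (!out) { out = ...; }`) while making it explicit at the call site that the argument is an out-parameter. As a rough analogy only (assuming `Ref<T>` behaves like `std::reference_wrapper<T>`, which may not match the actual TurboMind type), the `.get()` idiom looks like this:

    #include <functional>

    struct Blob {
        bool allocated = false;
        explicit operator bool() const { return allocated; }
    };

    // Hypothetical stand-in for the out-parameter handling above.
    void fill_if_empty(std::reference_wrapper<Blob> out_)
    {
        Blob& out = out_.get();  // same .get() idiom as in the diff
        if (!out) {
            out = Blob{true};    // the allocation lands on the caller's object
        }
    }

    int main()
    {
        Blob b;
        fill_if_empty(std::ref(b));
        return b ? 0 : 1;
    }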
@@ -259,7 +269,7 @@ __global__ void dequant_symm_block(Tout* out, const T* src, const Tscale* scales
     PRAGMA_UNROLL
     for (int s = 0; s < S; ++s) {
         const auto ti = blockIdx.x * block_size + s * rows + row;
-        if (ti < num) {
+        if (ti < num && di < dim) {
             Array<T, vec_size> x;
             Ldg(x, src + (int64_t)ti * dim + di);
             Array<Tout, vec_size> y;
@@ -273,7 +283,7 @@ __global__ void dequant_symm_block(Tout* out, const T* src, const Tscale* scales
 #endif
 }

-void DequantizeSymmBlock(Tensor& out, const Tensor& src, const Tensor& scale, cudaStream_t st)
+void DequantizeSymmBlock(Ref<Tensor> out_, Ref<Tensor> src_, const Tensor& scale, cudaStream_t st)
 {
     using T = fp8_e4m3_t;
     using Tout = bfloat16_t;
@@ -282,6 +292,9 @@ void DequantizeSymmBlock(Tensor& out, const Tensor& src, const Tensor& scale, cu
     constexpr int block_size = 128;
    constexpr int vec_size = 8;

+    auto& out = out_.get();
+    auto& src = src_.get();
+
     if (!out) {
         out = Tensor_<Tout>{src.layout(), kDEVICE};
     }