Support W4A8 #3329

Status: Open. Wants to merge 1 commit into base: develop.
46 changes: 40 additions & 6 deletions custom_ops/xpu_ops/src/ops/moe_layer.cc
@@ -38,8 +38,12 @@ template <> struct fused_moe_ffn_trait<bfloat16, bfloat16> {
template <> struct fused_moe_ffn_trait<bfloat16, int8_t> {
using GEMM_TYPE = float;
};
// template <> struct fused_moe_ffn_trait<bfloat16, int4_t> {
// using GEMM_TYPE = int4_wo_int15;
// };

template <> struct fused_moe_ffn_trait<bfloat16, int4_t> {
using GEMM_TYPE = int4_wo_int8;
};

template <typename TX, typename TW>
@@ -51,6 +55,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
const paddle::optional<paddle::Tensor> &down_proj_bias,
const paddle::optional<paddle::Tensor> &up_gate_proj_weight_scale,
const paddle::optional<paddle::Tensor> &down_proj_weight_scale,
const paddle::optional<paddle::Tensor> &up_gate_proj_in_scale,
const paddle::optional<paddle::Tensor> &down_proj_in_scale, // not supported
const std::string &quant_method, const int moe_top_k,
const bool moe_group) {
@@ -69,8 +74,11 @@ std::vector<paddle::Tensor> MoeLayerKernel(
auto up_gate_proj_dims = up_gate_proj_weight.shape();
PD_CHECK(x_dims.size() == 2, "x_dims.size() should be 2.");
PD_CHECK(up_gate_proj_dims.size() == 3, "up_gate_proj_dims.size() should be 3.");
// PD_CHECK(down_proj_in_scale.get_ptr() == nullptr, "down_proj_in_scale not support.");
// std::cout << "quant_method : " << quant_method << std::endl;
// std::cout << "x_dims[1] " << x_dims[1] << std::endl;
// std::cout << "up_gate_proj_dims[2] " << up_gate_proj_dims[2] << std::endl;
if (quant_method == "weight_only_int4" || quant_method == "w4a8") {
PD_CHECK(x_dims[1] == up_gate_proj_dims[2] * 2,
"x_dims[1] should equal up_gate_proj_dims[2] * 2 (weight must be "
"[e,n,k]).");
@@ -167,8 +175,8 @@ std::vector<paddle::Tensor> MoeLayerKernel(
const_cast<float *>(down_proj_bias.get_ptr()->data<float>()),
xftblock::DataType::DT_FLOAT, down_proj_bias.get_ptr()->shape());
}
// std::cout << "[Op Debug] start init moe_ffn weight and bias" <<
// std::endl; MoeFFNWeight
// std::cout << "[Op Debug] start init moe_ffn weight and bias" << std::endl;
// MoeFFNWeight
xftblock::MoeFFNWeight moe_ffn_w_struct;
moe_ffn_w_struct.gate_weight = &xgate_w;
moe_ffn_w_struct.ffn_inter_weights = xup_gate_proj_w.get();
@@ -185,6 +193,10 @@
// std::cout << "[Op Debug] pre in xvfblock moe_ffn" << std::endl;

using XPU_TGEMM = typename fused_moe_ffn_trait<XPU_TX, XPU_TW>::GEMM_TYPE;
std::cout << "xpu_moe_layer 算子 XPU_TX " << typeid(XPU_TX).name() << std::endl;
std::cout << "xpu_moe_layer 算子 XPU_TW " << typeid(XPU_TW).name() << std::endl;
std::cout << "xpu_moe_layer 算子 XPU_TGEMM " << typeid(XPU_TGEMM).name() << std::endl;

ret = baidu::xpu::xftblock::moe_ffn_block_sorted_castte_per_token<
XPU_TX, XPU_TW, XPU_TX, XPU_TGEMM>(&xctx, &xin, &xout, moe_ffn_w_struct,
moe_ffn_param);
@@ -208,6 +220,14 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
const bool moe_group) {
const auto x_type = x.dtype();
const auto w_type = up_gate_proj_weight.dtype();
const auto gate_weight_type = gate_weight.dtype();
// const auto gate_correction_bias_type = gate_correction_bias.dtype();
const auto down_proj_weight_type = down_proj_weight.dtype();
// const auto up_gate_proj_bias_type = up_gate_proj_bias.dtype();
// const auto down_proj_bias_type = down_proj_bias.dtype();
// const auto up_gate_proj_weight_scale_type = up_gate_proj_weight_scale.dtype();
// const auto down_proj_weight_scale_type = down_proj_weight_scale.dtype();
// const auto down_proj_in_scale_type = down_proj_in_scale.dtype();

#define APPLY_MOE_LAYER_KERNEL(TX, TW) \
return MoeLayerKernel<TX, TW>( \
@@ -216,6 +236,18 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
down_proj_in_scale, quant_method, moe_top_k, moe_group);

// TODO(mayang02): how to use quant_method?
std::cout << "=============== MoeLayer ======\n";
std::cout << "x_type " << x_type << std::endl;
std::cout << "gate_weight_type " << gate_weight_type << std::endl;
// std::cout << "gate_correction_bias_type " << gate_correction_bias_type << std::endl;
std::cout << "down_proj_weight_type " << down_proj_weight_type << std::endl;
// std::cout << "up_gate_proj_bias_type " << up_gate_proj_bias_type << std::endl;
// std::cout << "down_proj_bias_type " << down_proj_bias_type << std::endl;
// std::cout << "up_gate_proj_weight_scale_type " << up_gate_proj_weight_scale_type << std::endl;
// std::cout << "down_proj_weight_scale_type " << down_proj_weight_scale_type << std::endl;
// std::cout << "down_proj_in_scale_type " << down_proj_in_scale_type << std::endl;
std::cout << "w_type " << w_type << std::endl;

if (x_type == paddle::DataType::BFLOAT16 &&
w_type == paddle::DataType::BFLOAT16) {
APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, paddle::bfloat16);
@@ -225,7 +257,9 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
} else if (x_type == paddle::DataType::BFLOAT16 &&
quant_method == "weight_only_int4") {
APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, int4_t);
} else if (x_type == paddle::DataType::BFLOAT16 &&
quant_method == "w4a8") {
APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, int4_t);
} else {
PD_THROW("MoeLayer not support x_type==%d, w_type==%d",
static_cast<int>(x_type), static_cast<int>(w_type));
return {};
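Note on the dispatch change above: both weight_only_int4 and w4a8 now instantiate the MoE kernel with int4_t weights, and the fused_moe_ffn_trait specialization picks the GEMM compute type (int4_wo_int8 after this change). Below is a rough Python sketch of the resulting selection logic; the type-name strings are labels for illustration only, and the weight_only_int8 branch is assumed from the int8_t trait rather than shown in this hunk.

def select_moe_kernel(x_type: str, w_type: str, quant_method: str):
    # Illustrative mirror of the APPLY_MOE_LAYER_KERNEL if/else chain in MoeLayer.
    if x_type == "bfloat16" and w_type == "bfloat16":
        return ("bfloat16", "bfloat16")            # unquantized path
    if x_type == "bfloat16" and quant_method == "weight_only_int8":
        return ("bfloat16", "int8_t")              # trait GEMM_TYPE = float
    if x_type == "bfloat16" and quant_method in ("weight_only_int4", "w4a8"):
        return ("bfloat16", "int4_t")              # trait GEMM_TYPE = int4_wo_int8
    raise ValueError(f"MoeLayer not support x_type={x_type}, quant_method={quant_method}")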
19 changes: 16 additions & 3 deletions custom_ops/xpu_ops/src/ops/weight_quantize_xpu.cc
@@ -28,6 +28,7 @@ WeightQuantizeKernel(const paddle::Tensor &x, const std::string &algo,
auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
int64_t k = x.shape()[0];
int64_t n = x.shape()[1];
// std::cout << "[yw debug] " << "algo " << algo << std::endl;

paddle::Tensor scale =
paddle::full({n}, 0, paddle::DataType::FLOAT32, x.place());
@@ -66,7 +67,12 @@ WeightQuantizeKernel(const paddle::Tensor &x, const std::string &algo,
{n, k / 2}, {1, 0});
PD_CHECK(ret == 0);
return {out, scale};
} else if (algo == "w4a8") {
// If the quantization type is w4a8, do nothing.
paddle::Tensor out = x;
return {out, scale};
} else {
PD_THROW("Weight quantize only supports weight_only_int8 on XPU now.");
return {};
}
@@ -77,6 +83,10 @@ std::vector<paddle::Tensor> WeightQuantize(const paddle::Tensor &x,
const int32_t arch,
const int32_t group_size) {
const auto x_type = x.dtype();
// std::cout << "WeightQuantize x_type " << x_type << std::endl;
// std::cout << "algo " << algo << std::endl;
// std::cout << "arch " << arch << std::endl;
// std::cout << "group_size " << group_size << std::endl;
#define APPLY_WEIGHT_QUANTIZE_KERNEL(TX) \
return WeightQuantizeKernel<TX>(x, algo, arch, group_size);

@@ -85,8 +95,11 @@ std::vector<paddle::Tensor> WeightQuantize(const paddle::Tensor &x,
} else if (x_type == paddle::DataType::FLOAT32) {
APPLY_WEIGHT_QUANTIZE_KERNEL(float);
} else {
PD_THROW("WeightQuantize not support x_type==%d",
static_cast<int>(x_type));
// PD_THROW("WeightQuantize not support x_type==%d",
// static_cast<int>(x_type));
APPLY_WEIGHT_QUANTIZE_KERNEL(float);
PD_THROW("WeightQuantize not support x_type==%s",
x_type);
return {};
}
}
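The effect of the new w4a8 branch above is a pass-through: the weight tensor is returned unchanged and the scale tensor keeps its zero initialization, so any real quantization must already have happened offline. A minimal Python sketch of that control flow, using NumPy as a stand-in for Paddle tensors (illustrative only, not the actual XPU op):

import numpy as np

def weight_quantize_sketch(x: np.ndarray, algo: str):
    # Mirrors the branching of WeightQuantizeKernel above; x has shape [k, n].
    k, n = x.shape
    scale = np.zeros(n, dtype=np.float32)   # paddle::full({n}, 0, FLOAT32)
    if algo in ("weight_only_int8", "weight_only_int4"):
        raise NotImplementedError("handled by the XPU quantization kernels")
    elif algo == "w4a8":
        return x, scale                      # do nothing: weight passes through untouched
    else:
        raise ValueError(f"Weight quantize does not support algo={algo}")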
5 changes: 5 additions & 0 deletions fastdeploy/engine/config.py
@@ -291,6 +291,7 @@ def postprocess(self, num_total_tokens, number_of_tasks):
calculate block num
"""
self.dec_token_num = self.enc_dec_block_num * self.block_size
print(f"self.enc_dec_block_num : {self.enc_dec_block_num}")
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
self.prefill_kvcache_block_num = int(self.total_block_num *
@@ -299,6 +300,10 @@ def postprocess(self, num_total_tokens, number_of_tasks):
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 +
self.dec_token_num) // self.block_size
print(f"length : {length}")
print(f"block_num : {block_num}")
print(f"self.dec_token_num : {self.dec_token_num}")
print(f"number_of_tasks : {number_of_tasks}")
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
llm_logger.info(
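The prints added above expose the inputs to the KV-cache sizing rule block_num = (length + block_size - 1 + dec_token_num) // block_size. A worked example with assumed values (block_size, enc_dec_block_num and the token counts below are placeholders, not numbers from this PR):

# Worked example of the block accounting in postprocess() (assumed inputs).
block_size = 64
enc_dec_block_num = 2
num_total_tokens = 8192
number_of_tasks = 4

dec_token_num = enc_dec_block_num * block_size                        # 128
length = num_total_tokens // number_of_tasks                          # 2048
block_num = (length + block_size - 1 + dec_token_num) // block_size   # 2239 // 64 = 34
total_block_num = block_num * number_of_tasks                         # 136
print(dec_token_num, length, block_num, total_block_num)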
3 changes: 3 additions & 0 deletions fastdeploy/engine/engine.py
@@ -78,8 +78,11 @@ def from_engine_args(cls, engine_args: EngineArgs):
Returns:
LLMEngine: Instance of the LLMEngine class.
"""
print(f"cls 是什么: {cls}")
print(f"engine_args 是 : {engine_args}")
# Create the engine configs.
config = engine_args.create_engine_config()
# print(f"启动 LLM Engine Config : {config}")
# Create the LLMEngine.
return cls(cfg=config)

1 change: 1 addition & 0 deletions fastdeploy/entrypoints/api_server.py
@@ -108,6 +108,7 @@ def launch_api_server(args) -> None:

def main():
"""main函数"""
print(f"./fastdeploy/entrypoints/api_server.py是启动入口")
parser = FlexibleArgumentParser()
parser.add_argument("--port", default=9904, type=int, help="port to the http server")
parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server")
2 changes: 2 additions & 0 deletions fastdeploy/entrypoints/openai/api_server.py
@@ -82,6 +82,7 @@ def load_engine():
api_server_logger.info(
f"FastDeploy LLM API server starting... {os.getpid()}")
engine_args = EngineArgs.from_cli_args(args)
print(f"engine_args : {engine_args}")
engine = LLMEngine.from_engine_args(engine_args)

if not engine.start(api_server_pid=os.getpid()):
@@ -426,4 +427,5 @@ def main():


if __name__ == "__main__":
print(f"./fastdeploy/entrypoints/openai/api_server.py是启动入口")
main()
1 change: 1 addition & 0 deletions fastdeploy/model_executor/layers/attention/attention.py
@@ -99,6 +99,7 @@ def __init__(
logger.info(
f"Attention is running in cache kv {self.kvcache_quant_method.cache_quant_config.quant_type} mode"
)
print(f"对于Attention的KV, 使用{self.kvcache_quant_method}进行量化" if self.kvcache_quant_method else "对于Attention的KV, 不进行量化")

def load_state_dict(self, state_dict: Dict[str,
paddle.Tensor | np.ndarray]):
49 changes: 49 additions & 0 deletions fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
@@ -167,7 +167,55 @@ def forward_mixed(
k_quant_scale = getattr(layer, "cache_k_scale", None)
v_quant_scale = getattr(layer, "cache_v_scale", None)


print(f"qkv.dtype : {qkv.dtype}")
print(f"qkv.shape: {qkv.shape}")
print(f"forward_meta.caches[2 * layer.layer_id].dtype : {forward_meta.caches[2 * layer.layer_id].dtype}")
print(f"forward_meta.caches[2 * layer.layer_id + 1]: {forward_meta.caches[2 * layer.layer_id + 1].dtype}")
print(f"k_quant_scale : {k_quant_scale.dtype}")
print(f"v_quant_scale : {v_quant_scale.dtype}")
# raise "this is a error"

# print(f"q dtype : {q.dtype}")
# print(f"k dtype : {k.dtype}")
# print(f"v dtype : {v.dtype}")
# if k_quant_scale is not None:
# print(f"k_quant_scale.dtype : {k_quant_scale.dtype}")
# if v_quant_scale is not None:
# print(f"v_quant_scale.dtype : {v_quant_scale.dtype}")
# print(f"forward_meta.caches[2 * layer.layer_id].dtype : {forward_meta.caches[2 * layer.layer_id].dtype}")
# print(f"forward_meta.caches[2 * layer.layer_id + 1].dtype : {forward_meta.caches[2 * layer.layer_id + 1].dtype}")
if k_quant_scale is not None:
k_quant_scale = paddle.cast(k_quant_scale, dtype='float32')

if v_quant_scale is not None:
v_quant_scale = paddle.cast(v_quant_scale, dtype='float32')


forward_meta.caches[2 * layer.layer_id] = paddle.cast(forward_meta.caches[2 * layer.layer_id], dtype='bfloat16')
forward_meta.caches[2 * layer.layer_id+1] = paddle.cast(forward_meta.caches[2 * layer.layer_id+1], dtype='bfloat16')

from fastdeploy.model_executor.ops.xpu import block_attn
# print(f"qkv.dtype : {qkv.dtype}")
# print(f"forward_meta.caches[2 * layer.layer_id].dtype : {forward_meta.caches[2 * layer.layer_id].dtype}")
# print(f"forward_meta.caches[2 * layer.layer_id + 1]: {forward_meta.caches[2 * layer.layer_id + 1].dtype}")
# print(f"forward_meta.cum_offsets.dtype: {forward_meta.cum_offsets.dtype}")
# print(f"metadata.rotary_embs : {metadata.rotary_embs.dtype}")
# print(f"metadata.rotary_embs : {metadata.rotary_embs.dtype}")

# print(f"metadata.block_tables : {metadata.block_tables.dtype}")

# print(f"k_quant_scale : {k_quant_scale.dtype}")
# print(f"v_quant_scale : {v_quant_scale.dtype}")

# print(f"forward_meta.enc_batch : {forward_meta.enc_batch.dtype}")
# print(f"forward_meta.dec_batch : {forward_meta.dec_batch.dtype}")
# print(f"forward_meta.total_enc_len : {forward_meta.total_enc_len.dtype}")
# print(f"forward_meta.encoder_seq_lod_cpu : {forward_meta.encoder_seq_lod_cpu.dtype}")
# print(f"forward_meta.encoder_batch_map_cpu: {forward_meta.encoder_batch_map_cpu.dtype}")
# print(f"forward_meta.decoder_context_len_cpu: {forward_meta.decoder_context_len_cpu.dtype}")
# print(f"forward_meta.decoder_batch_map_cpu: {forward_meta.decoder_batch_map_cpu.dtype}")

res = block_attn(
qkv,
forward_meta.caches[2 * layer.layer_id],
@@ -186,4 +234,5 @@ def forward_mixed(
forward_meta.decoder_context_len_cpu,
forward_meta.decoder_batch_map_cpu,
)
print(f"block_attention 计算完成")
return res
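The non-debug change in this hunk is a dtype normalization step before block_attn: the per-channel KV quant scales are cast to float32 and the two cache tensors for the layer are cast to bfloat16. A condensed sketch of just that preparation step, pulled out into a helper whose name is invented here for illustration:

import paddle

def normalize_block_attn_inputs(caches, layer_id, k_quant_scale=None, v_quant_scale=None):
    # Same casts as added in forward_mixed above (sketch only, not part of the PR).
    if k_quant_scale is not None:
        k_quant_scale = paddle.cast(k_quant_scale, dtype="float32")
    if v_quant_scale is not None:
        v_quant_scale = paddle.cast(v_quant_scale, dtype="float32")
    caches[2 * layer_id] = paddle.cast(caches[2 * layer_id], dtype="bfloat16")
    caches[2 * layer_id + 1] = paddle.cast(caches[2 * layer_id + 1], dtype="bfloat16")
    return caches, k_quant_scale, v_quant_scale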
@@ -41,7 +41,8 @@ def create_weights(self, layer: nn.Layer) -> None:
layer.weight_shape.reverse()
if self.quant_config.name() == "weight_only_int4":
layer.weight_shape[0] //= 2
layer.weight_dtype = "int8"
layer.weight_dtype = "int8" # not used
# print(f" layer :{ layer}")
layer.weight_scale = layer.create_parameter(
shape=weight_scale_shape,
dtype="float32",
@@ -53,8 +54,11 @@ def process_loaded_weights(self, layer: nn.Layer,
"""
Quantize loaded weights using the XPU-specific quantization path.
"""
# print(f"quant_config.algo : {self.quant_config.algo}")
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(
weight, self.quant_config.algo, -1, -1)
layer.weight.set_value(
paddle.transpose(quanted_weight_tensor, [1, 0]))
# print(f"quanted_weight_tensor.dtype: {quanted_weight_tensor.dtype}")
# print(f"weight_scale_tensor.dtype : {weight_scale_tensor.dtype}")
layer.weight_scale.set_value(weight_scale_tensor)
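Tying this back to the custom op change: when quant_config.algo is "w4a8", weight_quantize_xpu returns the tensor untouched, so process_loaded_weights ends up transposing and storing an already-quantized weight together with a zero scale. A short sketch of that load path under this assumption (the import path is assumed here, mirroring how other XPU ops are imported in this repo):

import paddle
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu  # assumed import path

def load_weights_sketch(layer, weight, algo):
    # Sketch of process_loaded_weights: quantize (or pass through for w4a8),
    # transpose to the [n, k] layout the kernel expects, then store params.
    quanted_weight, weight_scale = weight_quantize_xpu(weight, algo, -1, -1)
    layer.weight.set_value(paddle.transpose(quanted_weight, [1, 0]))
    layer.weight_scale.set_value(weight_scale)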
9 changes: 7 additions & 2 deletions fastdeploy/model_executor/layers/linear.py
@@ -96,6 +96,7 @@ def init_weight(self):
"""
if self.skip_quant:
self.weight_dtype = self._dtype
# print(f"LinearBase : self.weight_dtype : {self.weight_dtype}")
self.weight = self.create_parameter(
shape=self.weight_shape,
dtype=self.weight_dtype,
@@ -286,6 +287,8 @@ def init_weight(self):
"""
if self.skip_quant:
self.weight_dtype = self._dtype

# print(f"self.weight_dtype : {self.weight_dtype}")
self.weight = self.create_parameter(
shape=self.weight_shape,
dtype=self.weight_dtype,
@@ -458,7 +461,9 @@ def load_weight(self, state_dict: dict):
self.hidden_size,
])
weight_tensor = paddle.transpose(weight_tensor, perm=[1, 0])

print(f"self.fd_config.quant_config : {self.fd_config.quant_config}")
print(f"weight_tensor.dtype : {weight_tensor.dtype}")
print(f"self.quant_method : {self.quant_method}")
if self.fd_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
@@ -474,7 +479,7 @@ def load_state_dict(self, state_dict: dict):
# weight
assert self.weight_key is not None, 'weight_key should not be None.'
# qkv fused in disk

print(f"self.fd_config.model_config.is_quantized : {self.fd_config.model_config.is_quantized}")
if self.fd_config.model_config.is_quantized:
self.load_prequant_weight(state_dict)
else: