From b340ed94a4027f92ec36b47a0eeafa59822fb844 Mon Sep 17 00:00:00 2001 From: iosmers Date: Mon, 11 Aug 2025 12:51:28 +0000 Subject: [PATCH] update --- custom_ops/xpu_ops/src/ops/moe_layer.cc | 46 +++- .../xpu_ops/src/ops/weight_quantize_xpu.cc | 19 +- fastdeploy/engine/config.py | 5 + fastdeploy/engine/engine.py | 3 + fastdeploy/entrypoints/api_server.py | 1 + fastdeploy/entrypoints/openai/api_server.py | 2 + .../layers/attention/attention.py | 1 + .../layers/attention/xpu_attn_backend.py | 49 ++++ .../backends/xpu/quantization/weight_only.py | 6 +- fastdeploy/model_executor/layers/linear.py | 9 +- .../layers/moe/fused_moe_xpu_backend.py | 223 ++++++++++++++++++ fastdeploy/model_executor/layers/moe/moe.py | 6 + .../layers/quantization/kv_cache.py | 5 +- .../layers/quantization/mix_quant.py | 18 +- .../layers/quantization/w4a8.py | 9 +- fastdeploy/model_executor/layers/utils.py | 1 + fastdeploy/model_executor/model_loader.py | 8 + .../model_executor/models/ernie4_5_moe.py | 5 + fastdeploy/worker/worker_process.py | 13 + fastdeploy/worker/xpu_model_runner.py | 18 +- fastdeploy/worker/xpu_worker.py | 5 +- 21 files changed, 424 insertions(+), 28 deletions(-) diff --git a/custom_ops/xpu_ops/src/ops/moe_layer.cc b/custom_ops/xpu_ops/src/ops/moe_layer.cc index 70f4fac52b..1b407b88c9 100644 --- a/custom_ops/xpu_ops/src/ops/moe_layer.cc +++ b/custom_ops/xpu_ops/src/ops/moe_layer.cc @@ -38,8 +38,12 @@ template <> struct fused_moe_ffn_trait { template <> struct fused_moe_ffn_trait { using GEMM_TYPE = float; }; +// template <> struct fused_moe_ffn_trait { +// using GEMM_TYPE = int4_wo_int15; +// }; + template <> struct fused_moe_ffn_trait { - using GEMM_TYPE = int4_wo_int15; + using GEMM_TYPE = int4_wo_int8; }; template @@ -51,6 +55,7 @@ std::vector MoeLayerKernel( const paddle::optional &down_proj_bias, const paddle::optional &up_gate_proj_weight_scale, const paddle::optional &down_proj_weight_scale, + const paddle::optional &up_gate_proj_in_scale, const paddle::optional &down_proj_in_scale, // not support const std::string &quant_method, const int moe_top_k, const bool moe_group) { @@ -69,8 +74,11 @@ std::vector MoeLayerKernel( auto up_gate_proj_dims = up_gate_proj_weight.shape(); PD_CHECK(x_dims.size() == 2, "x_dims.size() shoud be 2."); PD_CHECK(up_gate_proj_dims.size() == 3, "up_gate_proj_dims.size() should be 3."); - PD_CHECK(down_proj_in_scale.get_ptr() == nullptr, "down_proj_in_scale not support."); - if (quant_method == "weight_only_int4") { + // PD_CHECK(down_proj_in_scale.get_ptr() == nullptr, "down_proj_in_scale not support."); + // std::cout << "quant_method : " << quant_method << std::endl; + // std::cout << "x_dims[1] " << x_dims[1] << std::endl; + // std::cout << "up_gate_proj_dims[2] " << up_gate_proj_dims[2] << std::endl; + if (quant_method == "weight_only_int4" || quant_method == "w4a8") { PD_CHECK(x_dims[1] == up_gate_proj_dims[2] * 2, "x_dims[1] should equal to up_gate_proj_dims[2], (weight must be " "[e,n,k])."); @@ -167,8 +175,8 @@ std::vector MoeLayerKernel( const_cast(down_proj_bias.get_ptr()->data()), xftblock::DataType::DT_FLOAT, down_proj_bias.get_ptr()->shape()); } - // std::cout << "[Op Debug] start init moe_ffn weight and bias" << - // std::endl; MoeFFNWeight + // std::cout << "[Op Debug] start init moe_ffn weight and bias" << std::endl; + // MoeFFNWeight xftblock::MoeFFNWeight moe_ffn_w_struct; moe_ffn_w_struct.gate_weight = &xgate_w; moe_ffn_w_struct.ffn_inter_weights = xup_gate_proj_w.get(); @@ -185,6 +193,10 @@ std::vector MoeLayerKernel( // std::cout << "[Op Debug] 
pre in xvfblock moe_ffn" << std::endl;
   using XPU_TGEMM = typename fused_moe_ffn_trait<XPU_TX, XPU_TW>::GEMM_TYPE;
+  std::cout << "xpu_moe_layer operator XPU_TX " << typeid(XPU_TX).name() << std::endl;
+  std::cout << "xpu_moe_layer operator XPU_TW " << typeid(XPU_TW).name() << std::endl;
+  std::cout << "xpu_moe_layer operator XPU_TGEMM " << typeid(XPU_TGEMM).name() << std::endl;
+
   ret = baidu::xpu::xftblock::moe_ffn_block_sorted_castte_per_token<
       XPU_TX, XPU_TW, XPU_TX, XPU_TGEMM>(&xctx, &xin, &xout,
                                          moe_ffn_w_struct, moe_ffn_param);
@@ -208,6 +220,14 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
          const bool moe_group) {
   const auto x_type = x.dtype();
   const auto w_type = up_gate_proj_weight.dtype();
+  const auto gate_weight_type = gate_weight.dtype();
+  // const auto gate_correction_bias_type = gate_correction_bias.dtype();
+  const auto down_proj_weight_type = down_proj_weight.dtype();
+  // const auto up_gate_proj_bias_type = up_gate_proj_bias.dtype();
+  // const auto down_proj_bias_type = down_proj_bias.dtype();
+  // const auto up_gate_proj_weight_scale_type = up_gate_proj_weight_scale.dtype();
+  // const auto down_proj_weight_scale_type = down_proj_weight_scale.dtype();
+  // const auto down_proj_in_scale_type = down_proj_in_scale.dtype();
 
 #define APPLY_MOE_LAYER_KERNEL(TX, TW)                                         \
   return MoeLayerKernel<TX, TW>(                                               \
@@ -216,6 +236,18 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
       down_proj_in_scale, quant_method, moe_top_k, moe_group);
 
   // TODO(mayang02): how to use quant_method?
+  std::cout << "=============== MoeLayer ======\n";
+  std::cout << "x_type " << x_type << std::endl;
+  std::cout << "gate_weight_type " << gate_weight_type << std::endl;
+  // std::cout << "gate_correction_bias_type " << gate_correction_bias_type << std::endl;
+  std::cout << "down_proj_weight_type " << down_proj_weight_type << std::endl;
+  // std::cout << "up_gate_proj_bias_type " << up_gate_proj_bias_type << std::endl;
+  // std::cout << "down_proj_bias_type " << down_proj_bias_type << std::endl;
+  // std::cout << "up_gate_proj_weight_scale_type " << up_gate_proj_weight_scale_type << std::endl;
+  // std::cout << "down_proj_weight_scale_type " << down_proj_weight_scale_type << std::endl;
+  // std::cout << "down_proj_in_scale_type " << down_proj_in_scale_type << std::endl;
+  std::cout << "w_type " << w_type << std::endl;
+
   if (x_type == paddle::DataType::BFLOAT16 &&
       w_type == paddle::DataType::BFLOAT16) {
     APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, paddle::bfloat16);
@@ -225,7 +257,9 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
   } else if (x_type == paddle::DataType::BFLOAT16 &&
              quant_method == "weight_only_int4") {
     APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, int4_t);
-  } else {
+  } else if (x_type == paddle::DataType::BFLOAT16 && quant_method == "w4a8") {
+    APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, int4_t);
+  } else {
     PD_THROW("MoeLayer not support x_type==%d, w_type==%d",
              static_cast<int>(x_type), static_cast<int>(w_type));
     return {};
diff --git a/custom_ops/xpu_ops/src/ops/weight_quantize_xpu.cc b/custom_ops/xpu_ops/src/ops/weight_quantize_xpu.cc
index 32a3699859..fae81cee01 100644
--- a/custom_ops/xpu_ops/src/ops/weight_quantize_xpu.cc
+++ b/custom_ops/xpu_ops/src/ops/weight_quantize_xpu.cc
@@ -28,6 +28,7 @@ WeightQuantizeKernel(const paddle::Tensor &x, const std::string &algo,
   auto xpu_ctx = static_cast(dev_ctx);
   int64_t k = x.shape()[0];
   int64_t n = x.shape()[1];
+  // std::cout << "[yw debug] " << "algo " << algo << std::endl;
   paddle::Tensor scale = paddle::full({n}, 0, paddle::DataType::FLOAT32,
                                       x.place());
 
@@ -66,7 +67,11 @@ WeightQuantizeKernel(const paddle::Tensor &x, const std::string &algo,
                                {n, k / 2}, {1, 0});
     PD_CHECK(ret == 0);
     return {out, scale};
-  } else {
+  } else if (algo == "w4a8") {
+    // For w4a8, return the weight unchanged; no quantization is done here.
+    paddle::Tensor out = x;
+    return {out, scale};
+  } else {
     PD_THROW("Weight quantize only supports weight_only_int8 on XPU now.");
     return {};
   }
@@ -77,6 +83,10 @@ std::vector WeightQuantize(const paddle::Tensor &x,
                                            const int32_t arch,
                                            const int32_t group_size) {
   const auto x_type = x.dtype();
+  // std::cout << "WeightQuantize x_type " << x_type << std::endl;
+  // std::cout << "algo " << algo << std::endl;
+  // std::cout << "arch " << arch << std::endl;
+  // std::cout << "group_size " << group_size << std::endl;
 
 #define APPLY_WEIGHT_QUANTIZE_KERNEL(TX)                                       \
   return WeightQuantizeKernel<TX>(x, algo, arch, group_size);
@@ -85,8 +95,8 @@ std::vector WeightQuantize(const paddle::Tensor &x,
   } else if (x_type == paddle::DataType::FLOAT32) {
     APPLY_WEIGHT_QUANTIZE_KERNEL(float);
   } else {
-    PD_THROW("WeightQuantize not support x_type==%d",
-             static_cast<int>(x_type));
+    PD_THROW("WeightQuantize does not support x_type==%d",
+             static_cast<int>(x_type));
     return {};
   }
 }
diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py
index 2d49aa0ce9..57510a9354 100644
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -291,6 +291,7 @@ def postprocess(self, num_total_tokens, number_of_tasks):
         calculate block num
         """
         self.dec_token_num = self.enc_dec_block_num * self.block_size
+        print(f"self.enc_dec_block_num : {self.enc_dec_block_num}")
         if self.num_gpu_blocks_override is not None:
             self.total_block_num = self.num_gpu_blocks_override
             self.prefill_kvcache_block_num = int(self.total_block_num *
@@ -299,6 +300,10 @@ def postprocess(self, num_total_tokens, number_of_tasks):
             length = num_total_tokens // number_of_tasks
             block_num = (length + self.block_size - 1 +
                          self.dec_token_num) // self.block_size
+            print(f"length : {length}")
+            print(f"block_num : {block_num}")
+            print(f"self.dec_token_num : {self.dec_token_num}")
+            print(f"number_of_tasks : {number_of_tasks}")
         self.total_block_num = block_num * number_of_tasks
         self.prefill_kvcache_block_num = self.total_block_num
         llm_logger.info(
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index 89a4f2ca4a..745a41dbf4 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -78,8 +78,11 @@ def from_engine_args(cls, engine_args: EngineArgs):
         Returns:
             LLMEngine: Instance of the LLMEngine class.
         """
+        print(f"cls is: {cls}")
+        print(f"engine_args: {engine_args}")
         # Create the engine configs.
         config = engine_args.create_engine_config()
+        # print(f"LLM Engine config at startup: {config}")
 
         # Create the LLMEngine.
         return cls(cfg=config)
diff --git a/fastdeploy/entrypoints/api_server.py b/fastdeploy/entrypoints/api_server.py
index 9c2ce35c37..73e9e20d80 100644
--- a/fastdeploy/entrypoints/api_server.py
+++ b/fastdeploy/entrypoints/api_server.py
@@ -108,6 +108,7 @@ def launch_api_server(args) -> None:
 def main():
     """main函数"""
+    print("./fastdeploy/entrypoints/api_server.py is the launch entry point")
     parser = FlexibleArgumentParser()
     parser.add_argument("--port", default=9904, type=int, help="port to the http server")
     parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server")
diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index 3a2ee5e726..ef146f678c 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -82,6 +82,7 @@ def load_engine():
     api_server_logger.info(
         f"FastDeploy LLM API server starting... {os.getpid()}")
     engine_args = EngineArgs.from_cli_args(args)
+    print(f"engine_args : {engine_args}")
     engine = LLMEngine.from_engine_args(engine_args)
 
     if not engine.start(api_server_pid=os.getpid()):
@@ -426,4 +427,5 @@ def main():
 
 if __name__ == "__main__":
+    print("./fastdeploy/entrypoints/openai/api_server.py is the launch entry point")
     main()
diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py
index 457e5d5215..81631e4d0d 100644
--- a/fastdeploy/model_executor/layers/attention/attention.py
+++ b/fastdeploy/model_executor/layers/attention/attention.py
@@ -99,6 +99,7 @@ def __init__(
             logger.info(
                 f"Attention is running in cache kv {self.kvcache_quant_method.cache_quant_config.quant_type} mode"
             )
+        print(f"Attention KV cache is quantized with {self.kvcache_quant_method}" if self.kvcache_quant_method else "Attention KV cache is not quantized")
 
     def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]):
diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
index 6c3cade149..13d309777b 100644
--- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
@@ -167,7 +167,55 @@ def forward_mixed(
         k_quant_scale = getattr(layer, "cache_k_scale", None)
         v_quant_scale = getattr(layer, "cache_v_scale", None)
+
+        print(f"qkv.dtype : {qkv.dtype}")
+        print(f"qkv.shape: {qkv.shape}")
+        print(f"forward_meta.caches[2 * layer.layer_id].dtype : {forward_meta.caches[2 * layer.layer_id].dtype}")
+        print(f"forward_meta.caches[2 * layer.layer_id + 1]: {forward_meta.caches[2 * layer.layer_id + 1].dtype}")
+        print(f"k_quant_scale : {k_quant_scale.dtype}")
+        print(f"v_quant_scale : {v_quant_scale.dtype}")
+        # raise "this is an error"
+
+        # print(f"q dtype : {q.dtype}")
+        # print(f"k dtype : {k.dtype}")
+        # print(f"v dtype : {v.dtype}")
+        # if k_quant_scale is not None:
+        #     print(f"k_quant_scale.dtype : {k_quant_scale.dtype}")
+        # if v_quant_scale is not None:
+        #     print(f"v_quant_scale.dtype : {v_quant_scale.dtype}")
+        # print(f"forward_meta.caches[2 * layer.layer_id].dtype : {forward_meta.caches[2 * layer.layer_id].dtype}")
+        # print(f"forward_meta.caches[2 * layer.layer_id + 1].dtype : {forward_meta.caches[2 * layer.layer_id + 1].dtype}")
+        if k_quant_scale is not None:
+            k_quant_scale = paddle.cast(k_quant_scale, dtype='float32')
+
+        if v_quant_scale is not None:
+            v_quant_scale = paddle.cast(v_quant_scale, dtype='float32')
+
+        forward_meta.caches[2 * layer.layer_id] = paddle.cast(forward_meta.caches[2 * layer.layer_id], dtype='bfloat16')
+        forward_meta.caches[2 * layer.layer_id+1] = paddle.cast(forward_meta.caches[2 * layer.layer_id+1], dtype='bfloat16')
+
         from fastdeploy.model_executor.ops.xpu import block_attn
+        # print(f"qkv.dtype : {qkv.dtype}")
+        # print(f"forward_meta.caches[2 * layer.layer_id].dtype : {forward_meta.caches[2 * layer.layer_id].dtype}")
+        # print(f"forward_meta.caches[2 * layer.layer_id + 1]: {forward_meta.caches[2 * layer.layer_id + 1].dtype}")
+        # print(f"forward_meta.cum_offsets.dtype: {forward_meta.cum_offsets.dtype}")
+        # print(f"metadata.rotary_embs : {metadata.rotary_embs.dtype}")
+        # print(f"metadata.rotary_embs : {metadata.rotary_embs.dtype}")
+
+        # print(f"metadata.block_tables : {metadata.block_tables.dtype}")
+
+        # print(f"k_quant_scale : {k_quant_scale.dtype}")
+        # print(f"v_quant_scale : {v_quant_scale.dtype}")
+
+        # print(f"forward_meta.enc_batch : {forward_meta.enc_batch.dtype}")
+        # print(f"forward_meta.dec_batch : {forward_meta.dec_batch.dtype}")
+        # print(f"forward_meta.total_enc_len : {forward_meta.total_enc_len.dtype}")
+        # print(f"forward_meta.encoder_seq_lod_cpu : {forward_meta.encoder_seq_lod_cpu.dtype}")
+        # print(f"forward_meta.encoder_batch_map_cpu: {forward_meta.encoder_batch_map_cpu.dtype}")
+        # print(f"forward_meta.decoder_context_len_cpu: {forward_meta.decoder_context_len_cpu.dtype}")
+        # print(f"forward_meta.decoder_batch_map_cpu: {forward_meta.decoder_batch_map_cpu.dtype}")
+
         res = block_attn(
             qkv,
             forward_meta.caches[2 * layer.layer_id],
@@ -186,4 +234,5 @@ def forward_mixed(
             forward_meta.decoder_context_len_cpu,
             forward_meta.decoder_batch_map_cpu,
         )
+        print("block_attention computation finished")
         return res
diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
index 36bd87bc0a..13923a9cf2 100644
--- a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py
@@ -41,7 +41,8 @@ def create_weights(self, layer: nn.Layer) -> None:
         layer.weight_shape.reverse()
         if self.quant_config.name() == "weight_only_int4":
             layer.weight_shape[0] //= 2
-        layer.weight_dtype = "int8"
+        layer.weight_dtype = "int8"  # not used
+        # print(f" layer :{ layer}")
         layer.weight_scale = layer.create_parameter(
             shape=weight_scale_shape,
             dtype="float32",
@@ -53,8 +54,11 @@ def process_loaded_weights(self, layer: nn.Layer,
         """
         loaded_weights using xpu special quantization
         """
+        # print(f"quant_config.algo : {self.quant_config.algo}")
         quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(
             weight, self.quant_config.algo, -1, -1)
         layer.weight.set_value(
             paddle.transpose(quanted_weight_tensor, [1, 0]))
+        # print(f"quanted_weight_tensor.dtype: {quanted_weight_tensor.dtype}")
+        # print(f"weight_scale_tensor.dtype : {weight_scale_tensor.dtype}")
         layer.weight_scale.set_value(weight_scale_tensor)
diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 324a5eed61..a159645c44 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -96,6 +96,7 @@ def init_weight(self):
         """
         if self.skip_quant:
             self.weight_dtype = self._dtype
+        # print(f"LinearBase : self.weight_dtype : {self.weight_dtype}")
         self.weight = self.create_parameter(
             shape=self.weight_shape,
             dtype=self.weight_dtype,
@@ -286,6 +287,8 @@ def init_weight(self):
         """
         if self.skip_quant:
             self.weight_dtype = self._dtype
+
+        # print(f"self.weight_dtype : {self.weight_dtype}")
         self.weight = self.create_parameter(
             shape=self.weight_shape,
             dtype=self.weight_dtype,
@@ -458,7 +461,9 @@ def load_weight(self, state_dict: dict):
                 self.hidden_size,
             ])
             weight_tensor = paddle.transpose(weight_tensor, perm=[1, 0])
-
+        print(f"self.fd_config.quant_config : {self.fd_config.quant_config}")
+        print(f"weight_tensor.dtype : {weight_tensor.dtype}")
+        print(f"self.quant_method : {self.quant_method}")
         if self.fd_config.quant_config:
             self.quant_method.process_loaded_weights(self, weight_tensor)
         else:
@@ -474,7 +479,7 @@ def load_state_dict(self, state_dict: dict):
         # weight
         assert self.weight_key is not None, 'weight_key should not be None.'
         # qkv fused in disk
-
+        print(f"self.fd_config.model_config.is_quantized : {self.fd_config.model_config.is_quantized}")
         if self.fd_config.model_config.is_quantized:
             self.load_prequant_weight(state_dict)
         else:
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py
index 6f74acdff8..d44a0184fb 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py
@@ -27,6 +27,7 @@
 
 from .fused_moe_backend_base import MoEMethodBase
+from ..utils import create_and_set_parameter, get_tensor
 
 
 class XPUMoEMethod(MoEMethodBase):
     """
@@ -111,6 +112,221 @@ def apply_ep_decode(
         """
         raise NotImplementedError
 
+class XPUW4A8MoEMethod(XPUMoEMethod):
+    """
+    XPU w4a8 MoE Method
+    """
+
+    def __init__(self, quant_config):
+        super().__init__(quant_config)
+        self.quant_config = quant_config
+        self.moe_quant_type = "w4a8"
+        self.pack_num = 2
+
+    def create_weights(self, layer: nn.Layer, state_dict):
+        """
+        Create w4a8 MoE weights for XPU from the state dict.
+        """
+        print(f"---------- loading weights for layer {layer.full_name()} ----------")
+        # for k, v in state_dict.items():
+        #     print(f"key is : {k}")
+        #     print(f"value.type is {v.dtype}")
+        #     print(f"value.shape is {v.shape}")
+
+        up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
+        self.check(layer, up_gate_proj_weights, down_proj_weights)
+        for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
+            weight_name = self.added_weight_attrs[idx]
+            weight_list = []
+            for i in range(layer.num_local_experts):
+                # print(f"self.moe_quant_type : {self.moe_quant_type}")
+                # print(f"weight_tensor[i].shape: {weight_tensor[i].shape}")
+                # print(f"weight_tensor[i].dtype: {weight_tensor[i].dtype}")
+                quant_weight, scale = weight_quantize_xpu(weight_tensor[i],
+                                                          self.moe_quant_type,
+                                                          -1, -1)
+                weight_list.append(quant_weight)
+            quanted_weight = paddle.stack(weight_list, axis=0)
+            create_and_set_parameter(layer, weight_name, quanted_weight)
+
+        self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict)
+
+    def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict,
+                                  state_dict: dict):
+        """
+        Get w4a8 weights from state dict and process them.
+        Args:
+            layer (nn.Layer): The layer to add parameters to.
+            weight_key_map (dict): The weight key map.
+            state_dict (dict): The state dict.
+ """ + + def _extract_scale_tensor(state_dict, key_template, expert_idx): + return get_tensor(state_dict.pop(key_template.format(expert_idx))) + + def _process_in_scale(name: str, in_scales: list[paddle.Tensor]): + processed_in_scale = 1 / paddle.concat(in_scales) + create_and_set_parameter(layer, name, processed_in_scale) + return processed_in_scale + + def _process_weight_scale(name: str, + weight_scales: list[paddle.Tensor], + processed_in_scale: paddle.Tensor): + processed_weight_scale = (paddle.stack(weight_scales, axis=0) / + (127 * 112) / + processed_in_scale[:, None]).cast( + dtype="float32") + + # print(f"paddle.get_default_dtype() : {paddle.get_default_dtype()}") + processed_weight_scale = (paddle.stack(weight_scales, axis=0) / + (127 * 112) / + processed_in_scale[:, None]).cast( + paddle.get_default_dtype()) + + create_and_set_parameter(layer, name, processed_weight_scale) + + # 1. Init scale containers and maps + up_gate_proj_weight_scales = [] + down_proj_weight_scales = [] + up_gate_proj_in_scales = [] + down_proj_in_scales = [] + + scale_weight_map = { + "up_gate_proj_weight_scale": up_gate_proj_weight_scales, + "down_proj_weight_scale": down_proj_weight_scales, + "up_gate_proj_in_scale": up_gate_proj_in_scales, + "down_proj_in_scale": down_proj_in_scales, + } + scale_key_map = { + "up_gate_proj_weight_scale": + weight_key_map.get("up_gate_proj_expert_weight_scale_key", None), + "down_proj_weight_scale": + weight_key_map.get("down_proj_expert_weight_scale_key", None), + "up_gate_proj_in_scale": + weight_key_map.get("up_gate_proj_expert_in_scale_key", None), + "down_proj_in_scale": + weight_key_map.get("down_proj_expert_in_scale_key", None), + } + for name, value in scale_key_map.items(): + if value is None: + raise ValueError( + f"scale {name} should not be none in w4a8 mode.") + + # 2. Extract scale tensor from state dict + for local_expert_idx in range(layer.num_local_experts): + expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts + for name, scale_key_template in scale_key_map.items(): + scale_tensor = _extract_scale_tensor(state_dict, + scale_key_template, + expert_idx) + # print(f"scale_tensor.dtype : {scale_tensor.dtype}") + scale_weight_map[name].append(scale_tensor) + + # 3. Process scale tensor and set to layer + in_scales = [] + for in_scale_name in ["up_gate_proj_in_scale", "down_proj_in_scale"]: + in_scales.append( + _process_in_scale(in_scale_name, + scale_weight_map[in_scale_name])) + + for i, weight_scale_name in enumerate( + ["up_gate_proj_weight_scale", "down_proj_weight_scale"]): + _process_weight_scale(weight_scale_name, + scale_weight_map[weight_scale_name], + in_scales[i]) + + + def apply( + self, + layer: nn.Layer, + x: paddle.Tensor, + gate_out: paddle.Tensor, + ) -> paddle.Tensor: + """ + XPU compute Fused MoE. 
+ """ + from fastdeploy.model_executor.ops.xpu import xpu_moe_layer + + print(f"layer.up_gate_proj_weight.shape : {layer.up_gate_proj_weight.shape}") + print(f"layer.down_proj_weight.shappe : {layer.down_proj_weight.shape}") + + print(f"layer.up_gate_proj_weight.transpose([2, 1]).shape : {layer.up_gate_proj_weight.transpose([0, 2, 1]).shape}") + print(f"layer.down_proj_weight.transpose([2, 1]).shape {layer.down_proj_weight.transpose([2, 1]).shape}") + print(f"layer.up_gate_proj_weight_scale.dtype : {layer.up_gate_proj_weight_scale.dtype}") + print(f"layer.down_proj_weight_scale.dtype : {layer.down_proj_weight_scale.dtype}") + + # if layer.up_gate_proj_weight_scale is not None: + # # layer.up_gate_proj_weight_scale.set_value(paddle.cast(layer.up_gate_proj_weight_scale, dtype="float32")) + # layer.up_gate_proj_weight_scale.astype("float32") + + + # if layer.down_proj_weight_scale is not None: + # # layer.down_proj_weight_scale.set_value(paddle.cast(layer.down_proj_weight_scale, dtype="float32")) + # layer.down_proj_weight_scale.astype("float32") + + if layer.up_gate_proj_weight_scale is not None: + # 先转换数据类型 + casted_tensor = paddle.cast(layer.up_gate_proj_weight_scale, dtype="float32") + + # 先删除旧参数 + if hasattr(layer, 'up_gate_proj_weight_scale'): + del layer.up_gate_proj_weight_scale + + # 创建新参数,会自动生成唯一名称 + layer.up_gate_proj_weight_scale = paddle.create_parameter( + shape=casted_tensor.shape, + dtype=casted_tensor.dtype, + default_initializer=paddle.nn.initializer.Assign(casted_tensor) + # 不指定name参数,让框架自动生成 + ) + + if layer.down_proj_weight_scale is not None: + casted_tensor = paddle.cast(layer.down_proj_weight_scale, dtype="float32") + + if hasattr(layer, 'down_proj_weight_scale'): + del layer.down_proj_weight_scale + + layer.down_proj_weight_scale = paddle.create_parameter( + shape=casted_tensor.shape, + dtype=casted_tensor.dtype, + default_initializer=paddle.nn.initializer.Assign(casted_tensor) + ) + + # print(f"x.dtype : {x.dtype}") + # print(f"layer.gate_weight: {layer.gate_weight.dtype}") + # print(f"layer.gate_correction_bias: {layer.gate_correction_bias.dtype}") + # print(f"layer.up_gate_proj_weight.dtype: {layer.up_gate_proj_weight.dtype}") + # print(f"layer.down_proj_weight.dtype: {layer.down_proj_weight.dtype}") + # print(f"up_gate_proj_weight_scale: {layer.up_gate_proj_weight_scale.dtype}") if hasattr(layer, "up_gate_proj_weight_scale") else None + # print(f"down_proj_weight_scale: {layer.down_proj_weight_scale.dtype}") if hasattr(layer, "down_proj_weight_scale") else None + + fused_moe_out = xpu_moe_layer( + x, + layer.gate_weight.transpose([1, 0]), + layer.gate_correction_bias, + layer.up_gate_proj_weight.transpose([0, 2, 1]), + layer.down_proj_weight.transpose([0, 2, 1]), + None, # up_gate_proj bias + None, # down_proj bias + (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None), + (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None), + # (layer.down_proj_in_scale + # if hasattr(layer, "down_proj_in_scale") else None), + None, + self.moe_quant_type, + layer.top_k, + False, # moe group, used in deepseek + ) + if layer.tp_size > 1: + from fastdeploy.distributed.communication_op import \ + tensor_model_parallel_all_reduce + tensor_model_parallel_all_reduce(fused_moe_out) + + return fused_moe_out + + class XPUWeightOnlyMoEMethod(QuantMethodBase): """ XPU Fused MoE Method. @@ -130,8 +346,15 @@ def create_weights(self, layer: nn.Layer, state_dict: Dict[str, Paddle cutlass create weight process. 
""" up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) + + for i in range(len(up_gate_proj_weights)): + print(f"up_gate_proj_weights[i].shape : {up_gate_proj_weights[i].shape}") + print(f"down_proj_weights[i].shape : {down_proj_weights[i].shape}") + print(f"layer.hidden_size : {layer.hidden_size}") + print(f"layer.moe_intermediate_size : {layer.moe_intermediate_size}") assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts + # 一个专家的shape assert up_gate_proj_weights[0].shape == [ layer.hidden_size, layer.moe_intermediate_size * 2 ] diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 2494f298a5..008c0d0fbf 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -162,6 +162,7 @@ def init_moe_weights(self): self.init_weight_only_scale() # up_gate_proj parameters + print(f"up_gate_proj_weight : self.weight_dtype") self.up_gate_proj_weight = self.create_parameter( shape=up_gate_proj_weight_shape, dtype=self.weight_dtype, @@ -246,10 +247,13 @@ def extract_moe_ffn_weights(self, state_dict: dict): AssertionError: If required weight keys are missing or number of weights doesn't match number of local experts. """ + print(f"self.weight_key_map : {self.weight_key_map}") up_gate_proj_expert_weight_key = self.weight_key_map.get( "up_gate_proj_expert_weight_key", None) + print(f"mlp 第一层专家权重的名字: {up_gate_proj_expert_weight_key}") down_proj_expert_weight_key = self.weight_key_map.get( "down_proj_expert_weight_key", None) + print(f"mlp 第二层专家权重的名字: {down_proj_expert_weight_key}") assert up_gate_proj_expert_weight_key is not None, "up_gate_proj_expert_weight_key should not be none." assert down_proj_expert_weight_key is not None, "down_proj_expert_weight_key should not be none." 
@@ -306,6 +310,8 @@ def load_state_dict(self, state_dict):
         if self.fd_config.model_config.is_quantized:
             self.quant_method.process_prequanted_weights(self, state_dict)
         else:
+            print(f"moe.py self.quant_method {self.quant_method}")
+            print(f"moe.py self.quant_method.create_weights {self.quant_method.create_weights}")
             self.quant_method.create_weights(self, state_dict)
 
     def forward(self, x: paddle.Tensor):
diff --git a/fastdeploy/model_executor/layers/quantization/kv_cache.py b/fastdeploy/model_executor/layers/quantization/kv_cache.py
index 54e2b8cbf2..8242e02218 100644
--- a/fastdeploy/model_executor/layers/quantization/kv_cache.py
+++ b/fastdeploy/model_executor/layers/quantization/kv_cache.py
@@ -110,13 +110,16 @@ def load_scale(self, layer: nn.Layer, state_dict):
         """
         load_scale
         """
+        print(f"Attention is loading cache_k_scale {self.cache_k_scale_name}: dtype {state_dict[self.cache_k_scale_name].dtype}, shape {state_dict[self.cache_k_scale_name].shape}")
+        print(f"Attention is loading cache_v_scale {self.cache_v_scale_name}: dtype {state_dict[self.cache_v_scale_name].dtype}, shape {state_dict[self.cache_v_scale_name].shape}")
+        print(f"paddle default dtype: {paddle.get_default_dtype()}")
         cache_k_scale_tensor = get_tensor(
             state_dict.pop(self.cache_k_scale_name)).cast(
                 paddle.get_default_dtype()).reshape_([-1])
         cache_v_scale_tensor = get_tensor(
             state_dict.pop(self.cache_v_scale_name)).cast(
                 paddle.get_default_dtype()).reshape_([-1])
-
+        print(f"self.cache_quant_config.max_bound : {self.cache_quant_config.max_bound}")
         cache_k_scale = self.cache_quant_config.max_bound / cache_k_scale_tensor
         cache_v_scale = self.cache_quant_config.max_bound / cache_v_scale_tensor
         cache_k_out_scale = cache_k_scale_tensor / self.cache_quant_config.max_bound
diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py
index 4868b346bf..cdad5d50eb 100644
--- a/fastdeploy/model_executor/layers/quantization/mix_quant.py
+++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py
@@ -51,26 +51,32 @@ def name(self) -> str:
 
     @classmethod
     def from_config(cls, config: dict) -> "MixQuantConfig":
-        return cls(config['dense_quant_type'], config['moe_quant_type'],
-                   config.get('kv_cache_quant_type', None),
-                   config.get('image_moe_quant_type', None))
+        return cls(config['dense_quant_type'], config['moe_quant_type'], config.get('kv_cache_quant_type', None), config.get('image_moe_quant_type', None))
 
     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
+        # print(f"get_quant_method called; dense_quant_type: {self.dense_quant_type}, moe_quant_type: {self.moe_quant_type}, kv_cache_quant_type: {self.kv_cache_quant_type}")
+        # print(f"get_quant_method for layer: {layer}")
         if isinstance(layer, FusedMoE):
             if layer.moe_tag == "Image":
                 return get_quantization_config(
                     self.image_moe_quant_type).from_config(
                         {}).get_quant_method(layer)
             else:
-                return get_quantization_config(
+                f = get_quantization_config(
                     self.moe_quant_type).from_config(
                         {}).get_quant_method(layer)
+                # print(f"FusedMoE quant method: {f}")
+                return f
         elif isinstance(layer, Attention):
             if self.kv_cache_quant_type is not None:
-                return (get_quantization_config("kvcache").from_config(
+                f = (get_quantization_config("kvcache").from_config(
                     self.kv_cache_quant_type).get_quant_method(layer))
+                # print(f"kvcache quant method: {f}")
+                return f
             else:
                 return None
         else:
-            return get_quantization_config(self.dense_quant_type).from_config(
+            f = get_quantization_config(self.dense_quant_type).from_config(
                 {}).get_quant_method(layer)
+            # print(f"dense layer quant method: {f}")
+            return f
diff --git a/fastdeploy/model_executor/layers/quantization/w4a8.py b/fastdeploy/model_executor/layers/quantization/w4a8.py
index f8776d6c16..1345f4a2dc 100644
--- a/fastdeploy/model_executor/layers/quantization/w4a8.py
+++ b/fastdeploy/model_executor/layers/quantization/w4a8.py
@@ -17,6 +17,7 @@
 
 from ..moe import FusedMoE
 from .quant_base import QuantConfigBase, QuantMethodBase
+from fastdeploy.platforms import current_platform
 
 
 class W4A8Config(QuantConfigBase):
@@ -36,7 +37,12 @@ def from_config(cls, config: dict) -> "W4A8Config":
 
     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
         if isinstance(layer, FusedMoE):
-            from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import CutlassW4A8MoEMethod
-            return CutlassW4A8MoEMethod(self)
+            if current_platform.is_cuda():
+                from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import CutlassW4A8MoEMethod
+                return CutlassW4A8MoEMethod(self)
+            if current_platform.is_xpu():
+                from fastdeploy.model_executor.layers.moe.fused_moe_xpu_backend import XPUW4A8MoEMethod
+                return XPUW4A8MoEMethod(self)
+            raise ValueError("w4a8 FusedMoE is only supported on CUDA and XPU platforms.")
         else:
             raise ValueError(f"Unsupported layer type {type(layer)} for w4a8")
diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py
index d635ef2854..36e00bba0c 100644
--- a/fastdeploy/model_executor/layers/utils.py
+++ b/fastdeploy/model_executor/layers/utils.py
@@ -379,4 +379,5 @@ def create_and_set_parameter(layer: nn.Layer, name: str,
             dtype=tensor.dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         ))
+    # print(f"utils create_and_set_parameter {name} : {tensor.dtype}")
     getattr(layer, name).set_value(tensor)
diff --git a/fastdeploy/model_executor/model_loader.py b/fastdeploy/model_executor/model_loader.py
index 8d1819840b..88091bbabd 100644
--- a/fastdeploy/model_executor/model_loader.py
+++ b/fastdeploy/model_executor/model_loader.py
@@ -118,6 +118,14 @@ def load_model(self, fd_config: FDConfig) -> nn.Layer:
             fd_config,
             return_numpy=True,
         )
+
+        print("Contents of the loaded state_dict:")
+        for key, value in state_dict.items():
+            print(f"key: {key}, value shape: {value.shape}, value.type: {value.dtype}")
+        print("Model weights loaded from disk into host memory; now setting XPU weights")
+        print(f"Model structure: {model}")
+
+        # print(f"About to call set_state_dict: model is {model}, method is {model.set_state_dict}")
         model.set_state_dict(state_dict)
         self.clean_memory_fragments(state_dict)
         return model
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index 3c8e0d8e5b..5d7d9c559d 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -244,6 +244,8 @@ def forward(
         forward_meta: ForwardMeta,
         hidden_states: paddle.Tensor,
     ):
+        # for param in self.qkv_proj.parameters():
+        #     print(f"Ernie4_5_Attention param {param.name}, {param.dtype}")
         qkv_out = self.qkv_proj(hidden_states)
 
         attn_out = self.attn(
@@ -395,6 +397,9 @@ def forward(
 
         residual = None
         for i in range(self.num_layers):
+            # xpu_model = self.layers[i]
+            # for param in xpu_model.parameters():
+            #     print(f"param: {param.name} : {param.dtype}")
             hidden_states, residual = self.layers[i](forward_meta,
                                                      hidden_states,
                                                      residual)
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 4aa420a5ea..42b322d0bd 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -107,6 +107,9 @@ def __init__(
         self.ranks = ranks
         self.local_rank = local_rank
         self.fd_config = fd_config
+        # print(f"fd_config.parallel_config : {fd_config.parallel_config}")
+        # for key, value in fd_config.parallel_config.__dict__.items():
+        #     print(f"key : {key}, value : {value}")
         self.parallel_config = fd_config.parallel_config
 
         # TODO(gongshaotian): Use worker factory to get worker
@@ -566,6 +569,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     device_config = DeviceConfig(vars(args))
     decoding_config = DecodingConfig(vars(args))
     speculative_config = SpeculativeConfig(vars(args))
+    # parallel_config is built from the CLI args
     parallel_config = ParallelConfig(vars(args))
     load_config = LoadConfig(vars(args))
 
@@ -626,7 +630,9 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     if quant_config_name is None:
         quant_config = None
     else:
+        print(f"=====quant_config_name ====: {quant_config_name}")
        quant_cls = get_quantization_config(quant_config_name)
+        print(f"quantization class: {quant_cls}")
         quant_config = quant_cls.from_config(quantization_config)
 
     # Log quantization info
@@ -674,6 +680,9 @@ def run_worker_proc() -> None:
 
     # Get fd_config
     fd_config = initialize_fd_config(args, ranks, local_rank)
+
+    print(f"fd_config : {fd_config}")
+    # print(f"fd_config.quant_config.get_quant_method() : {fd_config.quant_config.get_quant_method()}")
 
     # Create worker process
     worker_proc = PaddleDisWorkerProc(fd_config, ranks, local_rank)
@@ -682,12 +691,16 @@ def run_worker_proc() -> None:
     worker_proc.init_device()
 
     # Load model
+    print("======================= model loading started ======================")
     worker_proc.load_model()
+    print("======================= model loading finished ======================")
     logger.info("determine_num_available_blocks")
     worker_proc.determine_num_available_blocks()
+    print("======================= memory profiling finished ======================")
 
     # Trigger CUDAGraph capture
     worker_proc.worker.graph_optimize_and_warm_up_model()
+    print("======================= warm up finished ======================")
 
     # Initialize health status
     worker_proc.init_health_status()
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 0d3329c1d2..cfcbf3b788 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -563,8 +563,10 @@ def initialize_kv_cache(self) -> None:
         """
         cache_kvs = {}
         max_block_num = self.num_gpu_blocks
+        print(f"initialize_kv_cache max_block_num : {max_block_num}")
 
         cache_type = self.parallel_config.dtype
+        print(f"cache_type : {cache_type}")
 
         if (self.quant_config and
                 hasattr(self.quant_config, "kv_cache_quant_type")
@@ -701,10 +703,13 @@ class at the server level, which is too granular for ModelRunner.
         self._prepare_inputs()
 
         # 2. Padding inputs for cuda grph
-
-        # 3. Execute model
-        model_output = self.model(self.share_inputs["ids_remove_padding"],
-                                  self.forward_meta)
+        print("============== model parameters before execution ==============")
+        for param in self.model.parameters():
+            print(f"param name: {param.name}, shape: {param.shape}, dtype: {param.dtype}")
+            # print(f"param value: {param.numpy()}")
+            print("-" * 50)
+        # 3. Execute model
+        model_output = self.model(self.share_inputs["ids_remove_padding"], self.forward_meta)
 
         hiddden_states = xpu_process_output(model_output,
                                             self.share_inputs["cum_offsets"],
@@ -758,6 +763,7 @@ def prepare_profile(self) -> None:
         """Prepare the profile run by setting the block number and initializing the KV cache."""
         paddle.device.xpu.empty_cache()
         self.num_gpu_blocks = self.parallel_config.total_block_num
+        # print(f"self.num_gpu_blocks : {self.num_gpu_blocks}")
         self.initialize_kv_cache()
 
     def profile_run(self) -> None:
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
index bf85762c17..cc4ba2ae36 100644
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -93,7 +93,7 @@ def determine_available_memory(self) -> int:
         total_memory = xpu_get_total_global_memory(self.local_rank)
         used_memory = xpu_get_used_global_memory(self.local_rank)
         free_memory = xpu_get_free_global_memory(self.local_rank)
-
+        # The model is already loaded at this point, so used_memory already includes the weights.
         logger.info(f"Before warm up, total_memory: {total_memory}, \
             used_memory: {used_memory}, free_memory: {free_memory}")
 
@@ -104,6 +104,9 @@ def determine_available_memory(self) -> int:
         used_memory = xpu_get_used_global_memory(self.local_rank)
         available_kv_cache_memory = total_available_memory - used_memory
         model_block_memory_used = self.cal_theortical_kvcache()
+        # print(f"memory used by a single block: {model_block_memory_used}")
+        # print(f"total number of blocks: {self.parallel_config.total_block_num}")
+        # print(f"available_kv_cache_memory: {available_kv_cache_memory}")
         available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         self.model_runner.clear_block_table()
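
Reviewer note: below is a minimal, hedged sketch of how the per-expert w4a8 weight path added in this patch could be exercised on XPU. The expert count and tensor shapes are made-up illustration values, and the import path for `weight_quantize_xpu` is assumed to be the custom-op module used elsewhere in the diff (`fastdeploy.model_executor.ops.xpu`); it is not confirmed here. This is not part of the patch.

```python
# Illustration only: shapes, expert count, and the import path are assumptions,
# not values taken from this patch. Requires an XPU build of the custom ops.
import paddle
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu  # assumed location

num_experts, hidden_size, moe_intermediate_size = 4, 1024, 2048

weight_list = []
for expert_idx in range(num_experts):
    # One expert's up_gate_proj weight in bf16: [hidden_size, 2 * moe_intermediate_size].
    w = paddle.randn([hidden_size, moe_intermediate_size * 2]).astype("bfloat16")
    # With algo="w4a8" the new branch in weight_quantize_xpu.cc returns the weight
    # unchanged together with a zero-filled per-channel scale.
    quant_weight, scale = weight_quantize_xpu(w, "w4a8", -1, -1)
    weight_list.append(quant_weight)

# XPUW4A8MoEMethod.create_weights stacks the per-expert results along axis 0
# before registering them on the layer via create_and_set_parameter.
stacked = paddle.stack(weight_list, axis=0)
print(stacked.shape)  # [num_experts, hidden_size, 2 * moe_intermediate_size]
```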