From e94e3048db840b8b06da662fd2cf4d50b10f282d Mon Sep 17 00:00:00 2001 From: liu-jiaxuan Date: Mon, 9 Feb 2026 07:22:53 +0000 Subject: [PATCH] fix codes --- .../object_detection/modeling/rt_detr.py | 116 +++++++++++------- .../modeling/slanext.py | 51 +++++--- .../text_recognition/modeling/pp_ocrv5_rec.py | 82 ++++++++----- 3 files changed, 157 insertions(+), 92 deletions(-) diff --git a/paddlex/inference/models/object_detection/modeling/rt_detr.py b/paddlex/inference/models/object_detection/modeling/rt_detr.py index 9f15589214..ece6c1afa7 100644 --- a/paddlex/inference/models/object_detection/modeling/rt_detr.py +++ b/paddlex/inference/models/object_detection/modeling/rt_detr.py @@ -181,52 +181,78 @@ def __call__(self, head_out, im_shape, scale_factor, pad_shape): class RTDETRConfig(PretrainedConfig): def __init__( self, - backbone, - HybridEncoder, - RTDETRTransformer, - DINOHead, - DETRPostProcess, + arch, + return_idx, + freeze_stem_only, + freeze_at, + freeze_norm, + lr_mult_list, + hidden_dim, + use_encoder_idx, + num_encoder_layers, + el_d_model, + el_nhead, + el_dim_feedforward, + el_dropout, + el_activation, + expansion, + tf_num_queries, + tf_position_embed_type, + tf_feat_strides, + tf_num_levels, + tf_nhead, + tf_num_decoder_layers, + tf_backbone_feat_channels, + tf_dim_feedforward, + tf_dropout, + tf_activation, + tf_num_denoising, + tf_label_noise_ratio, + tf_box_noise_scale, + tf_learnt_init_query, + loss_coeff, + aux_loss, + use_vfl, + matcher_coeff, + num_top_queries, + use_focal_loss, + **kwargs, ): - if backbone["name"] == "PPHGNetV2": - self.arch = backbone["arch"] - self.return_idx = backbone["return_idx"] - self.freeze_stem_only = backbone["freeze_stem_only"] - self.freeze_at = backbone["freeze_at"] - self.freeze_norm = backbone["freeze_norm"] - self.lr_mult_list = backbone["lr_mult_list"] - else: - raise RuntimeError( - f"There is no dynamic graph implementation for backbone {backbone['name']}." 
- ) - self.hidden_dim = HybridEncoder["hidden_dim"] - self.use_encoder_idx = HybridEncoder["use_encoder_idx"] - self.num_encoder_layers = HybridEncoder["num_encoder_layers"] - self.el_d_model = HybridEncoder["encoder_layer"]["d_model"] - self.el_nhead = HybridEncoder["encoder_layer"]["nhead"] - self.el_dim_feedforward = HybridEncoder["encoder_layer"]["dim_feedforward"] - self.el_dropout = HybridEncoder["encoder_layer"]["dropout"] - self.el_activation = HybridEncoder["encoder_layer"]["activation"] - self.expansion = HybridEncoder["expansion"] - self.tf_num_queries = RTDETRTransformer["num_queries"] - self.tf_position_embed_type = RTDETRTransformer["position_embed_type"] - self.tf_feat_strides = RTDETRTransformer["feat_strides"] - self.tf_num_levels = RTDETRTransformer["num_levels"] - self.tf_nhead = RTDETRTransformer["nhead"] - self.tf_num_decoder_layers = RTDETRTransformer["num_decoder_layers"] - self.tf_backbone_feat_channels = RTDETRTransformer["backbone_feat_channels"] - self.tf_dim_feedforward = RTDETRTransformer["dim_feedforward"] - self.tf_dropout = RTDETRTransformer["dropout"] - self.tf_activation = RTDETRTransformer["activation"] - self.tf_num_denoising = RTDETRTransformer["num_denoising"] - self.tf_label_noise_ratio = RTDETRTransformer["label_noise_ratio"] - self.tf_box_noise_scale = RTDETRTransformer["box_noise_scale"] - self.tf_learnt_init_query = RTDETRTransformer["learnt_init_query"] - self.loss_coeff = DINOHead["loss"]["loss_coeff"] - self.aux_loss = DINOHead["loss"]["aux_loss"] - self.use_vfl = DINOHead["loss"]["use_vfl"] - self.matcher_coeff = DINOHead["loss"]["matcher"]["matcher_coeff"] - self.num_top_queries = DETRPostProcess["num_top_queries"] - self.use_focal_loss = DETRPostProcess["use_focal_loss"] + self.arch = arch + self.return_idx = return_idx + self.freeze_stem_only = freeze_stem_only + self.freeze_at = freeze_at + self.freeze_norm = freeze_norm + self.lr_mult_list = lr_mult_list + self.hidden_dim = hidden_dim + self.use_encoder_idx = 
use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.el_d_model = el_d_model + self.el_nhead = el_nhead + self.el_dim_feedforward = el_dim_feedforward + self.el_dropout = el_dropout + self.el_activation = el_activation + self.expansion = expansion + self.tf_num_queries = tf_num_queries + self.tf_position_embed_type = tf_position_embed_type + self.tf_feat_strides = tf_feat_strides + self.tf_num_levels = tf_num_levels + self.tf_nhead = tf_nhead + self.tf_num_decoder_layers = tf_num_decoder_layers + self.tf_backbone_feat_channels = tf_backbone_feat_channels + self.tf_dim_feedforward = tf_dim_feedforward + self.tf_dropout = tf_dropout + self.tf_activation = tf_activation + self.tf_num_denoising = tf_num_denoising + self.tf_label_noise_ratio = tf_label_noise_ratio + self.tf_box_noise_scale = tf_box_noise_scale + self.tf_learnt_init_query = tf_learnt_init_query + self.loss_coeff = loss_coeff + self.aux_loss = aux_loss + self.use_vfl = use_vfl + self.matcher_coeff = matcher_coeff + self.num_top_queries = num_top_queries + self.use_focal_loss = use_focal_loss self.tensor_parallel_degree = 1 diff --git a/paddlex/inference/models/table_structure_recognition/modeling/slanext.py b/paddlex/inference/models/table_structure_recognition/modeling/slanext.py index 884339f0fa..956a7d36c6 100644 --- a/paddlex/inference/models/table_structure_recognition/modeling/slanext.py +++ b/paddlex/inference/models/table_structure_recognition/modeling/slanext.py @@ -25,23 +25,26 @@ class SLANeXtConfig(PretrainedConfig): def __init__( self, - backbone, - SLAHead, + out_channels, + hidden_size, + max_text_length, + loc_reg_num, + image_size, + encoder_embed_dim, + encoder_depth, + encoder_num_heads, + encoder_global_attn_indexes, + **kwargs, ): - if backbone["name"] == "Vary_VIT_B": - self.image_size = backbone["image_size"] - self.encoder_embed_dim = backbone["encoder_embed_dim"] - self.encoder_depth = backbone["encoder_depth"] - self.encoder_num_heads = backbone["encoder_num_heads"] - self.encoder_global_attn_indexes = 
backbone["encoder_global_attn_indexes"] - else: - raise RuntimeError( - f"There is no dynamic graph implementation for backbone {backbone['name']}." - ) - self.out_channels = SLAHead["out_channels"] - self.hidden_size = SLAHead["hidden_size"] - self.max_text_length = SLAHead["max_text_length"] - self.loc_reg_num = SLAHead["loc_reg_num"] + self.out_channels = out_channels + self.hidden_size = hidden_size + self.max_text_length = max_text_length + self.loc_reg_num = loc_reg_num + self.image_size = image_size + self.encoder_embed_dim = encoder_embed_dim + self.encoder_depth = encoder_depth + self.encoder_num_heads = encoder_num_heads + self.encoder_global_attn_indexes = encoder_global_attn_indexes self.tensor_parallel_degree = 1 @@ -76,11 +79,25 @@ def forward(self, x): return [x["loc_preds"], x["structure_probs"]] def get_transpose_weight_keys(self): - transpose_keys = ["mlp.lin2", "attn.qkv", "mlp.lin1"] + transpose_keys = [ + "mlp.lin2", + "attn.qkv", + "mlp.lin1", + "structure_attention_cell.score", + "attn.proj", + "i2h", + "h2h", + "structure_generator.0", + "structure_generator.1", + "loc_generator.0", + "loc_generator.1", + ] need_to_transpose = [] all_weight_keys = [] for name, param in self.backbone.named_parameters(): all_weight_keys.append("backbone." + name) + for name, param in self.head.named_parameters(): + all_weight_keys.append("head." 
+ name) for i in range(len(all_weight_keys)): for j in range(len(transpose_keys)): if (transpose_keys[j] in all_weight_keys[i]) and ( diff --git a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py index 0c90123e6b..50ac8d1196 100644 --- a/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py +++ b/paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py @@ -27,34 +27,48 @@ class PPOCRV5RecConfig(PretrainedConfig): def __init__( self, - backbone, - MultiHead, + model_type, + scale: float = 0.95, + conv_kxk_num: int = 4, + lr_mult_list: list = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + lab_lr: float = 0.1, + net_config: dict | None = None, + text_rec: bool = True, + stem_channels: list = [3, 32, 48], + det: bool = False, + use_lab: bool = False, + use_last_conv: bool = True, + class_expand: int = 2048, + dropout_prob: float = 0.0, + class_num: int = 1000, + out_indices: list | None = None, + stage_config: dict | None = None, + head_list: list | None = None, + decode_list: dict | None = None, + **kwargs, ): - self.backbone_name = backbone["name"] - if self.backbone_name == "PPLCNetV3": - self.net_config = backbone["net_config"] - self.scale = backbone["scale"] - self.conv_kxk_num = backbone["conv_kxk_num"] - self.lr_mult_list = backbone["lr_mult_list"] - self.lab_lr = backbone["lab_lr"] - elif self.backbone_name == "PPHGNetV2": - self.text_rec = backbone["text_rec"] - self.stem_channels = backbone["stem_channels"] - self.stage_config = backbone["stage_config"] - self.det = backbone["det"] - self.use_lab = backbone["use_lab"] - self.use_last_conv = backbone["use_last_conv"] - self.class_expand = backbone["class_expand"] - self.dropout_prob = backbone["dropout_prob"] - self.class_num = backbone["class_num"] - self.lr_mult_list = backbone["lr_mult_list"] - self.out_indices = backbone["out_indices"] - else: - raise 
RuntimeError( - f"There is no dynamic graph implementation for backbone {backbone['name']}." - ) - self.head_list = MultiHead["head_list"] - self.decode_list = MultiHead["decode_list"] + self.model_type = model_type + if self.model_type == "pp_ocrv5_mobile_rec": + self.net_config = net_config + self.scale = scale + self.conv_kxk_num =conv_kxk_num + self.lr_mult_list = lr_mult_list + self.lab_lr = lab_lr + elif self.model_type == "pp_ocrv5_server_rec": + self.text_rec = text_rec + self.stem_channels = stem_channels + self.stage_config = stage_config + self.det = det + self.use_lab = use_lab + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.dropout_prob = dropout_prob + self.class_num = class_num + self.lr_mult_list = lr_mult_list + self.out_indices = out_indices + + self.head_list = head_list + self.decode_list = decode_list self.tensor_parallel_degree = 1 @@ -64,7 +79,7 @@ class PPOCRV5Rec(PretrainedModel): def __init__(self, config: PPOCRV5RecConfig): super().__init__(config) - if self.config.backbone_name == "PPLCNetV3": + if self.config.model_type == "pp_ocrv5_mobile_rec": self.backbone = PPLCNetV3( scale=self.config.scale, net_config=self.config.net_config, @@ -72,7 +87,7 @@ def __init__(self, config: PPOCRV5RecConfig): lr_mult_list=self.config.lr_mult_list, lab_lr=self.config.lab_lr, ) - elif self.config.backbone_name == "PPHGNetV2": + elif self.config.model_type == "pp_ocrv5_server_rec": self.backbone = PPHGNetV2( stage_config=self.config.stage_config, stem_channels=self.config.stem_channels, @@ -102,7 +117,14 @@ def forward(self, x): return [x.cpu().numpy()] def get_transpose_weight_keys(self): - transpose_keys = ["fc", "out_proj", "attn.qkv"] + transpose_keys = [ + "fc", + "out_proj", + "attn.qkv", + "mixer.qkv", + "cross_attn.kv", + "mixer.proj" + ] need_to_transpose = [] all_weight_keys = [] for name, param in self.head.named_parameters():