Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 71 additions & 45 deletions paddlex/inference/models/object_detection/modeling/rt_detr.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,52 +181,78 @@ def __call__(self, head_out, im_shape, scale_factor, pad_shape):
class RTDETRConfig(PretrainedConfig):
    """Configuration for the RT-DETR object-detection model.

    Flattens the formerly nested config sections (backbone / HybridEncoder /
    RTDETRTransformer / DINOHead / DETRPostProcess) into explicit keyword
    arguments.  Encoder-layer options carry an ``el_`` prefix and transformer
    options a ``tf_`` prefix so that names that exist in both sections
    (``nhead``, ``dim_feedforward``, ``dropout``, ``activation``) do not
    collide.

    Extra keyword arguments are accepted via ``**kwargs`` but are not stored
    here.  NOTE(review): ``super().__init__(**kwargs)`` is not called in this
    view — confirm the base ``PretrainedConfig`` does not require it.
    """

    def __init__(
        self,
        arch,
        return_idx,
        freeze_stem_only,
        freeze_at,
        freeze_norm,
        lr_mult_list,
        hidden_dim,
        use_encoder_idx,
        num_encoder_layers,
        el_d_model,
        el_nhead,
        el_dim_feedforward,
        el_dropout,
        el_activation,
        expansion,
        tf_num_queries,
        tf_position_embed_type,
        tf_feat_strides,
        tf_num_levels,
        tf_nhead,
        tf_num_decoder_layers,
        tf_backbone_feat_channels,
        tf_dim_feedforward,
        tf_dropout,
        tf_activation,
        tf_num_denoising,
        tf_label_noise_ratio,
        tf_box_noise_scale,
        tf_learnt_init_query,
        loss_coeff,
        aux_loss,
        use_vfl,
        matcher_coeff,
        num_top_queries,
        use_focal_loss,
        **kwargs,
    ):
        # Backbone (PPHGNetV2) options.
        self.arch = arch
        self.return_idx = return_idx
        self.freeze_stem_only = freeze_stem_only
        self.freeze_at = freeze_at
        self.freeze_norm = freeze_norm
        self.lr_mult_list = lr_mult_list
        # HybridEncoder options.
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        # Encoder-layer options.  BUG FIX: these previously read the
        # unprefixed names (``d_model``, ``nhead``, ``dim_feedforward``,
        # ``dropout``, ``activation``), which are not parameters of this
        # method and raised NameError; use the declared ``el_*`` parameters.
        self.el_d_model = el_d_model
        self.el_nhead = el_nhead
        self.el_dim_feedforward = el_dim_feedforward
        self.el_dropout = el_dropout
        self.el_activation = el_activation
        self.expansion = expansion
        # RTDETRTransformer options.  BUG FIX: likewise, these previously
        # read the unprefixed names (``num_queries``, ``nhead``, ...) instead
        # of the declared ``tf_*`` parameters.
        self.tf_num_queries = tf_num_queries
        self.tf_position_embed_type = tf_position_embed_type
        self.tf_feat_strides = tf_feat_strides
        self.tf_num_levels = tf_num_levels
        self.tf_nhead = tf_nhead
        self.tf_num_decoder_layers = tf_num_decoder_layers
        self.tf_backbone_feat_channels = tf_backbone_feat_channels
        self.tf_dim_feedforward = tf_dim_feedforward
        self.tf_dropout = tf_dropout
        self.tf_activation = tf_activation
        self.tf_num_denoising = tf_num_denoising
        self.tf_label_noise_ratio = tf_label_noise_ratio
        self.tf_box_noise_scale = tf_box_noise_scale
        self.tf_learnt_init_query = tf_learnt_init_query
        # DINOHead loss options.
        self.loss_coeff = loss_coeff
        self.aux_loss = aux_loss
        self.use_vfl = use_vfl
        self.matcher_coeff = matcher_coeff
        # DETRPostProcess options.
        self.num_top_queries = num_top_queries
        self.use_focal_loss = use_focal_loss
        # Single-device inference; parallelism is not configured here.
        self.tensor_parallel_degree = 1


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,26 @@
class SLANeXtConfig(PretrainedConfig):
    """Configuration for the SLANeXt table-structure recognition model.

    Flattens the formerly nested config sections (Vary_VIT_B backbone and
    SLAHead) into explicit keyword arguments.  ``encoder_*`` and
    ``image_size`` describe the ViT backbone; ``out_channels``,
    ``hidden_size``, ``max_text_length`` and ``loc_reg_num`` describe the
    structure/location head.

    Extra keyword arguments are accepted via ``**kwargs`` but are not stored
    here.  NOTE(review): ``super().__init__(**kwargs)`` is not called in this
    view — confirm the base ``PretrainedConfig`` does not require it.
    """

    def __init__(
        self,
        out_channels,
        hidden_size,
        max_text_length,
        loc_reg_num,
        image_size,
        encoder_embed_dim,
        encoder_depth,
        encoder_num_heads,
        encoder_global_attn_indexes,
        **kwargs,
    ):
        # SLAHead options.
        self.out_channels = out_channels
        self.hidden_size = hidden_size
        self.max_text_length = max_text_length
        self.loc_reg_num = loc_reg_num
        # Vary_VIT_B backbone options.
        self.image_size = image_size
        self.encoder_embed_dim = encoder_embed_dim
        self.encoder_depth = encoder_depth
        self.encoder_num_heads = encoder_num_heads
        self.encoder_global_attn_indexes = encoder_global_attn_indexes
        # Single-device inference; parallelism is not configured here.
        self.tensor_parallel_degree = 1


Expand Down Expand Up @@ -76,11 +79,25 @@ def forward(self, x):
return [x["loc_preds"], x["structure_probs"]]

def get_transpose_weight_keys(self):
transpose_keys = ["mlp.lin2", "attn.qkv", "mlp.lin1"]
transpose_keys = [
"mlp.lin2",
"attn.qkv",
"mlp.lin1",
"structure_attention_cell.score",
"attn.proj",
"i2h",
"h2h",
"structure_generator.0",
"structure_generator.1",
"loc_generator.0",
"loc_generator.1",
]
need_to_transpose = []
all_weight_keys = []
for name, param in self.backbone.named_parameters():
all_weight_keys.append("backbone." + name)
for name, param in self.head.named_parameters():
all_weight_keys.append("head." + name)
for i in range(len(all_weight_keys)):
for j in range(len(transpose_keys)):
if (transpose_keys[j] in all_weight_keys[i]) and (
Expand Down
82 changes: 52 additions & 30 deletions paddlex/inference/models/text_recognition/modeling/pp_ocrv5_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,34 +27,49 @@
class PPOCRV5RecConfig(PretrainedConfig):
    """Configuration for the PP-OCRv5 text-recognition model.

    ``model_type`` selects the backbone family and which option group is
    stored:

    - ``"pp_ocrv5_mobile_rec"`` — PPLCNetV3 backbone
      (``net_config``, ``scale``, ``conv_kxk_num``, ``lr_mult_list``,
      ``lab_lr``);
    - ``"pp_ocrv5_server_rec"`` — PPHGNetV2 backbone
      (``text_rec``, ``stem_channels``, ``stage_config``, ``det``,
      ``use_lab``, ``use_last_conv``, ``class_expand``, ``dropout_prob``,
      ``class_num``, ``lr_mult_list``, ``out_indices``).

    ``head_list``/``decode_list`` configure the MultiHead in either case.

    BUG FIX: ``lr_mult_list`` was declared twice in the signature (a
    ``SyntaxError``); it is now a single parameter whose default is resolved
    per backbone branch (6 entries for the mobile branch, 5 for the server
    branch, matching the two original defaults).  Mutable list defaults are
    replaced by ``None`` sentinels so instances do not share state.
    """

    def __init__(
        self,
        model_type,
        scale: float = 0.95,
        conv_kxk_num: int = 4,
        lr_mult_list: list | None = None,
        lab_lr: float = 0.1,
        net_config: dict | None = None,
        text_rec: bool = True,
        stem_channels: list | None = None,
        det: bool = False,
        use_lab: bool = False,
        use_last_conv: bool = True,
        class_expand: int = 2048,
        dropout_prob: float = 0.0,
        class_num: int = 1000,
        out_indices: list | None = None,
        stage_config: dict | None = None,
        head_list: list | None = None,
        decode_list: dict | None = None,
        **kwargs,
    ):
        self.model_type = model_type
        if self.model_type == "pp_ocrv5_mobile_rec":
            # PPLCNetV3 backbone options.
            self.net_config = net_config
            self.scale = scale
            self.conv_kxk_num = conv_kxk_num
            # PPLCNetV3 takes a 6-stage learning-rate multiplier list.
            self.lr_mult_list = (
                [1.0] * 6 if lr_mult_list is None else lr_mult_list
            )
            self.lab_lr = lab_lr
        elif self.model_type == "pp_ocrv5_server_rec":
            # PPHGNetV2 backbone options.
            self.text_rec = text_rec
            self.stem_channels = (
                [3, 32, 48] if stem_channels is None else stem_channels
            )
            self.stage_config = stage_config
            self.det = det
            self.use_lab = use_lab
            self.use_last_conv = use_last_conv
            self.class_expand = class_expand
            self.dropout_prob = dropout_prob
            self.class_num = class_num
            # PPHGNetV2 takes a 5-stage learning-rate multiplier list.
            self.lr_mult_list = (
                [1.0] * 5 if lr_mult_list is None else lr_mult_list
            )
            self.out_indices = out_indices

        # MultiHead options (shared by both backbone branches).
        self.head_list = head_list
        self.decode_list = decode_list
        # Single-device inference; parallelism is not configured here.
        self.tensor_parallel_degree = 1


Expand All @@ -64,15 +79,15 @@ class PPOCRV5Rec(PretrainedModel):

def __init__(self, config: PPOCRV5RecConfig):
super().__init__(config)
if self.config.backbone_name == "PPLCNetV3":
if self.config.model_type == "pp_ocrv5_mobile_rec":
self.backbone = PPLCNetV3(
scale=self.config.scale,
net_config=self.config.net_config,
conv_kxk_num=self.config.conv_kxk_num,
lr_mult_list=self.config.lr_mult_list,
lab_lr=self.config.lab_lr,
)
elif self.config.backbone_name == "PPHGNetV2":
elif self.config.model_type == "pp_ocrv5_server_rec":
self.backbone = PPHGNetV2(
stage_config=self.config.stage_config,
stem_channels=self.config.stem_channels,
Expand Down Expand Up @@ -102,7 +117,14 @@ def forward(self, x):
return [x.cpu().numpy()]

def get_transpose_weight_keys(self):
transpose_keys = ["fc", "out_proj", "attn.qkv"]
transpose_keys = [
"fc",
"out_proj",
"attn.qkv",
"mixer.qkv",
"cross_attn.kv",
"mixer.proj"
]
need_to_transpose = []
all_weight_keys = []
for name, param in self.head.named_parameters():
Expand Down
Loading