Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions configs/rec/crnn/crnn_resnet34_server.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
system:
mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
distribute: True
amp_level: 'O3'
seed: 42
log_interval: 100
val_while_train: True
drop_overflow_update: False

common:
character_dict_path: &character_dict_path mindocr/utils/dict/en_dict.txt
num_classes: &num_classes 96 # num_chars_in_dict+1, TODO: retrieve it from dict or check correctness
max_text_len: &max_text_len 24
infer_mode: &infer_mode False
use_space_char: &use_space_char True
lower: &lower False
batch_size: &batch_size 64

model:
type: rec
transform: null
backbone:
name: rec_resnet34
pretrained: False
neck:
name: RNNEncoder
hidden_size: 256
head:
name: CTCHead
weight_init: crnn_customised
bias_init: crnn_customised
out_channels: *num_classes

postprocess:
name: RecCTCLabelDecode
character_dict_path: *character_dict_path
use_space_char: *use_space_char

metric:
name: RecMetric
main_indicator: acc
character_dict_path: *character_dict_path
ignore_space: True
print_flag: False

loss:
name: CTCLoss
pred_seq_len: 25 # TODO: retrieve from the network output shape.
max_label_len: *max_text_len # this value should be smaller than pred_seq_len
batch_size: *batch_size

scheduler:
scheduler: warmup_cosine_decay
min_lr: 0.000001
lr: 0.001
num_epochs: 30
warmup_epochs: 2
decay_epochs: 28

optimizer:
opt: adamw
filter_bias_and_bn: True
momentum: 0.95
weight_decay: 0.0001
nesterov: False

loss_scaler:
type: dynamic
loss_scale: 512
scale_factor: 2.0
scale_window: 1000

train:
ckpt_save_dir: './crnn_resnet34_server'
pred_cast_fp32: False # let CTCLoss cast internally
ema: True # added
dataset_sink_mode: False
dataset:
type: LMDBDataset
dataset_root: /path/to/data_lmdb_release/
data_dir: training/
# label_file: # not required when using LMDBDataset
sample_ratio: 1.0
shuffle: True
transform_pipeline:
- DecodeImage:
img_mode: RGB # changed
to_float32: False
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
use_space_char: *use_space_char
lower: *lower
- RecResizeNormImg:
image_shape: [32, 100] # H, W
infer_mode: *infer_mode
character_dict_path: *character_dict_path
padding: True # aspect ratio will be preserved if true. changed
norm_before_pad: True # changed
- ToCHWImage:
# the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']
net_input_column_index: [0] # input indices for network forward func in output_columns
label_column_index: [1] # input indices marked as label
#keys_for_loss: 4 # num labels for loss func

loader:
shuffle: True
batch_size: *batch_size
drop_remainder: True
max_rowsize: 12
num_workers: 8

eval:
ckpt_load_path: ./crnn_resnet34_server/best.ckpt
dataset_sink_mode: False
dataset:
type: LMDBDataset
dataset_root: /path/to/data_lmdb_release/
data_dir: validation/
# label_file: # not required when using LMDBDataset
sample_ratio: 1.0
shuffle: False
transform_pipeline:
- DecodeImage:
img_mode: RGB # changed
to_float32: False
- RecCTCLabelEncode:
max_text_len: *max_text_len
character_dict_path: *character_dict_path
use_space_char: *use_space_char
lower: *lower
- RecResizeNormImg:
image_shape: [32, 100] # H, W
infer_mode: *infer_mode
character_dict_path: *character_dict_path
padding: True # aspect ratio will be preserved if true. changed
norm_before_pad: True # changed
- ToCHWImage:
# the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
output_columns: ['image', 'text_padded', 'text_length'] # TODO return text string padding w/ fixed length, and a scalar to indicate the length
net_input_column_index: [0] # input indices for network forward func in output_columns
label_column_index: [1, 2] # input indices marked as label

loader:
shuffle: False # TODO: tbc
batch_size: 64
drop_remainder: False
max_rowsize: 12
num_workers: 8
147 changes: 99 additions & 48 deletions mindocr/data/transforms/rec_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"RecCTCLabelEncode",
"RecAttnLabelEncode",
"RecResizeImg",
"RecResizeNormImg",
"RecResizeNormForInfer",
"SVTRRecResizeImg",
"Rotate90IfVertical",
Expand Down Expand Up @@ -247,7 +248,13 @@ def str2idx(text: str, label_dict: Dict[str, int], max_text_len: int = 23, lower


# TODO: reorganize the code for different resize transformation in rec task
def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINEAR):
def resize_norm_img(img,
image_shape,
padding=True,
norm_before_pad=False,
mean=[127.0, 127.0, 127.0],
std=[127.0, 127.0, 127.0],
interpolation=cv2.INTER_LINEAR):
"""
resize image
Args:
Expand All @@ -261,7 +268,8 @@ def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINE
w = img.shape[1]
c = img.shape[2]
if not padding:
resized_image = cv2.resize(img, (imgW, imgH), interpolation=interpolation)
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=interpolation)
resized_w = imgW
else:
ratio = w / float(h)
Expand All @@ -271,81 +279,126 @@ def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINE
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))

"""
resized_image = resized_image.astype('float32')
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
"""
padding_im = np.zeros((imgH, imgW, c), dtype=np.uint8)
padding_im[:, 0:resized_w, :] = resized_image
valid_ratio = min(1.0, float(resized_w / imgW))
return padding_im, valid_ratio

if padding:
if norm_before_pad:
resized_image = (resized_image - mean) / std

padded_img = np.zeros((imgH, imgW, c), dtype=resized_image.dtype)
padded_img[:, 0:resized_w, :] = resized_image

if not norm_before_pad:
padded_img = (padded_img - mean) / std

return padded_img, valid_ratio
else:
resized_image = (resized_image - mean) / std
return resized_image, valid_ratio


# TODO: check diff from resize_norm_img
def resize_norm_img_chinese(img,
                            image_shape,
                            norm_before_pad=False,
                            mean=(127.0, 127.0, 127.0),
                            std=(127.0, 127.0, 127.0),
                            interpolation=cv2.INTER_LINEAR):
    """Resize an image to a fixed height keeping its aspect ratio, zero-pad the width, and normalize.

    Args:
        img: image array of shape (H, W, C).
        image_shape: target size after resize and padding, given as (H, W).
        norm_before_pad: if True, normalize the resized image before it is
            copied into the zero canvas, so the padded area stays exactly zero.
            Otherwise the whole padded canvas (padding included) is normalized.
        mean: per-channel value subtracted from the pixels.
        std: per-channel value the mean-subtracted pixels are divided by.
        interpolation: OpenCV interpolation flag used for the resize.

    Returns:
        A tuple ``(padded_img, valid_ratio)``: the normalized image of shape
        (target H, target W, C), and the fraction of the target width occupied
        by the resized image (capped at 1.0).
    """
    imgH, imgW = image_shape
    # todo: change to 0 and modified image shape
    max_wh_ratio = imgW * 1.0 / imgH
    h, w = img.shape[0], img.shape[1]
    c = img.shape[2]
    ratio = w * 1.0 / h

    # The former `min(max(max_wh_ratio, ratio), max_wh_ratio)` step always
    # evaluated to max_wh_ratio, so it has been dropped.
    imgW = int(imgH * max_wh_ratio)

    # Keep the aspect ratio, but never exceed the target width.
    if math.ceil(imgH * ratio) > imgW:
        resized_w = imgW
    else:
        resized_w = int(math.ceil(imgH * ratio))
    resized_image = cv2.resize(img, (resized_w, imgH), interpolation=interpolation)
    valid_ratio = min(1.0, float(resized_w / imgW))

    if norm_before_pad:
        resized_image = (resized_image - mean) / std

    # The canvas takes the dtype of the (possibly already normalized) image.
    padded_img = np.zeros((imgH, imgW, c), dtype=resized_image.dtype)
    padded_img[:, 0:resized_w, :] = resized_image

    if not norm_before_pad:
        padded_img = (padded_img - mean) / std

    return padded_img, valid_ratio


class RecResizeNormImg(object):
    """Resize and normalize an image, padding it if needed (adopted from paddle).

    Args:
        image_shape: target image shape handed to the resize helpers.
        infer_mode: when True and ``character_dict_path`` is not None, the
            aspect-ratio-keeping helper ``resize_norm_img_chinese`` is used.
        character_dict_path: only checked for being set (see ``infer_mode``).
        padding: forwarded to ``resize_norm_img``; not used on the
            ``resize_norm_img_chinese`` path.
        norm_before_pad: If True, perform normalization before padding (by
            doing so, the padding values will be all zero. Good practice.).
            Otherwise, perform normalization after padding. Default: False
        mean: per-channel mean used for normalization.
        std: per-channel std used for normalization.
        **kwargs: accepted and ignored.
    """

    def __init__(self,
                 image_shape,
                 infer_mode=False,
                 character_dict_path=None,
                 padding=True,
                 norm_before_pad=False,
                 mean=(127.0, 127.0, 127.0),
                 std=(127.0, 127.0, 127.0),
                 **kwargs):
        self.image_shape = image_shape
        self.infer_mode = infer_mode
        self.character_dict_path = character_dict_path
        self.padding = padding
        self.norm_before_pad = norm_before_pad
        self.mean = np.array(mean, dtype="float32")
        self.std = np.array(std, dtype="float32")

    def __call__(self, data):
        """Replace ``data['image']`` with the resized/normalized image and add ``data['valid_ratio']``.

        Args:
            data: dict holding the input image under the key ``'image'``.

        Returns:
            The same dict, updated in place.
        """
        img = data['image']
        # Each branch calls its helper exactly once, always with the
        # normalization settings of this transform.
        if self.infer_mode and self.character_dict_path is not None:
            norm_img, valid_ratio = resize_norm_img_chinese(
                img,
                self.image_shape,
                self.norm_before_pad,
                self.mean,
                self.std,
            )
        else:
            norm_img, valid_ratio = resize_norm_img(
                img,
                self.image_shape,
                self.padding,
                self.norm_before_pad,
                self.mean,
                self.std,
            )
        data['image'] = norm_img
        data['valid_ratio'] = valid_ratio
        return data


# TODO: remove infer_mode and character_dict_path if they are not necessary
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

那个character_dict_pathinfer_mode 要不顺便清了,这两个argument很ambiguous而且容易忘记

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

好,我看下

那个character_dict_pathinfer_mode 要不顺便清了,这两个argument很ambiguous而且容易忘记

class RecResizeImg(RecResizeNormImg):
    """Backward-compatible alias of RecResizeNormImg for code that still uses RecResizeImg.

    It configures the parent with mean 0 and std 1, so pixel values are not
    rescaled by this op.

    TODO: replace RecResizeImg followed by NormalizeImage in yaml files with RecResizeNormImg op.

    Args:
        image_shape: target image shape, forwarded to RecResizeNormImg.
        infer_mode: forwarded to RecResizeNormImg.
        character_dict_path: forwarded to RecResizeNormImg.
        padding: forwarded to RecResizeNormImg.
        **kwargs: accepted and ignored.
    """

    def __init__(self, image_shape, infer_mode=False, character_dict_path=None, padding=True, **kwargs):
        # `super()` must be called to obtain the bound proxy; `super.__init__`
        # raised TypeError. The keyword is spelled `norm_before_pad` so it reaches
        # the parent's parameter instead of being swallowed by its **kwargs.
        super().__init__(
            image_shape, infer_mode, character_dict_path, padding, norm_before_pad=False,
            mean=[0., 0., 0.], std=[1., 1., 1.],
        )


class SVTRRecResizeImg(object):
def __init__(self, image_shape, padding=True, **kwargs):
self.image_shape = image_shape
Expand Down Expand Up @@ -425,9 +478,7 @@ def __call__(self, data):

# TODO: norm before padding

data["shape_list"] = np.array(
[h, w, resize_h / h, resize_w / w], dtype=np.float32
) # TODO: reformat, currently align to det
data['shape_list'] = [h, w, resize_h / h, resize_w / w] # TODO: reformat, currently align to det
if self.norm_before_pad:
resized_img = self.norm(resized_img)

Expand Down