mindspore-lab · hqkate · Jun 12, 2023 · Jun 12, 2023 · Jun 15, 2023 · Jun 16, 2023
diff --git a/configs/rec/crnn/crnn_resnet34_server.yaml b/configs/rec/crnn/crnn_resnet34_server.yaml
@@ -0,0 +1,150 @@
+system:
+  mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
+  distribute: True
+  amp_level: 'O3'
+  seed: 42
+  log_interval: 100
+  val_while_train: True
+  drop_overflow_update: False
+
+common:
+  character_dict_path: &character_dict_path  mindocr/utils/dict/en_dict.txt
+  num_classes: &num_classes 96 # num_chars_in_dict+1,  TODO: retrieve it from dict or check correctness
+  max_text_len: &max_text_len 24
+  infer_mode: &infer_mode False
+  use_space_char: &use_space_char True
+  lower: &lower False
+  batch_size: &batch_size 64
+
+model:
+  type: rec
+  transform: null
+  backbone:
+    name: rec_resnet34
+    pretrained: False
+  neck:
+    name: RNNEncoder
+    hidden_size: 256
+  head:
+    name: CTCHead
+    weight_init: crnn_customised
+    bias_init: crnn_customised
+    out_channels: *num_classes
+
+postprocess:
+  name: RecCTCLabelDecode
+  character_dict_path: *character_dict_path
+  use_space_char: *use_space_char
+
+metric:
+  name: RecMetric
+  main_indicator: acc
+  character_dict_path: *character_dict_path
+  ignore_space: True
+  print_flag: False
+
+loss:
+  name: CTCLoss
+  pred_seq_len: 25 # TODO: retrieve from the network output shape.
+  max_label_len: *max_text_len  # this value should be smaller than pred_seq_len
+  batch_size: *batch_size
+
+scheduler:
+  scheduler: warmup_cosine_decay
+  min_lr: 0.000001
+  lr: 0.001
+  num_epochs: 30
+  warmup_epochs: 2
+  decay_epochs: 28
+
+optimizer:
+  opt: adamw
+  filter_bias_and_bn: True
+  momentum: 0.95
+  weight_decay: 0.0001
+  nesterov: False
+
+loss_scaler:
+  type: dynamic
+  loss_scale: 512
+  scale_factor: 2.0
+  scale_window: 1000
+
+train:
+  ckpt_save_dir: './crnn_resnet34_server'
+  pred_cast_fp32: False # let CTCLoss cast internally
+  ema: True # added
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    dataset_root: /path/to/data_lmdb_release/
+    data_dir: training/
+    # label_file: # not required when using LMDBDataset
+    sample_ratio: 1.0
+    shuffle: True
+    transform_pipeline:
+      - DecodeImage:
+          img_mode: RGB # changed
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+          lower: *lower
+      - RecResizeNormImg:
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true. changed
+          norm_before_pad: True # changed
+      - ToCHWImage:
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
+    output_columns: ['image', 'text_seq'] #, 'length'] #'img_path']
+    net_input_column_index: [0] # input indices for network forward func in output_columns
+    label_column_index: [1] # input indices marked as label
+    #keys_for_loss: 4 # num labels for loss func
+
+  loader:
+      shuffle: True
+      batch_size: *batch_size
+      drop_remainder: True
+      max_rowsize: 12
+      num_workers: 8
+
+eval:
+  ckpt_load_path: ./crnn_resnet34_server/best.ckpt
+  dataset_sink_mode: False
+  dataset:
+    type: LMDBDataset
+    dataset_root: /path/to/data_lmdb_release/
+    data_dir: validation/
+    # label_file: # not required when using LMDBDataset
+    sample_ratio: 1.0
+    shuffle: False
+    transform_pipeline:
+      - DecodeImage:
+          img_mode: RGB # changed
+          to_float32: False
+      - RecCTCLabelEncode:
+          max_text_len: *max_text_len
+          character_dict_path: *character_dict_path
+          use_space_char: *use_space_char
+          lower: *lower
+      - RecResizeNormImg:
+          image_shape: [32, 100] # H, W
+          infer_mode: *infer_mode
+          character_dict_path: *character_dict_path
+          padding: True # aspect ratio will be preserved if true. changed
+          norm_before_pad: True # changed
+      - ToCHWImage:
+    #  the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
+    output_columns: ['image', 'text_padded', 'text_length']  # TODO return text string padding w/ fixed length, and a scalar to indicate the length
+    net_input_column_index: [0] # input indices for network forward func in output_columns
+    label_column_index: [1, 2] # input indices marked as label
+
+  loader:
+      shuffle: False # TODO: tbc
+      batch_size: 64
+      drop_remainder: False
+      max_rowsize: 12
+      num_workers: 8
diff --git a/mindocr/data/transforms/rec_transforms.py b/mindocr/data/transforms/rec_transforms.py
@@ -11,6 +11,7 @@
     "RecCTCLabelEncode",
     "RecAttnLabelEncode",
     "RecResizeImg",
+    "RecResizeNormImg",
     "RecResizeNormForInfer",
     "SVTRRecResizeImg",
     "Rotate90IfVertical",
@@ -247,7 +248,13 @@ def str2idx(text: str, label_dict: Dict[str, int], max_text_len: int = 23, lower
 
 
 # TODO: reorganize the code for different resize transformation in rec task
-def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINEAR):
+def resize_norm_img(img,
+                    image_shape,
+                    padding=True,
+                    norm_before_pad=False,
+                    mean=[127.0, 127.0, 127.0],
+                    std=[127.0, 127.0, 127.0],
+                    interpolation=cv2.INTER_LINEAR):
     """
     resize image
     Args:
@@ -261,7 +268,8 @@ def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINE
     w = img.shape[1]
     c = img.shape[2]
     if not padding:
-        resized_image = cv2.resize(img, (imgW, imgH), interpolation=interpolation)
+        resized_image = cv2.resize(
+            img, (imgW, imgH), interpolation=interpolation)
         resized_w = imgW
     else:
         ratio = w / float(h)
@@ -271,81 +279,126 @@ def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINE
             resized_w = int(math.ceil(imgH * ratio))
         resized_image = cv2.resize(img, (resized_w, imgH))
 
-    """
-    resized_image = resized_image.astype('float32')
-    if image_shape[0] == 1:
-        resized_image = resized_image / 255
-        resized_image = resized_image[np.newaxis, :]
-    else:
-        resized_image = resized_image.transpose((2, 0, 1)) / 255
-    resized_image -= 0.5
-    resized_image /= 0.5
-    """
-    padding_im = np.zeros((imgH, imgW, c), dtype=np.uint8)
-    padding_im[:, 0:resized_w, :] = resized_image
     valid_ratio = min(1.0, float(resized_w / imgW))
-    return padding_im, valid_ratio
+
+    if padding:
+        if norm_before_pad:
+            resized_image = (resized_image - mean) / std
+
+        padded_img = np.zeros((imgH, imgW, c), dtype=resized_image.dtype)
+        padded_img[:, 0:resized_w, :] = resized_image
+
+        if not norm_before_pad:
+            padded_img = (padded_img - mean) / std
+
+        return padded_img, valid_ratio
+    else:
+        resized_image = (resized_image - mean) / std
+        return resized_image, valid_ratio
 
 
 # TODO: check diff from resize_norm_img
-def resize_norm_img_chinese(img, image_shape):
-    """adopted from paddle"""
+def resize_norm_img_chinese(img,
+                            image_shape,
+                            norm_before_pad=False,
+                            mean=[127.0, 127.0, 127.0],
+                            std=[127.0, 127.0, 127.0],
+                            interpolation=cv2.INTER_LINEAR):
+    '''
+    Resize image to the target height with aspect-ratio keeping, normalize as (img - mean) / std, and zero-pad on the right to the target width. Returns (padded_img, valid_ratio).
+    Args:
+        img: shape (H, W, C)
+        image_shape: image shape after resize, in (H, W)
+        norm_before_pad: normalize before padding if True, otherwise after padding. interpolation: currently unused (cv2.resize below is called without it).
+    '''
     imgH, imgW = image_shape
     # todo: change to 0 and modified image shape
     max_wh_ratio = imgW * 1.0 / imgH
     h, w = img.shape[0], img.shape[1]
     c = img.shape[2]
     ratio = w * 1.0 / h
-
+    max_wh_ratio = min(max(max_wh_ratio, ratio), max_wh_ratio)  # NOTE(review): min(max(a, b), a) == a, so max_wh_ratio is left unchanged here - confirm intent
     imgW = int(imgH * max_wh_ratio)
     if math.ceil(imgH * ratio) > imgW:
         resized_w = imgW
     else:
         resized_w = int(math.ceil(imgH * ratio))
     resized_image = cv2.resize(img, (resized_w, imgH))
 
-    """
-    resized_image = resized_image.astype('float32')
-    if image_shape[0] == 1:
-        resized_image = resized_image / 255
-        resized_image = resized_image[np.newaxis, :]
-    else:
-        resized_image = resized_image.transpose((2, 0, 1)) / 255
-    resized_image -= 0.5
-    resized_image /= 0.5
-    """
-    # padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
-    padding_im = np.zeros((imgH, imgW, c), dtype=np.uint8)
-    # padding_im[:, :, 0:resized_w] = resized_image
-    padding_im[:, 0:resized_w, :] = resized_image
     valid_ratio = min(1.0, float(resized_w / imgW))
-    return padding_im, valid_ratio
 
+    if norm_before_pad:
+        resized_image = (resized_image - mean) / std
 
-# TODO: remove infer_mode and character_dict_path if they are not necesary
-class RecResizeImg(object):
-    """adopted from paddle
-    resize, convert from hwc to chw, rescale pixel value to -1 to 1
-    """
+    padded_img = np.zeros((imgH, imgW, c), dtype=resized_image.dtype)
+    padded_img[:, 0:resized_w, :] = resized_image
 
-    def __init__(self, image_shape, infer_mode=False, character_dict_path=None, padding=True, **kwargs):
+    if not norm_before_pad:
+        padded_img = (padded_img - mean) / std
+
+    return padded_img, valid_ratio
+
+
+class RecResizeNormImg(object):
+    ''' adopted from paddle
+    Resize and normalize image, and pad image if needed. Writes the result to data['image'] and the width ratio to data['valid_ratio'].
+
+    Args:
+        norm_before_pad: If True, perform normalization before padding (by doing so, the padding values will be all zero. Good practice.). Otherwise, perform normalization after padding. Default: False
+    '''
+    def __init__(self,
+                 image_shape,
+                 infer_mode=False,
+                 character_dict_path=None,
+                 padding=True,
+                 norm_before_pad=False,
+                 mean=[127.0, 127.0, 127.0],
+                 std=[127.0, 127.0, 127.0],
+                 **kwargs):
         self.image_shape = image_shape
         self.infer_mode = infer_mode
         self.character_dict_path = character_dict_path
         self.padding = padding
+        self.norm_before_pad = norm_before_pad
+        self.mean = np.array(mean, dtype="float32")
+        self.std = np.array(std, dtype="float32")
 
     def __call__(self, data):
-        img = data["image"]
+        img = data['image']
         if self.infer_mode and self.character_dict_path is not None:
-            norm_img, valid_ratio = resize_norm_img_chinese(img, self.image_shape)
+            norm_img, valid_ratio = resize_norm_img_chinese(img,  # self.padding is not passed on this path
+                                                            self.image_shape,
+                                                            self.norm_before_pad,
+                                                            self.mean,
+                                                            self.std
+                                                            )
         else:
-            norm_img, valid_ratio = resize_norm_img(img, self.image_shape, self.padding)
-        data["image"] = norm_img
-        data["valid_ratio"] = valid_ratio
-        # TODO: data['shape_list'] = ?
+            norm_img, valid_ratio = resize_norm_img(img,
+                                                    self.image_shape,
+                                                    self.padding,
+                                                    self.norm_before_pad,
+                                                    self.mean,
+                                                    self.std,
+                                                    )
+        data['image'] = norm_img
+        data['valid_ratio'] = valid_ratio
         return data
 
 
+# TODO: remove infer_mode and character_dict_path if they are not necessary
+class RecResizeImg(RecResizeNormImg):
+    '''
+    Kept for compatibility with older code that uses RecResizeImg: same as RecResizeNormImg but with identity
+    normalization (mean 0, std 1). TODO: replace RecResizeImg followed by NormalizeImage in yaml files with RecResizeNormImg op.
+    '''
+    def __init__(self, image_shape, infer_mode=False, character_dict_path=None, padding=True, **kwargs):
+        # super() must be called to obtain the parent proxy; mean=0 / std=1 leave pixel values unscaled.
+        super().__init__(
+            image_shape, infer_mode, character_dict_path, padding,
+            norm_before_pad=False, mean=[0., 0., 0.], std=[1., 1., 1.],
+        )
+
+
 class SVTRRecResizeImg(object):
     def __init__(self, image_shape, padding=True, **kwargs):
         self.image_shape = image_shape
@@ -425,9 +478,7 @@ def __call__(self, data):
 
         # TODO: norm before padding
 
-        data["shape_list"] = np.array(
-            [h, w, resize_h / h, resize_w / w], dtype=np.float32
-        )  # TODO: reformat, currently align to det
+        data['shape_list'] = [h, w, resize_h / h, resize_w / w] # TODO: reformat, currently align to det
         if self.norm_before_pad:
             resized_img = self.norm(resized_img)