Skip to content

Commit 4b88767

Browse files
authored
[Fix] MJSynth & SynthText Dataset Preparer config (#1805)
* [Fix] MJSynth
* update
* fix
* fix
1 parent bb591d2 commit 4b88767

File tree

5 files changed

+76
-5
lines changed

5 files changed

+76
-5
lines changed

configs/textrecog/_base_/datasets/mjsynth.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,9 @@
55
data_root=mjsynth_textrecog_data_root,
66
ann_file='textrecog_train.json',
77
pipeline=None)
8+
9+
# Subset of the MJSynth training annotations: uses the same data root as the
# full training entry, but reads 'subset_textrecog_train.json'. The pipeline
# is left as None here.
mjsynth_sub_textrecog_train = dict(
10+
type='OCRDataset',
11+
data_root=mjsynth_textrecog_data_root,
12+
ann_file='subset_textrecog_train.json',
13+
pipeline=None)

dataset_zoo/mjsynth/textrecog.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,19 @@
2323
'annotations/annotation.txt'
2424
]
2525
]),
26+
dict(
27+
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
28+
'Syn90k/subset_textrecog_train.json',
29+
save_name='subset_textrecog_train.json',
30+
md5='ba958d87bb170980f39e194180c15b9e',
31+
split=['train'],
32+
content=['annotation'])
2633
]),
2734
gatherer=dict(type='MonoGatherer', ann_name='annotation.txt'),
2835
parser=dict(
29-
type='ICDARTxtTextRecogAnnParser',
36+
type='MJSynthAnnParser',
3037
separator=' ',
31-
format='img text',
38+
format='img num',
3239
remove_strs=None),
3340
packer=dict(type='TextRecogPacker'),
3441
dumper=dict(type='JsonDumper'),
@@ -37,4 +44,10 @@
3744
delete = ['mjsynth', 'annotations']
3845

3946
config_generator = dict(
40-
type='TextRecogConfigGenerator', data_root=data_root, test_anns=None)
47+
type='TextRecogConfigGenerator',
48+
data_root=data_root,
49+
train_anns=[
50+
dict(ann_file='textrecog_train.json', dataset_postfix=''),
51+
dict(ann_file='subset_textrecog_train.json', dataset_postfix='sub'),
52+
],
53+
test_anns=None)

dataset_zoo/synthtext/textrecog.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,5 @@
2727
dict(
2828
ann_file='alphanumeric_textrecog_train.json',
2929
dataset_postfix='an'),
30-
])
30+
],
31+
test_anns=None)

mmocr/datasets/preparers/parsers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from .funsd_parser import FUNSDTextDetAnnParser
66
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
77
ICDARTxtTextRecogAnnParser)
8+
from .mjsynth_parser import MJSynthAnnParser
89
from .naf_parser import NAFAnnParser
910
from .sroie_parser import SROIETextDetAnnParser
1011
from .svt_parser import SVTTextDetAnnParser
@@ -17,5 +18,5 @@
1718
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
1819
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
1920
'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser',
20-
'SynthTextAnnParser'
21+
'SynthTextAnnParser', 'MJSynthAnnParser'
2122
]
mmocr/datasets/preparers/parsers/mjsynth_parser.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright (c) OpenMMLab. All rights reserved.
2+
import os.path as osp
3+
from typing import List
4+
5+
from mmocr.registry import DATA_PARSERS
6+
from .icdar_txt_parser import ICDARTxtTextRecogAnnParser
7+
8+
9+
@DATA_PARSERS.register_module()
class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
    """Text recognition annotation parser for the MJSynth dataset.

    Annotation lines are read through ``self.loader`` with the configured
    ``format`` and ``separator``, and only the ``img`` field of each line is
    used. The transcription is not taken from the annotation line; it is
    recovered from the image file name as the second ``_``-separated piece
    of the basename (e.g. ``1_word_123.jpg`` yields ``word``).

    The constructor is inherited from ``ICDARTxtTextRecogAnnParser``. The
    defaults listed below are the ones documented for this class and should
    be confirmed against that parent class.

    Args:
        separator (str): The separator between each element in a line.
            Defaults to ','.
        ignore (str): The text to be ignored. Defaults to '#'.
        format (str): The format of the annotation. Defaults to 'img, text'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation.
            Defaults to 1.
        base_name (bool): Whether to use the basename of the image path as
            the image name. Defaults to False.
        remove_strs (List[str], Optional): Used to remove redundant strings
            in the transcription. Defaults to ['"'].
    """

    def parse_files(self, img_dir: str, ann_path: str) -> List:
        """Build ``(image path, text)`` samples from one annotation file.

        Args:
            img_dir (str): Directory joined in front of every image path.
            ann_path (str): Path of the annotation file given to the loader.

        Returns:
            List: Tuples of ``(osp.join(img_dir, img), text)``. Entries whose
            text equals ``self.ignore`` are left out.
        """
        assert isinstance(ann_path, str)
        records = self.loader(
            file_path=ann_path,
            format=self.format,
            encoding=self.encoding,
            separator=self.sep)

        parsed = []
        for record in records:
            rel_path = record['img']
            # The label is embedded in the file name, between the first and
            # second underscore of the basename.
            label = osp.basename(rel_path).split('_')[1]
            if self.remove_strs is not None:
                for unwanted in self.remove_strs:
                    label = label.replace(unwanted, '')
            if label != self.ignore:
                parsed.append((osp.join(img_dir, rel_path), label))

        return parsed

0 commit comments

Comments
 (0)