Skip to content

Commit 1413b50

Browse files
[Feature] CodeCamp #116 Add SROIE to dataset preparer (#1639)
* added sroie/metafile.yml * add sample_anno.md and textdet.py * modify and add all * fix lint * fix lint * fix lint * Update mmocr/datasets/preparers/data_converter.py Co-authored-by: Tong Gao <[email protected]> * fix the reviewed * add comment of try to sroie_parser.py * modify data_obtainer.py * fix lint errors * fix download link Co-authored-by: Tong Gao <[email protected]>
1 parent b79382c commit 1413b50

File tree

8 files changed

+183
-1
lines changed

8 files changed

+183
-1
lines changed

dataset_zoo/sroie/metafile.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
Name: 'Scanned Receipts OCR and Information Extraction'
2+
Paper:
3+
Title: ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction
4+
URL: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8977955
5+
Venue: ICDAR
6+
Year: '2019'
7+
BibTeX: '@INPROCEEDINGS{8977955,
8+
author={Huang, Zheng and Chen, Kai and He, Jianhua and Bai, Xiang and Karatzas, Dimosthenis and Lu, Shijian and Jawahar, C. V.},
9+
booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
10+
title={ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction},
11+
year={2019},
12+
volume={},
13+
number={},
14+
pages={1516-1520},
15+
doi={10.1109/ICDAR.2019.00244}}'
16+
Data:
17+
Website: https://rrc.cvc.uab.es/?ch=13
18+
Language:
19+
- English
20+
Scene:
21+
- Document
22+
Granularity:
23+
- Word
24+
Tasks:
25+
- textdet
26+
- textrecog
27+
- textspotting
28+
License:
29+
Type: CC BY 4.0
30+
Link: https://creativecommons.org/licenses/by/4.0/
31+
Format: .txt

dataset_zoo/sroie/sample_anno.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
**Text Detection, Text Recognition and Text Spotting**
2+
3+
```text
4+
# x1,y1,x2,y2,x3,y3,x4,y4,trans
5+
6+
72,25,326,25,326,64,72,64,TAN WOON YANN
7+
50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
8+
205,121,285,121,285,139,205,139,789417-W
9+
```

dataset_zoo/sroie/textdet.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
data_root = 'data/sroie'
2+
cache_path = 'data/cache'
3+
4+
data_obtainer = dict(
5+
type='NaiveDataObtainer',
6+
cache_path=cache_path,
7+
data_root=data_root,
8+
files=[
9+
dict(
10+
url='https://download.openmmlab.com/mmocr/data/'
11+
'sroie/0325updated.task1train(626p).zip',
12+
save_name='0325updated.task1train(626p).zip',
13+
md5='16137490f6865caac75772b9111d348c',
14+
split=['train'],
15+
content=['image', 'annotation'],
16+
mapping=[[
17+
'0325updated/0325updated.task1train(626p)/*.jpg',
18+
'textdet_imgs/train'
19+
],
20+
[
21+
'0325updated/0325updated.task1train(626p)/*.txt',
22+
'annotations/train'
23+
]]),
24+
dict(
25+
url='https://download.openmmlab.com/mmocr/data/'
26+
'sroie/task1&2_test(361p).zip',
27+
save_name='task1&2_test(361p).zip',
28+
md5='1bde54705db0995c57a6e34cce437fea',
29+
split=['test'],
30+
content=['image'],
31+
mapping=[[
32+
'task1&2_test(361p)/fulltext_test(361p)', 'textdet_imgs/test'
33+
]]),
34+
dict(
35+
url='https://download.openmmlab.com/mmocr/data/sroie/text.zip',
36+
save_name='text.zip',
37+
md5='8c534653f252ff4d3943fa27a956a74b',
38+
split=['test'],
39+
content=['annotation'],
40+
mapping=[['text', 'annotations/test']]),
41+
])
42+
43+
data_converter = dict(
44+
type='TextDetDataConverter',
45+
splits=['train', 'test'],
46+
data_root=data_root,
47+
gatherer=dict(
48+
type='pair_gather',
49+
suffixes=['.jpg'],
50+
rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']),
51+
parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'),
52+
dumper=dict(type='JsonDumper'),
53+
delete=['text', 'task1&2_test(361p)', '0325updated', 'annotations'])
54+
55+
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)

dataset_zoo/sroie/textrecog.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
_base_ = ['textdet.py']
2+
3+
data_converter = dict(type='TextRecogCropConverter')
4+
5+
config_generator = dict(type='TextRecogConfigGenerator')

dataset_zoo/sroie/textspotting.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
_base_ = ['textdet.py']
2+
3+
data_converter = dict(type='TextSpottingDataConverter')
4+
5+
config_generator = dict(type='TextSpottingConfigGenerator')

mmocr/datasets/preparers/data_converter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
177177
"""
178178
files = list()
179179
for file in list_files(img_path, suffixes):
180+
if not re.match(rule[0], osp.basename(file)):
181+
continue
180182
file2 = re.sub(rule[0], rule[1], osp.basename(file))
181183
file2 = file.replace(osp.basename(file), file2)
182184
file2 = file2.replace(self.img_dir, 'annotations')

mmocr/datasets/preparers/parsers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
55
ICDARTxtTextRecogAnnParser)
66
from .naf_parser import NAFAnnParser
7+
from .sroie_parser import SROIETextDetAnnParser
78
from .svt_parser import SVTTextDetAnnParser
89
from .totaltext_parser import TotaltextTextDetAnnParser
910
from .wildreceipt_parser import WildreceiptKIEAnnParser
@@ -12,5 +13,5 @@
1213
'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
1314
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
1415
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
15-
'NAFAnnParser'
16+
'SROIETextDetAnnParser', 'NAFAnnParser'
1617
]
mmocr/datasets/preparers/parsers/sroie_parser.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Copyright (c) OpenMMLab. All rights reserved.
2+
from typing import List, Optional, Tuple
3+
4+
from mmocr.utils import bbox2poly
5+
from ..data_preparer import DATA_PARSERS
6+
from .base import BaseParser
7+
8+
9+
@DATA_PARSERS.register_module()
class SROIETextDetAnnParser(BaseParser):
    """SROIE Txt Format Text Detection Annotation Parser.

    The original annotation format of this dataset is stored in txt files,
    which is formed as the following format:
        x1, y1, x2, y2, x3, y3, x4, y4, transcription

    Args:
        separator (str): The separator between each element in a line. Defaults
            to ','.
        ignore (str): The text to be ignored. Defaults to '###'.
        format (str): The format of the annotation. Defaults to
            'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
        remove_strs (List[str], Optional): Used to remove redundant strings in
            the transcription. Defaults to None.
        mode (str, optional): The mode of the box converter. Supported modes
            are 'xywh' and 'xyxy'. Defaults to None.
    """

    def __init__(self,
                 separator: str = ',',
                 ignore: str = '###',
                 format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
                 encoding: str = 'utf-8-sig',
                 nproc: int = 1,
                 remove_strs: Optional[List[str]] = None,
                 mode: Optional[str] = None) -> None:
        self.sep = separator
        self.format = format
        self.encoding = encoding
        self.ignore = ignore
        self.mode = mode
        self.remove_strs = remove_strs
        super().__init__(nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Parse single annotation.

        Args:
            file (Tuple): A pair of ``(img_file, txt_file)``.
            split (str): The split being processed. Not used by this parser.

        Returns:
            Tuple: ``(img_file, instances)`` where ``instances`` is a list of
            dicts with the keys ``poly``, ``text`` and ``ignore``. If an
            error occurs while reading or converting the annotation file,
            a warning is emitted and only the instances parsed before the
            error are returned.
        """
        # Imported locally so this block does not rely on a module-level
        # import being added elsewhere in the file.
        import warnings

        img_file, txt_file = file
        instances = list()
        try:
            # there might be some illegal symbols in the annotation
            # which cannot be parsed by loader
            for anno in self.loader(txt_file, self.sep, self.format,
                                    self.encoding):
                values = list(anno.values())
                if self.remove_strs is not None:
                    # str.replace is a no-op when the substring is absent,
                    # so no membership test is needed first.
                    for strs in self.remove_strs:
                        values = [value.replace(strs, '') for value in values]
                # Every field except the last is a coordinate; the last one
                # is the transcription.
                poly = list(map(float, values[:-1]))
                if self.mode is not None:
                    poly = bbox2poly(poly, self.mode).tolist()
                text = values[-1]
                instances.append(
                    dict(poly=poly, text=text, ignore=text == self.ignore))
        except Exception as err:
            # Best-effort parsing is kept on purpose (see comment above), but
            # the failure is reported instead of being silently swallowed so
            # that truncated annotations can be traced back to their file.
            warnings.warn(
                f'Failed to fully parse annotation file {txt_file!r} '
                f'({err!r}); keeping the {len(instances)} instance(s) '
                'parsed before the error.')

        return img_file, instances

0 commit comments

Comments
 (0)