Commit b79382c (parent: e3fd570)

[Feature] CodeCamp #115 Add NAF to dataset preparer (#1609)

* add naf converter
* fix test
* update
* use fuzzy search instead
* update
* update
10 files changed: +290 −3 lines
.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -142,3 +142,4 @@ mmocr/.mim
 workdirs/
 .history/
 .dev/
+data/
```
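The new `data/` entry keeps downloaded datasets out of version control; the NAF configs below place everything under it (`data_root = 'data/naf'`, cache in `data/cache`).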

dataset_zoo/naf/metafile.yml

Lines changed: 31 additions & 0 deletions (new file)

```yaml
Name: 'NAF'
Paper:
  Title: Deep Visual Template-Free Form Parsing
  URL: https://ieeexplore.ieee.org/abstract/document/8977962
  Venue: ICDAR
  Year: '2019'
  BibTeX: '@inproceedings{davis2019deep,
    title={Deep visual template-free form parsing},
    author={Davis, Brian and Morse, Bryan and Cohen, Scott and Price, Brian and Tensmeyer, Chris},
    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
    pages={134--141},
    year={2019},
    organization={IEEE}}'
Data:
  Website: https://github.com/herobd/NAF_dataset
  Language:
    - English
  Scene:
    - Document
    - Handwritten
  Granularity:
    - Word
    - Line
  Tasks:
    - textrecog
    - textdet
    - textspotting
  License:
    Type: CDLA
    Link: https://github.com/herobd/NAF_dataset/blob/master/LICENSE
  Format: .json
```

dataset_zoo/naf/sample_anno.md

Lines changed: 6 additions & 0 deletions (new file)

**Text Detection/Recognition/Spotting**

```json
{
  "fieldBBs": [
    {"poly_points": [[435, 1406], [466, 1406], [466, 1439], [435, 1439]], "type": "fieldCheckBox", "id": "f0", "isBlank": 1},
    {"poly_points": [[435, 1444], [469, 1444], [469, 1478], [435, 1478]], "type": "fieldCheckBox", "id": "f1", "isBlank": 1}
  ],
  "textBBs": [
    {"poly_points": [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]], "type": "text", "id": "t0"},
    {"poly_points": [[492, 1336], [809, 1338], [809, 1379], [492, 1378]], "type": "text", "id": "t1"},
    {"poly_points": [[512, 1375], [798, 1376], [798, 1405], [512, 1404]], "type": "textInst", "id": "t2"}
  ],
  "imageFilename": "007182398_00026.jpg",
  "transcriptions": {
    "f0": "\u00bf\u00bf\u00bf \u00bf\u00bf\u00bf 18/1/49 \u00bf\u00bf\u00bf\u00bf\u00bf",
    "f1": "U.S. Navy 53rd. Naval Const. Batt.",
    "t0": "APPLICATION FOR HEADSTONE OR MARKER",
    "t1": "ORIGINAL"
  }
}
```
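For orientation, here is a minimal sketch (not part of the commit) of how the shared `id` keys tie `textBBs`/`fieldBBs` entries to their `transcriptions`; the file name `sample_anno.json` is a hypothetical local copy of the sample above:

```python
import json

# Load a local copy of the sample annotation (hypothetical file name).
with open('sample_anno.json') as f:
    anno = json.load(f)

# Each box carries an 'id'; transcriptions are keyed by that same id.
# Boxes without a transcription (e.g. blank fields) simply have no entry.
for box in anno['textBBs'] + anno['fieldBBs']:
    text = anno['transcriptions'].get(box['id'], '<no transcription>')
    print(box['id'], box['poly_points'], repr(text))
```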

dataset_zoo/naf/textdet.py

Lines changed: 49 additions & 0 deletions (new file)

```python
data_root = 'data/naf'
cache_path = 'data/cache'

data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        dict(
            url='https://github.com/herobd/NAF_dataset/releases/'
            'download/v1.0/labeled_images.tar.gz',
            save_name='naf_image.tar.gz',
            md5='6521cdc25c313a1f2928a16a77ad8f29',
            split=['train', 'test', 'val'],
            content=['image'],
            mapping=[['naf_image/labeled_images', 'temp_images/']]),
        dict(
            url='https://github.com/herobd/NAF_dataset/archive/'
            'refs/heads/master.zip',
            save_name='naf_anno.zip',
            md5='abf5af6266cc527d772231751bc884b3',
            split=['train', 'test', 'val'],
            content=['annotation'],
            mapping=[
                [
                    'naf_anno/NAF_dataset-master/groups/**/*.json',
                    'annotations/'
                ],
                [
                    'naf_anno/NAF_dataset-master/train_valid_test_split.json',
                    'data_split.json'
                ]
            ]),
    ])

data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test', 'val'],
    data_root=data_root,
    gatherer=dict(type='naf_gather'),
    parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
    delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
    dumper=dict(type='JsonDumper'),
    nproc=1)

config_generator = dict(
    type='TextDetConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])
```
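With this config in place, the whole download–gather–parse–dump pipeline should run through MMOCR's dataset preparer; in the 1.x layout that is typically `python tools/dataset_converters/prepare_dataset.py naf --task textdet` (invocation assumed from MMOCR's standard preparer entry point, not shown in this commit).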

dataset_zoo/naf/textrecog.py

Lines changed: 19 additions & 0 deletions (new file)

```python
# The transcriptions of the NAF dataset were produced by Tesseract OCR and
# are not accurate. The test/valid sets were hand-corrected, but the train
# set was only partially corrected, so the results are not very good. It is
# better not to use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'

data_converter = dict(
    type='TextRecogCropConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextRecogConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])
```
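Note that textrecog.py (and textspotting.py below) inherit the `data_obtainer` from textdet.py through the `_base_` mechanism and override only the converter type and parser settings, so the raw files are downloaded once and reused across tasks.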

dataset_zoo/naf/textspotting.py

Lines changed: 18 additions & 0 deletions (new file)

```python
# The transcriptions of the NAF dataset were produced by Tesseract OCR and
# are not accurate. The test/valid sets were hand-corrected, but the train
# set was only partially corrected, so the results are not very good. It is
# better not to use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'
data_converter = dict(
    type='TextSpottingDataConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextSpottingConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])
```

mmocr/datasets/preparers/data_converter.py

Lines changed: 44 additions & 1 deletion

```diff
@@ -1,5 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
+import os
 import os.path as osp
 import re
 import shutil
+import warnings
@@ -61,6 +62,8 @@ def __init__(self,
             self.gatherer = self.pair_gather
         elif gather_type == 'mono_gather':
             self.gatherer = self.mono_gather
+        elif gather_type == 'naf_gather':
+            self.gatherer = self.naf_gather
         else:
             raise NotImplementedError

@@ -181,11 +184,51 @@ def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,

         return files

+    def naf_gather(self, img_path: str, ann_path: str,
+                   **kwargs) -> List[Tuple]:
+        """Gather the dataset files of the NAF dataset. Specifically for the
+        case where a single split file contains the image names of each
+        split. For example:
+
+            img_001.jpg                              train: img_001.jpg
+            img_002.jpg  --->  data_split.json --->  test:  img_002.jpg
+            img_003.jpg                              val:   img_003.jpg
+
+        Args:
+            img_path (str): Path to the images.
+            ann_path (str): Path to the annotations.
+
+        Returns:
+            List[Tuple]: A list of tuples (img_path, ann_path).
+        """
+        split_file = osp.join(self.data_root, 'data_split.json')
+        with open(split_file, 'r') as f:
+            split_data = json.load(f)
+        files = []
+        # Rename the 'valid' key to 'val' to match MMOCR's split names
+        split_data['val'] = split_data.pop('valid')
+        if not osp.exists(img_path):
+            os.makedirs(img_path)
+        for group in split_data[self.current_split]:
+            for img_name in split_data[self.current_split][group]:
+                src_img = osp.join(self.data_root, 'temp_images', img_name)
+                dst_img = osp.join(img_path, img_name)
+                if not osp.exists(src_img):
+                    # Warn and skip missing images
+                    warnings.warn(f'{src_img} does not exist!')
+                    continue
+                # Move the image to the new path
+                shutil.move(src_img, dst_img)
+                ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
+                files.append((dst_img, ann))
+        return files
+
     def clean(self) -> None:
         for d in self.delete:
             delete_file = osp.join(self.data_root, d)
             if osp.exists(delete_file):
-                shutil.rmtree(delete_file)
+                # Handle both directories and single files in the delete list
+                if osp.isdir(delete_file):
+                    shutil.rmtree(delete_file)
+                else:
+                    os.remove(delete_file)


 @DATA_CONVERTERS.register_module()
```
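For orientation, `naf_gather` assumes `data_split.json` maps each split to named groups of image files. A hedged sketch of that shape, inferred from the loops above (group and file names are made up for illustration):

```python
# Assumed structure of data_split.json, inferred from naf_gather; the real
# file is NAF's train_valid_test_split.json, renamed on download.
split_data = {
    'train': {'group_0': ['img_001.jpg', 'img_004.jpg']},
    'valid': {'group_1': ['img_002.jpg']},  # renamed to 'val' by naf_gather
    'test': {'group_2': ['img_003.jpg']},
}
```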

mmocr/datasets/preparers/data_obtainer.py

Lines changed: 9 additions & 1 deletion

```diff
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import glob
 import os
 import os.path as osp
 import shutil
@@ -148,7 +149,14 @@ def move(self, mapping: List[Tuple[str, str]]) -> None:
         for src, dst in mapping:
             src = osp.join(self.data_root, src)
             dst = osp.join(self.data_root, dst)
-            if osp.exists(src) and not osp.exists(dst):
+
+            if '*' in src:
+                mkdir_or_exist(dst)
+                for f in glob.glob(src):
+                    if not osp.exists(osp.join(dst, osp.basename(f))):
+                        shutil.move(f, dst)
+
+            elif osp.exists(src) and not osp.exists(dst):
                 shutil.move(src, dst)

     def clean(self) -> None:
```
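This is the "fuzzy search" mentioned in the commit message: wildcard sources such as `naf_anno/NAF_dataset-master/groups/**/*.json` in textdet.py are expanded with `glob.glob` and moved file by file. Note that `glob.glob` is non-recursive by default, so `**` here matches a single directory level (i.e. `groups/<group>/<file>.json`); a more deeply nested layout would need `glob.glob(src, recursive=True)`.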

mmocr/datasets/preparers/parsers/__init__.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -3,12 +3,14 @@
 from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
+from .naf_parser import NAFAnnParser
 from .svt_parser import SVTTextDetAnnParser
 from .totaltext_parser import TotaltextTextDetAnnParser
 from .wildreceipt_parser import WildreceiptKIEAnnParser

 __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
-    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
+    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
+    'NAFAnnParser'
 ]
```
mmocr/datasets/preparers/parsers/naf_parser.py

Lines changed: 110 additions & 0 deletions (new file)

```python
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import List, Tuple

import numpy as np

from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class NAFAnnParser(BaseParser):
    """NAF dataset parser.

    The original annotations of this dataset are stored in json files,
    of which the following keys are used here:
    - 'textBBs': List of text bounding box objects
        - 'poly_points': list of [x, y] pairs, the box corners going
          top-left, top-right, bottom-right, bottom-left
        - 'id': id of the textBB, used to match with the text
    - 'transcriptions': Dict of transcription objects; use the 'id' key
      to match with the textBB.

    Some special characters are used in the transcriptions:
        "«text»" indicates that "text" had a strikethrough
        "¿" indicates the transcriber could not read a character
        "§" indicates the whole line or word was illegible
        "" (empty string) means the field was blank

    Args:
        data_root (str): Path to the dataset root.
        ignore (list(str)): The text of the ignored instances.
            Default: ['#'].
        det (bool): Whether to parse the detection annotation. Default: True.
            If False, the parser will handle the special cases in the NAF
            dataset where the transcription is not available.
        nproc (int): Number of processes to load the data. Default: 1.
    """

    def __init__(self,
                 data_root: str,
                 ignore: List[str] = ['#'],
                 det: bool = True,
                 nproc: int = 1) -> None:
        self.ignore = ignore
        self.det = det
        super().__init__(data_root=data_root, nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Convert a single annotation file."""
        img_file, json_file = file
        instances = list()
        for poly, text in self.loader(json_file):
            instances.append(
                dict(poly=poly, text=text, ignore=text in self.ignore))

        return img_file, instances

    def loader(self, file_path: str):
        """Load the annotation of a NAF json file.

        Args:
            file_path (str): Path to the json file.

        Yields:
            Tuple[List, str]: The polygon points and the transcription of
                one instance.
        """
        with open(file_path, 'r') as f:
            data = json.load(f)

        # 'textBBs' contains the printed text of the table, while 'fieldBBs'
        # contains the text filled in by hand.
        for box_type in ['textBBs', 'fieldBBs']:
            if not self.det:
                # 'textBBs' is only used for the detection task.
                if box_type == 'textBBs':
                    continue
            for anno in data[box_type]:
                # Skip blanks
                if self.det:
                    if box_type == 'fieldBBs':
                        if anno['type'] == 'blank':
                            continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                    # Since the detection task only needs the polygon, we
                    # can skip the transcription part, which may be empty.
                    text = None
                else:
                    # For tasks that need the transcription, the NAF dataset
                    # has several special cases:
                    # 1. The transcriptions for the whole image are not
                    #    available.
                    # 2. The transcription for a certain text is not
                    #    available.
                    # 3. If the length of the transcription is 0, it should
                    #    be ignored.
                    if 'transcriptions' not in data.keys():
                        break
                    if anno['id'] not in data['transcriptions'].keys():
                        continue
                    text = data['transcriptions'][anno['id']]
                    text = text.strip(
                        '\u202a')  # Remove unicode control character
                    text = text.replace('»', '').replace(
                        '«', '')  # Remove strikethrough flags
                    if len(text) == 0:
                        continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                yield poly, text
```
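A hedged usage sketch (not part of the commit) of driving the parser directly on one image/annotation pair; the paths are illustrative and assume the layout produced by the configs above:

```python
from mmocr.datasets.preparers.parsers import NAFAnnParser

# Parse one (image, annotation) pair; paths below are illustrative.
parser = NAFAnnParser(data_root='data/naf', det=True)
img_file, instances = parser.parse_file(
    ('data/naf/textdet_imgs/train/007182398_00026.jpg',
     'data/naf/annotations/007182398_00026.json'),
    split='train')
for inst in instances[:3]:
    print(inst['poly'], inst['text'], inst['ignore'])
```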
