Skip to content

Commit deb96cc

Browse files
committed
优化 SER/RE packer:根据 words 关键字是否存在决定是否加入
1 parent 1d0c5e3 commit deb96cc

File tree

2 files changed

+46
-72
lines changed

2 files changed

+46
-72
lines changed

mmocr/datasets/preparers/packers/re_packer.py

Lines changed: 24 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
import os.path as osp
3-
from typing import Dict, List, Tuple
3+
import warnings
4+
from typing import Dict, Tuple
45

56
import mmcv
67

78
from mmocr.registry import DATA_PACKERS
8-
from .base import BasePacker
9+
from .ser_packer import SERPacker
910

1011

1112
@DATA_PACKERS.register_module()
12-
class REPacker(BasePacker):
13+
class REPacker(SERPacker):
1314
"""Relation Extraction packer. It is used to pack the parsed annotation
1415
info to.
1516
@@ -18,8 +19,6 @@ class REPacker(BasePacker):
1819
{
1920
"metainfo":
2021
{
21-
"dataset_type": "REDataset",
22-
"task_name": "re",
2322
"labels": ['answer', 'header', 'other', 'question'],
2423
"id2label": {
2524
"0": "O",
@@ -49,8 +48,8 @@ class REPacker(BasePacker):
4948
"instances":
5049
{
5150
"texts": ["绩效目标申报表(一级项目)", "项目名称", ...],
52-
"bboxes": [[906,195,1478,259],
53-
[357,325,467,357], ...],
51+
"boxes": [[906,195,1478,259],
52+
[357,325,467,357], ...],
5453
"labels": ["header", "question", ...],
5554
"linkings": [[0, 1], [2, 3], ...],
5655
"ids": [0, 1, ...],
@@ -104,75 +103,44 @@ def pack_instance(self, sample: Tuple) -> Dict:
104103
h, w = img.shape[:2]
105104

106105
texts_per_doc = []
107-
bboxes_per_doc = []
106+
boxes_per_doc = []
108107
labels_per_doc = []
109-
words_per_doc = []
110108
linking_per_doc = []
111109
id_per_doc = []
110+
has_words = all(['words' in ins for ins in instances])
111+
if has_words:
112+
words_per_doc = []
113+
else:
114+
warnings.warn(
115+
'Not all instance has `words` key,'
116+
'so final MMOCR format SER instance will not have `words` key')
117+
112118
for instance in instances:
113119
text = instance.get('text', None)
114120
box = instance.get('box', None)
115121
label = instance.get('label', None)
116122
linking = instance.get('linking', None)
117123
ins_id = instance.get('id', None)
118-
words = instance.get('words', None)
119-
assert text or box or label
124+
assert text or box or label or linking or ins_id
120125
texts_per_doc.append(text)
121-
bboxes_per_doc.append(box)
126+
boxes_per_doc.append(box)
122127
labels_per_doc.append(label)
123-
words_per_doc.append(words)
124128
linking_per_doc.append(linking)
125129
id_per_doc.append(ins_id)
130+
if has_words:
131+
words = instance.get('words', None)
132+
words_per_doc.append(words)
126133
packed_instances = dict(
127134
instances=dict(
128135
texts=texts_per_doc,
129-
bboxes=bboxes_per_doc,
136+
boxes=boxes_per_doc,
130137
labels=labels_per_doc,
131138
linkings=linking_per_doc,
132-
ids=id_per_doc,
133-
words=words_per_doc),
139+
ids=id_per_doc),
134140
img_path=osp.relpath(img_path, self.data_root),
135141
height=h,
136142
width=w)
143+
if has_words:
144+
packed_instances['instances'].update({'words': words_per_doc})
137145

138146
return packed_instances
139-
140-
def add_meta(self, sample: List) -> Dict:
141-
"""Add meta information to the sample.
142-
143-
Args:
144-
sample (List): A list of samples of the dataset.
145-
146-
Returns:
147-
Dict: A dict contains the meta information and samples.
148-
"""
149-
150-
def get_BIO_label_list(labels):
151-
bio_label_list = []
152-
for label in labels:
153-
if label == 'other':
154-
bio_label_list.insert(0, 'O')
155-
else:
156-
bio_label_list.append(f'B-{label.upper()}')
157-
bio_label_list.append(f'I-{label.upper()}')
158-
return bio_label_list
159-
160-
labels = []
161-
for s in sample:
162-
labels += s['instances']['labels']
163-
org_label_list = list(set(labels))
164-
bio_label_list = get_BIO_label_list(org_label_list)
165-
166-
meta = {
167-
'metainfo': {
168-
'dataset_type': 'REDataset',
169-
'task_name': 're',
170-
'labels': org_label_list,
171-
'id2label': {k: v
172-
for k, v in enumerate(bio_label_list)},
173-
'label2id': {v: k
174-
for k, v in enumerate(bio_label_list)}
175-
},
176-
'data_list': sample
177-
}
178-
return meta

mmocr/datasets/preparers/packers/ser_packer.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
import os.path as osp
3+
import warnings
34
from typing import Dict, List, Tuple
45

56
import mmcv
@@ -18,8 +19,6 @@ class SERPacker(BasePacker):
1819
{
1920
"metainfo":
2021
{
21-
"dataset_type": "SERDataset",
22-
"task_name": "ser",
2322
"labels": ['answer', 'header', 'other', 'question'],
2423
"id2label": {
2524
"0": "O",
@@ -49,8 +48,8 @@ class SERPacker(BasePacker):
4948
"instances":
5049
{
5150
"texts": ["绩效目标申报表(一级项目)", "项目名称", ...],
52-
"bboxes": [[906,195,1478,259],
53-
[357,325,467,357], ...],
51+
"boxes": [[906,195,1478,259],
52+
[357,325,467,357], ...],
5453
"labels": ["header", "question", ...],
5554
"words": [[{
5655
"box": [
@@ -100,28 +99,37 @@ def pack_instance(self, sample: Tuple) -> Dict:
10099
h, w = img.shape[:2]
101100

102101
texts_per_doc = []
103-
bboxes_per_doc = []
102+
boxes_per_doc = []
104103
labels_per_doc = []
105-
words_per_doc = []
104+
has_words = all(['words' in ins for ins in instances])
105+
if has_words:
106+
words_per_doc = []
107+
else:
108+
warnings.warn(
109+
'Not all instance has `words` key,'
110+
'so final MMOCR format SER instance will not have `words` key')
111+
106112
for instance in instances:
107113
text = instance.get('text', None)
108114
box = instance.get('box', None)
109115
label = instance.get('label', None)
110-
words = instance.get('words', None)
111116
assert text or box or label
112117
texts_per_doc.append(text)
113-
bboxes_per_doc.append(box)
118+
boxes_per_doc.append(box)
114119
labels_per_doc.append(label)
115-
words_per_doc.append(words)
120+
if has_words:
121+
words = instance.get('words', None)
122+
words_per_doc.append(words)
116123
packed_instances = dict(
117124
instances=dict(
118125
texts=texts_per_doc,
119-
bboxes=bboxes_per_doc,
120-
labels=labels_per_doc,
121-
words=words_per_doc),
126+
boxes=boxes_per_doc,
127+
labels=labels_per_doc),
122128
img_path=osp.relpath(img_path, self.data_root),
123129
height=h,
124130
width=w)
131+
if has_words:
132+
packed_instances['instances'].update({'words': words_per_doc})
125133

126134
return packed_instances
127135

@@ -135,7 +143,7 @@ def add_meta(self, sample: List) -> Dict:
135143
Dict: A dict contains the meta information and samples.
136144
"""
137145

138-
def get_BIO_label_list(labels):
146+
def get_bio_label_list(labels):
139147
bio_label_list = []
140148
for label in labels:
141149
if label == 'other':
@@ -149,12 +157,10 @@ def get_BIO_label_list(labels):
149157
for s in sample:
150158
labels += s['instances']['labels']
151159
org_label_list = list(set(labels))
152-
bio_label_list = get_BIO_label_list(org_label_list)
160+
bio_label_list = get_bio_label_list(org_label_list)
153161

154162
meta = {
155163
'metainfo': {
156-
'dataset_type': 'SERDataset',
157-
'task_name': 'ser',
158164
'labels': org_label_list,
159165
'id2label': {k: v
160166
for k, v in enumerate(bio_label_list)},

0 commit comments

Comments
 (0)