11# Copyright (c) OpenMMLab. All rights reserved.
22import os .path as osp
3- from typing import Dict , List , Tuple
3+ import warnings
4+ from typing import Dict , Tuple
45
56import mmcv
67
78from mmocr .registry import DATA_PACKERS
8- from .base import BasePacker
9+ from .ser_packer import SERPacker
910
1011
1112@DATA_PACKERS .register_module ()
12- class REPacker (BasePacker ):
13+ class REPacker (SERPacker ):
1314 """Relation Extraction packer. It is used to pack the parsed annotation
1415 info into the MMOCR format shown below.
1516
@@ -18,8 +19,6 @@ class REPacker(BasePacker):
1819 {
1920 "metainfo":
2021 {
21- "dataset_type": "REDataset",
22- "task_name": "re",
2322 "labels": ['answer', 'header', 'other', 'question'],
2423 "id2label": {
2524 "0": "O",
@@ -49,8 +48,8 @@ class REPacker(BasePacker):
4948 "instances":
5049 {
5150 "texts": ["绩效目标申报表(一级项目)", "项目名称", ...],
52- "bboxes ": [[906,195,1478,259],
53- [357,325,467,357], ...],
51+ "boxes ": [[906,195,1478,259],
52+ [357,325,467,357], ...],
5453 "labels": ["header", "question", ...],
5554 "linkings": [[0, 1], [2, 3], ...],
5655 "ids": [0, 1, ...],
@@ -104,75 +103,44 @@ def pack_instance(self, sample: Tuple) -> Dict:
104103 h , w = img .shape [:2 ]
105104
106105 texts_per_doc = []
107- bboxes_per_doc = []
106+ boxes_per_doc = []
108107 labels_per_doc = []
109- words_per_doc = []
110108 linking_per_doc = []
111109 id_per_doc = []
110+ has_words = all (['words' in ins for ins in instances ])
111+ if has_words :
112+ words_per_doc = []
113+ else :
114+ warnings .warn (
115+ 'Not all instance has `words` key,'
116+ 'so final MMOCR format SER instance will not have `words` key' )
117+
112118 for instance in instances :
113119 text = instance .get ('text' , None )
114120 box = instance .get ('box' , None )
115121 label = instance .get ('label' , None )
116122 linking = instance .get ('linking' , None )
117123 ins_id = instance .get ('id' , None )
118- words = instance .get ('words' , None )
119- assert text or box or label
124+ assert text or box or label or linking or ins_id
120125 texts_per_doc .append (text )
121- bboxes_per_doc .append (box )
126+ boxes_per_doc .append (box )
122127 labels_per_doc .append (label )
123- words_per_doc .append (words )
124128 linking_per_doc .append (linking )
125129 id_per_doc .append (ins_id )
130+ if has_words :
131+ words = instance .get ('words' , None )
132+ words_per_doc .append (words )
126133 packed_instances = dict (
127134 instances = dict (
128135 texts = texts_per_doc ,
129- bboxes = bboxes_per_doc ,
136+ boxes = boxes_per_doc ,
130137 labels = labels_per_doc ,
131138 linkings = linking_per_doc ,
132- ids = id_per_doc ,
133- words = words_per_doc ),
139+ ids = id_per_doc ),
134140 img_path = osp .relpath (img_path , self .data_root ),
135141 height = h ,
136142 width = w )
143+ if has_words :
144+ packed_instances ['instances' ].update ({'words' : words_per_doc })
137145
138146 return packed_instances
139-
140- def add_meta (self , sample : List ) -> Dict :
141- """Add meta information to the sample.
142-
143- Args:
144- sample (List): A list of samples of the dataset.
145-
146- Returns:
147- Dict: A dict contains the meta information and samples.
148- """
149-
150- def get_BIO_label_list (labels ):
151- bio_label_list = []
152- for label in labels :
153- if label == 'other' :
154- bio_label_list .insert (0 , 'O' )
155- else :
156- bio_label_list .append (f'B-{ label .upper ()} ' )
157- bio_label_list .append (f'I-{ label .upper ()} ' )
158- return bio_label_list
159-
160- labels = []
161- for s in sample :
162- labels += s ['instances' ]['labels' ]
163- org_label_list = list (set (labels ))
164- bio_label_list = get_BIO_label_list (org_label_list )
165-
166- meta = {
167- 'metainfo' : {
168- 'dataset_type' : 'REDataset' ,
169- 'task_name' : 're' ,
170- 'labels' : org_label_list ,
171- 'id2label' : {k : v
172- for k , v in enumerate (bio_label_list )},
173- 'label2id' : {v : k
174- for k , v in enumerate (bio_label_list )}
175- },
176- 'data_list' : sample
177- }
178- return meta
0 commit comments