|
| 1 | +# Copyright (c) OpenMMLab. All rights reserved. |
| 2 | +import json |
| 3 | +from typing import Dict, List, Tuple |
| 4 | + |
| 5 | +import numpy as np |
| 6 | + |
| 7 | +from ..data_preparer import DATA_PARSERS |
| 8 | +from .base import BaseParser |
| 9 | + |
| 10 | + |
@DATA_PARSERS.register_module()
class NAFAnnParser(BaseParser):
    """NAF dataset parser.

    The original annotation format of this dataset is stored in json files,
    which has the following keys that will be used here:
    - 'textBBs': List of text bounding box objects
        - 'poly_points': list of [x,y] pairs, the box corners going
          top-left,top-right,bottom-right,bottom-left
        - 'id': id of the textBB, used to match with the text
    - 'transcriptions': Dict of transcription objects, use the 'id' key
      to match with the textBB.

    Some special characters are used in the transcription:
    "«text»" indicates that "text" had a strikethrough
    "¿" indicates the transcriber could not read a character
    "§" indicates the whole line or word was illegible
    "" (empty string) is if the field was blank

    Args:
        data_root (str): Path to the dataset root.
        ignore (list(str)): The text of the ignored instances. Default: ['#'].
        det (bool): Whether to parse the detection annotation. Default: True.
            If False, the parser will consider special case in NAF dataset
            where the transcription is not available.
        nproc (int): Number of processes to load the data. Default: 1.
    """

    def __init__(self,
                 data_root: str,
                 ignore: List[str] = ['#'],
                 det: bool = True,
                 nproc: int = 1) -> None:
        self.ignore = ignore
        self.det = det
        super().__init__(data_root=data_root, nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple[str, List[Dict]]:
        """Convert a single annotation file into instance dicts.

        Args:
            file (Tuple): A (image path, json annotation path) pair.
            split (str): Name of the current split (unused here; kept for
                the BaseParser interface).

        Returns:
            Tuple[str, List[Dict]]: The image path and a list of instance
            dicts with keys 'poly', 'text' and 'ignore'.
        """
        img_file, json_file = file
        instances = [
            dict(poly=poly, text=text, ignore=text in self.ignore)
            for poly, text in self.loader(json_file)
        ]

        return img_file, instances

    def loader(self, file_path: str) -> Iterator[Tuple[List, str]]:
        """Load the annotation of the NAF dataset.

        Args:
            file_path (str): Path to the json file.

        Yields:
            Tuple[List, str]: The flattened 8-value polygon and the
            transcription text (``None`` when only detection is parsed).
        """
        with open(file_path, 'r') as f:
            data = json.load(f)

        # 'textBBs' contains the printed texts of the table while 'fieldBBs'
        # contains the text filled by human.
        for box_type in ['textBBs', 'fieldBBs']:
            if not self.det and box_type == 'textBBs':
                # 'textBBs' is only used for the detection task.
                continue
            for anno in data[box_type]:
                if self.det:
                    # Skip blank fields; only 'fieldBBs' entries carry a
                    # 'type' key, so guard on the box type first.
                    if box_type == 'fieldBBs' and anno['type'] == 'blank':
                        continue
                    # Since the detection task only needs the polygon, the
                    # transcription part (which can be empty) is skipped.
                    text = None
                else:
                    # For tasks that need the transcription, the NAF dataset
                    # has several special cases:
                    # 1. The transcription for the whole image is not
                    #    available.
                    # 2. The transcription for a certain text is not
                    #    available.
                    # 3. If the length of the transcription is 0, it should
                    #    be ignored.
                    if 'transcriptions' not in data:
                        break
                    if anno['id'] not in data['transcriptions']:
                        continue
                    text = data['transcriptions'][anno['id']]
                    text = text.strip(
                        '\u202a')  # Remove unicode control character
                    text = text.replace('»', '').replace(
                        '«', '')  # Remove strikethrough flag
                    if len(text) == 0:
                        continue
                # reshape(1, 8) also validates that exactly four corner
                # points were provided.
                poly = np.array(anno['poly_points']).reshape(1,
                                                             8)[0].tolist()
                yield poly, text
0 commit comments