import logging
import os

import numpy as np
import paddle
from paddle.io import Dataset

logger = logging.getLogger(__name__)

class FunsdDataset(Dataset):
    def __init__(self, args, tokenizer, labels, pad_token_label_id, mode):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf.
            # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # pad on the left for xlnet
            pad_on_left=bool(args.model_type in ["xlnet"]),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
            model_type=args.model_type)

        self.features = features
        # Convert to Tensors and build dataset
        self.all_input_ids = paddle.to_tensor(
            [f.input_ids for f in features], dtype="int64")
        self.all_input_mask = paddle.to_tensor(
            [f.input_mask for f in features], dtype="int64")
        self.all_segment_ids = paddle.to_tensor(
            [f.segment_ids for f in features], dtype="int64")
        self.all_label_ids = paddle.to_tensor(
            [f.label_ids for f in features], dtype="int64")
        self.all_bboxes = paddle.to_tensor(
            [f.boxes for f in features], dtype="int64")

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return (
            self.all_input_ids[index],
            self.all_input_mask[index],
            self.all_segment_ids[index],
            self.all_label_ids[index],
            self.all_bboxes[index], )

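# Usage sketch (illustrative, not part of the original file): `args`,
# `tokenizer`, and `labels` are assumed to come from the surrounding training
# script, and `pad_token_label_id` is typically the index the loss ignores
# (e.g. the ignore_index passed to paddle.nn.CrossEntropyLoss):
#
#     from paddle.io import DataLoader
#
#     train_dataset = FunsdDataset(args, tokenizer, labels,
#                                  pad_token_label_id=-100, mode="train")
#     train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
#     for input_ids, input_mask, segment_ids, label_ids, bboxes in train_loader:
#         ...  # forward/backward pass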


class InputExample(object):
    """A single training/test example for token classification."""

    def __init__(self, guid, words, labels, boxes, actual_bboxes, file_name,
                 page_size):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence.
                This should be specified for train and dev examples, but not
                for test examples.
            boxes: list. The 0-1000 normalized bounding box of each word.
            actual_bboxes: list. The actual (pixel) bounding box of each word.
            file_name: Name of the image file the example comes from.
            page_size: [width, height] of the page image.
        """
        self.guid = guid
        self.words = words
        self.labels = labels
        self.boxes = boxes
        self.actual_bboxes = actual_bboxes
        self.file_name = file_name
        self.page_size = page_size


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(
            self,
            input_ids,
            input_mask,
            segment_ids,
            label_ids,
            boxes,
            actual_bboxes,
            file_name,
            page_size, ):
        # Check every coordinate of every box; `0 <= all(boxes) <= 1000` would
        # only compare a boolean against the bounds.
        for box in boxes:
            assert all(
                0 <= coord <= 1000 for coord in box
            ), "Error with input bbox ({}): the coordinate value is not between 0 and 1000".format(
                box)
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
        self.boxes = boxes
        self.actual_bboxes = actual_bboxes
        self.file_name = file_name
        self.page_size = page_size
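# Note: the boxes consumed here follow the LayoutLM convention of pixel
# coordinates scaled into [0, 1000]. A helper like the one below is commonly
# used to produce them (a sketch; `normalize_bbox` is not part of this file):
#
#     def normalize_bbox(bbox, width, height):
#         return [int(1000 * bbox[0] / width), int(1000 * bbox[1] / height),
#                 int(1000 * bbox[2] / width), int(1000 * bbox[3] / height)]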


def read_examples_from_file(data_dir, mode):
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    box_file_path = os.path.join(data_dir, "{}_box.txt".format(mode))
    image_file_path = os.path.join(data_dir, "{}_image.txt".format(mode))
    guid_index = 1
    examples = []
    with open(
            file_path, encoding="utf-8") as f, open(
                box_file_path, encoding="utf-8") as fb, open(
                    image_file_path, encoding="utf-8") as fi:
        words = []
        boxes = []
        actual_bboxes = []
        file_name = None
        page_size = None
        labels = []
        for line, bline, iline in zip(f, fb, fi):
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(
                        InputExample(
                            guid="{}-{}".format(mode, guid_index),
                            words=words,
                            labels=labels,
                            boxes=boxes,
                            actual_bboxes=actual_bboxes,
                            file_name=file_name,
                            page_size=page_size, ))
                    guid_index += 1
                    words = []
                    boxes = []
                    actual_bboxes = []
                    file_name = None
                    page_size = None
                    labels = []
            else:
                splits = line.split("\t")
                bsplits = bline.split("\t")
                isplits = iline.split("\t")
                # The label column may be absent for mode = "test".
                assert len(splits) in (1, 2)
                assert len(bsplits) == 2
                assert len(isplits) == 4
                assert splits[0] == bsplits[0]
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
                # Boxes, page size and file name are collected for every word,
                # labeled or not, so they stay aligned with `words`.
                box = bsplits[-1].replace("\n", "")
                box = [int(b) for b in box.split()]
                boxes.append(box)
                actual_bbox = [int(b) for b in isplits[1].split()]
                actual_bboxes.append(actual_bbox)
                page_size = [int(i) for i in isplits[2].split()]
                file_name = isplits[3].strip()
        if words:
            examples.append(
                InputExample(
                    guid="{}-{}".format(mode, guid_index),
                    words=words,
                    labels=labels,
                    boxes=boxes,
                    actual_bboxes=actual_bboxes,
                    file_name=file_name,
                    page_size=page_size, ))
    return examples

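# Expected on-disk layout, inferred from the parsing above (sample values are
# illustrative): each line of {mode}.txt is "word<TAB>label", each line of
# {mode}_box.txt is "word<TAB>x0 y0 x1 y1" with coordinates normalized to
# 0-1000, and each line of {mode}_image.txt is
# "word<TAB>x0 y0 x1 y1<TAB>width height<TAB>file_name" with pixel
# coordinates. A blank line (or "-DOCSTART-") ends a document, e.g.:
#
#     train.txt:        Date:    B-QUESTION
#     train_box.txt:    Date:    503 91 648 111
#     train_image.txt:  Date:    383 69 494 85    762 1000    00040534.png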


def convert_examples_to_features(examples,
                                 label_list,
                                 max_seq_length,
                                 tokenizer,
                                 cls_token_at_end=False,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=1,
                                 sep_token="[SEP]",
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 cls_token_box=[0, 0, 0, 0],
                                 sep_token_box=[1000, 1000, 1000, 1000],
                                 pad_token_box=[0, 0, 0, 0],
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-1,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True,
                                 model_type="bert"):
    """Loads a data file into a list of `InputFeatures`.

    `cls_token_at_end` defines the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated to the CLS token
    (0 for BERT, 2 for XLNet).
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        file_name = example.file_name
        page_size = example.page_size
        width, height = page_size
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        token_boxes = []
        actual_bboxes = []
        label_ids = []
        for word, label, box, actual_bbox in zip(
                example.words, example.labels, example.boxes,
                example.actual_bboxes):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            token_boxes.extend([box] * len(word_tokens))
            actual_bboxes.extend([actual_bbox] * len(word_tokens))
            # Use the real label id for the first token of the word, and
            # padding ids for the remaining tokens
            label_ids.extend([label_map[label]] + [pad_token_label_id] *
                             (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[:(max_seq_length - special_tokens_count)]
            token_boxes = token_boxes[:(max_seq_length - special_tokens_count)]
            actual_bboxes = actual_bboxes[:(max_seq_length -
                                            special_tokens_count)]
            label_ids = label_ids[:(max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0      0   0   1  1  1  1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0`
        # and `type=1` were learned during pre-training and are added to the
        # wordpiece embedding vector (and position vector). This is not
        # *strictly* necessary since the [SEP] token unambiguously separates
        # the sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS])
        # is used as the "sentence vector". Note that this only makes sense
        # because the entire model is fine-tuned.
        tokens += [sep_token]
        token_boxes += [sep_token_box]
        actual_bboxes += [[0, 0, width, height]]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            token_boxes += [sep_token_box]
            actual_bboxes += [[0, 0, width, height]]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            token_boxes += [cls_token_box]
            actual_bboxes += [[0, 0, width, height]]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            token_boxes = [cls_token_box] + token_boxes
            actual_bboxes = [[0, 0, width, height]] + actual_bboxes
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] *
                          padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length
                           ) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
            token_boxes = ([pad_token_box] * padding_length) + token_boxes
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length
            token_boxes += [pad_token_box] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(token_boxes) == max_seq_length

        # Models other than LayoutLM take the attention mask pre-reshaped to
        # float32 of shape [1, 1, seq_length].
        if model_type != "layoutlm":
            input_mask = np.array(input_mask)
            input_mask = np.reshape(
                input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_ids=label_ids,
                boxes=token_boxes,
                actual_bboxes=actual_bboxes,
                file_name=file_name,
                page_size=page_size, ))
    return features
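# Worked example of the label alignment performed above (the subword split is
# hypothetical): a word "Jacksonville" labeled "B-ANSWER" that tokenizes to
# ["Jack", "##son", "##ville"] contributes
#
#     tokens      : "Jack"                  "##son"             "##ville"
#     label_ids   : label_map["B-ANSWER"]   pad_token_label_id  pad_token_label_id
#     token_boxes : the same word box, repeated three times
#
# so the loss is computed only once per word, on its first subtoken.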