Commit 570685f

Add LayoutLMModel (#1362)
* modify transforner-rst
* modify roformer tokenizer
* delete modifications
* modify chunk
* delete changes
* init layoutlm model
* modify layoutlmmodel
* test
* fix errors
* add layoutlmformaskedlm
* modify tokenizer
* update
* update
* add layoutlm example
* add example
* update
* update
* delete data
* update
* modify preprocess
1 parent 0ba1205 commit 570685f

File tree: 11 files changed, +1667 -0 lines changed

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# LayoutLM

## Model Introduction

This project is an open-source implementation of [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/pdf/1912.13318v5.pdf) on Paddle 2.2, and includes fine-tuning code for the [FUNSD dataset](https://github.com/doc-analysis/FUNSD).

## Quick Start

### Environment Setup

Dependencies:
- cv2
- sentencepiece
- yacs

Install commands:
```shell
pip install opencv-python
pip install sentencepiece
pip install yacs
```

### Data Preparation

The preprocessed FUNSD dataset can be downloaded from: https://bj.bcebos.com/v1/paddlenlp/datasets/FUNSD.zip

Download and extract the dataset, then place the extracted files in the current directory.
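
One possible way to fetch and unpack the data from the command line (assuming `wget` and `unzip` are available; any equivalent download/extract tools work):

```shell
wget https://bj.bcebos.com/v1/paddlenlp/datasets/FUNSD.zip
unzip FUNSD.zip
```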

### Run Fine-tuning

1. Fine-tuning for the ``Sequence Labeling`` task can be launched as follows:
```shell
bash train_funsd.sh

# Expected results:
# best metrics: {'precision': 0.7642124883504194, 'recall': 0.8204102051025512, 'f1': 0.7913148371531967}
```
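
The training script presumably builds its data pipeline around the `FunsdDataset` helper added in this commit (shown later in this diff). The sketch below shows one way such a dataset could be constructed and batched; the `LayoutLMTokenizer` checkpoint name, the `funsd` module name, the label list, and all hyperparameter values are illustrative assumptions rather than the exact settings used by `train_funsd.sh`.

```python
from types import SimpleNamespace

from paddle.io import DataLoader
from paddlenlp.transformers import LayoutLMTokenizer

# `funsd` is an assumed module name for the FunsdDataset helper added in this commit.
from funsd import FunsdDataset

# Hypothetical settings for illustration; train_funsd.sh may use different values.
args = SimpleNamespace(
    data_dir="./FUNSD",      # folder holding train.txt, train_box.txt, train_image.txt
    model_type="layoutlm",
    max_seq_length=512)

tokenizer = LayoutLMTokenizer.from_pretrained("layoutlm-base-uncased")

# Hypothetical label set; in practice the labels shipped with the preprocessed data are used.
labels = ["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"]
pad_token_label_id = -100  # index ignored by the loss on padded / non-first sub-word tokens

train_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode="train")
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

for input_ids, input_mask, segment_ids, label_ids, bboxes in train_loader:
    # input_ids / input_mask / segment_ids / label_ids: [batch_size, max_seq_length]
    # bboxes: [batch_size, max_seq_length, 4]
    break
```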

## Reference

- [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/pdf/1912.13318v5.pdf)
- [microsoft/unilm/layoutlm](https://github.com/microsoft/unilm/tree/master/layoutlm)
Lines changed: 323 additions & 0 deletions
@@ -0,0 +1,323 @@
import logging
import os

import numpy as np
import paddle
from paddle.io import Dataset

logger = logging.getLogger(__name__)


class FunsdDataset(Dataset):
    def __init__(self, args, tokenizer, labels, pad_token_label_id, mode):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
            model_type=args.model_type)

        self.features = features
        # Convert to Tensors and build dataset
        self.all_input_ids = paddle.to_tensor(
            [f.input_ids for f in features], dtype="int64")
        self.all_input_mask = paddle.to_tensor(
            [f.input_mask for f in features], dtype="int64")
        self.all_segment_ids = paddle.to_tensor(
            [f.segment_ids for f in features], dtype="int64")
        self.all_label_ids = paddle.to_tensor(
            [f.label_ids for f in features], dtype="int64")
        self.all_bboxes = paddle.to_tensor(
            [f.boxes for f in features], dtype="int64")

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return (
            self.all_input_ids[index],
            self.all_input_mask[index],
            self.all_segment_ids[index],
            self.all_label_ids[index],
            self.all_bboxes[index], )
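

# Note: a summary of the per-sample layout produced above (derived from
# convert_examples_to_features below); each item is a 5-tuple of int64 tensors:
#   input_ids    [max_seq_length]      token ids including [CLS]/[SEP] and padding
#   input_mask   [max_seq_length]      1 for real tokens, 0 for padding
#                                      (reshaped to [1, 1, max_seq_length] when model_type != "layoutlm")
#   segment_ids  [max_seq_length]      token type ids
#   label_ids    [max_seq_length]      per-token label index; pad_token_label_id on padding and trailing sub-words
#   bboxes       [max_seq_length, 4]   normalized word bounding boxes (coordinates in [0, 1000])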


class InputExample(object):
    """A single training/test example for token classification."""

    def __init__(self, guid, words, labels, boxes, actual_bboxes, file_name,
                 page_size):
        """Constructs an InputExample.
        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.words = words
        self.labels = labels
        self.boxes = boxes
        self.actual_bboxes = actual_bboxes
        self.file_name = file_name
        self.page_size = page_size


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(
            self,
            input_ids,
            input_mask,
            segment_ids,
            label_ids,
            boxes,
            actual_bboxes,
            file_name,
            page_size, ):
        # Every box coordinate must lie in the normalized [0, 1000] range.
        assert all(
            0 <= coord <= 1000 for box in boxes for coord in box
        ), "Error with input bbox ({}): the coordinate value is not between 0 and 1000".format(
            boxes)
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
        self.boxes = boxes
        self.actual_bboxes = actual_bboxes
        self.file_name = file_name
        self.page_size = page_size


def read_examples_from_file(data_dir, mode):
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    box_file_path = os.path.join(data_dir, "{}_box.txt".format(mode))
    image_file_path = os.path.join(data_dir, "{}_image.txt".format(mode))
    guid_index = 1
    examples = []
    with open(
            file_path, encoding="utf-8") as f, open(
                box_file_path, encoding="utf-8") as fb, open(
                    image_file_path, encoding="utf-8") as fi:
        words = []
        boxes = []
        actual_bboxes = []
        file_name = None
        page_size = None
        labels = []
        for line, bline, iline in zip(f, fb, fi):
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(
                        InputExample(
                            guid="{}-{}".format(mode, guid_index),
                            words=words,
                            labels=labels,
                            boxes=boxes,
                            actual_bboxes=actual_bboxes,
                            file_name=file_name,
                            page_size=page_size, ))
                    guid_index += 1
                    words = []
                    boxes = []
                    actual_bboxes = []
                    file_name = None
                    page_size = None
                    labels = []
            else:
                splits = line.split("\t")
                bsplits = bline.split("\t")
                isplits = iline.split("\t")
                assert len(splits) == 2
                assert len(bsplits) == 2
                assert len(isplits) == 4
                assert splits[0] == bsplits[0]
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                    box = bsplits[-1].replace("\n", "")
                    box = [int(b) for b in box.split()]
                    boxes.append(box)
                    actual_bbox = [int(b) for b in isplits[1].split()]
                    actual_bboxes.append(actual_bbox)
                    page_size = [int(i) for i in isplits[2].split()]
                    file_name = isplits[3].strip()
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            examples.append(
                InputExample(
                    guid="{}-{}".format(mode, guid_index),
                    words=words,
                    labels=labels,
                    boxes=boxes,
                    actual_bboxes=actual_bboxes,
                    file_name=file_name,
                    page_size=page_size, ))
    return examples
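

# The reader above assumes the layout produced by the LayoutLM FUNSD preprocessing,
# i.e. three aligned, tab-separated files per split (field values below are illustrative only):
#   {mode}.txt        one "word\tlabel" pair per line, with blank lines separating documents
#   {mode}_box.txt    "word\tx0 y0 x1 y1" with coordinates normalized to the [0, 1000] range
#   {mode}_image.txt  "word\tx0 y0 x1 y1\twidth height\tfile_name" with pixel coordinates and page size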


def convert_examples_to_features(examples,
                                 label_list,
                                 max_seq_length,
                                 tokenizer,
                                 cls_token_at_end=False,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=1,
                                 sep_token="[SEP]",
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 cls_token_box=[0, 0, 0, 0],
                                 sep_token_box=[1000, 1000, 1000, 1000],
                                 pad_token_box=[0, 0, 0, 0],
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-1,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True,
                                 model_type="bert"):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` defines the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        file_name = example.file_name
        page_size = example.page_size
        width, height = page_size
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        token_boxes = []
        actual_bboxes = []
        label_ids = []
        for word, label, box, actual_bbox in zip(example.words, example.labels,
                                                 example.boxes,
                                                 example.actual_bboxes):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            token_boxes.extend([box] * len(word_tokens))
            actual_bboxes.extend([actual_bbox] * len(word_tokens))
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(
                word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[:(max_seq_length - special_tokens_count)]
            token_boxes = token_boxes[:(max_seq_length - special_tokens_count)]
            actual_bboxes = actual_bboxes[:(max_seq_length -
                                            special_tokens_count)]
            label_ids = label_ids[:(max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1  1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        token_boxes += [sep_token_box]
        actual_bboxes += [[0, 0, width, height]]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            token_boxes += [sep_token_box]
            actual_bboxes += [[0, 0, width, height]]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            token_boxes += [cls_token_box]
            actual_bboxes += [[0, 0, width, height]]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            token_boxes = [cls_token_box] + token_boxes
            actual_bboxes = [[0, 0, width, height]] + actual_bboxes
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length
                          ) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length
                           ) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
            token_boxes = ([pad_token_box] * padding_length) + token_boxes
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length
            token_boxes += [pad_token_box] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(token_boxes) == max_seq_length

        if model_type != "layoutlm":
            # For models other than LayoutLM in this example, reshape the mask
            # to a float array of shape [1, 1, seq_length].
            input_mask = np.array(input_mask)
            input_mask = np.reshape(
                input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_ids=label_ids,
                boxes=token_boxes,
                actual_bboxes=actual_bboxes,
                file_name=file_name,
                page_size=page_size, ))
    return features
