
Commit beed3da

Modify Layoutlm example (#1418)
* modify transformer-rst
* modify roformer tokenizer
* delete modifications
* modify chunk
* delete changes
* init layoutlm model
* modify LayoutLMModel
* test
* fix errors
* add LayoutLMForMaskedLM
* modify tokenizer
* update
* update
* add layoutlm example
* add example
* update
* update
* delete data
* update
* modify preprocess
* update
* modify loss
* modify example
* find error
* fix errors
* fix errors
1 parent 8268726 commit beed3da

File tree: 6 files changed (+174 −115 lines)


examples/multimodal/layoutlm/README.md

Lines changed: 6 additions & 0 deletions
````diff
@@ -32,6 +32,12 @@ pip install yacs
 # best metrics: {'precision': 0.7642124883504194, 'recall': 0.8204102051025512, 'f1': 0.7913148371531967}
 ```
 
+### Data Preprocessing
+FUNSD is a commonly used form understanding dataset. The original data can be downloaded from https://guillaumejaume.github.io/FUNSD/dataset.zip.
+It contains two subfolders, training_data and testing_data, with 149 training documents and 50 test documents. Preprocess the data as follows:
+```shell
+bash preprocess.sh
+```
 
 ## Reference
 - [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/pdf/1912.13318v5.pdf)
````
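For context, the annotations that `preprocess.sh` flattens follow the standard FUNSD schema. A minimal sketch of reading one annotation file (not part of this commit; the file name is hypothetical):

```python
import json

# Hypothetical path; every document under training_data/annotations/ is a JSON
# file whose "form" list carries the text, label, and bounding box per entity.
with open("data/training_data/annotations/example.json") as f:
    doc = json.load(f)

for entity in doc["form"]:
    # label is one of "question" / "answer" / "header" / "other";
    # box is [x0, y0, x1, y1] in page coordinates.
    print(entity["label"], entity["text"], entity["box"])
```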

examples/multimodal/layoutlm/funsd.py

Lines changed: 25 additions & 40 deletions
```diff
@@ -17,19 +17,15 @@ def __init__(self, args, tokenizer, labels, pad_token_label_id, mode):
             labels,
             args.max_seq_length,
             tokenizer,
-            cls_token_at_end=bool(args.model_type in ["xlnet"]),
-            # xlnet has a cls token at the end
+            cls_token_at_end=False,
             cls_token=tokenizer.cls_token,
-            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
+            cls_token_segment_id=0,
             sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args.model_type in ["roberta"]),
-            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-            pad_on_left=bool(args.model_type in ["xlnet"]),
-            # pad on the left for xlnet
+            sep_token_extra=False,
+            pad_on_left=False,
             pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-            pad_token_label_id=pad_token_label_id,
-            model_type=args.model_type)
+            pad_token_segment_id=0,
+            pad_token_label_id=pad_token_label_id, )
 
         self.features = features
         # Convert to Tensors and build dataset
@@ -173,31 +169,25 @@ def read_examples_from_file(data_dir, mode):
     return examples
 
 
-def convert_examples_to_features(examples,
-                                 label_list,
-                                 max_seq_length,
-                                 tokenizer,
-                                 cls_token_at_end=False,
-                                 cls_token="[CLS]",
-                                 cls_token_segment_id=1,
-                                 sep_token="[SEP]",
-                                 sep_token_extra=False,
-                                 pad_on_left=False,
-                                 pad_token=0,
-                                 cls_token_box=[0, 0, 0, 0],
-                                 sep_token_box=[1000, 1000, 1000, 1000],
-                                 pad_token_box=[0, 0, 0, 0],
-                                 pad_token_segment_id=0,
-                                 pad_token_label_id=-1,
-                                 sequence_a_segment_id=0,
-                                 mask_padding_with_zero=True,
-                                 model_type="bert"):
-    """ Loads a data file into a list of `InputBatch`s
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
-    """
+def convert_examples_to_features(
+        examples,
+        label_list,
+        max_seq_length,
+        tokenizer,
+        cls_token_at_end=False,
+        cls_token="[CLS]",
+        cls_token_segment_id=1,
+        sep_token="[SEP]",
+        sep_token_extra=False,
+        pad_on_left=False,
+        pad_token=0,
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[1000, 1000, 1000, 1000],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_segment_id=0,
+        pad_token_label_id=-1,
+        sequence_a_segment_id=0,
+        mask_padding_with_zero=True, ):
 
     label_map = {label: i for i, label in enumerate(label_list)}
 
@@ -305,11 +295,6 @@ def convert_examples_to_features(examples,
         assert len(label_ids) == max_seq_length
         assert len(token_boxes) == max_seq_length
 
-        if model_type != "layoutlm":
-            input_mask = np.array(input_mask)
-            input_mask = np.reshape(
-                input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])
-
         features.append(
             InputFeatures(
                 input_ids=input_ids,
```
examples/multimodal/layoutlm/train_funsd.py

Lines changed: 32 additions & 39 deletions
```diff
@@ -46,7 +46,8 @@ def train(args):
         level=logging.INFO
         if paddle.distributed.get_rank() == 0 else logging.WARN, )
 
-    labels = get_labels(args.labels)
+    all_labels = get_labels(args.labels)
+
     pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
 
     tokenizer = LayoutLMTokenizer.from_pretrained(args.model_name_or_path)
@@ -59,10 +60,10 @@
     else:
         model = LayoutLMModel.from_pretrained(args.model_name_or_path)
         model = LayoutLMForTokenClassification(
-            model, num_classes=len(labels), dropout=None)
+            model, num_classes=len(all_labels), dropout=None)
 
     train_dataset = FunsdDataset(
-        args, tokenizer, labels, pad_token_label_id, mode="train")
+        args, tokenizer, all_labels, pad_token_label_id, mode="train")
     train_sampler = paddle.io.DistributedBatchSampler(
         train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
 
@@ -95,7 +96,9 @@
         epsilon=args.adam_epsilon,
         weight_decay=args.weight_decay)
 
-    # Train!
+    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=pad_token_label_id)
+
+    # Train
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
@@ -115,33 +118,26 @@
         int(args.num_train_epochs),
         desc="Epoch",
         disable=args.local_rank not in [-1, 0])
-    set_seed(
-        args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)
     for _ in train_iterator:
         epoch_iterator = tqdm(
             train_dataloader,
             desc="Iteration",
             disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
-            # model.eval()
             model.train()
             inputs = {
                 "input_ids": batch[0],
                 "attention_mask": batch[1],
-                "labels": batch[3],
+                "token_type_ids": batch[2],
+                "bbox": batch[4],
             }
-            if args.model_type in ["layoutlm"]:
-                inputs["bbox"] = batch[4]
-            inputs["token_type_ids"] = (
-                batch[2] if args.model_type in ["bert", "layoutlm"] else
-                None)  # RoBERTa don"t use segment_ids
-
-            outputs = model(**inputs)
-            # model outputs are always tuple in ppnlp (see doc)
-            loss = outputs[0]
+            labels = batch[3]
+            logits = model(**inputs)
+            loss = loss_fct(
+                logits.reshape([-1, len(all_labels)]), labels.reshape([-1, ]))
 
             loss = loss.mean()
-
             logger.info("train loss: {}".format(loss.numpy()))
             loss.backward()
 
@@ -162,7 +158,8 @@
                         args,
                         model,
                         tokenizer,
-                        labels,
+                        all_labels,
+                        loss_fct,
                         pad_token_label_id,
                         mode="test", )
                     logger.info("results: {}".format(results))
@@ -194,21 +191,21 @@
 def evaluate(args,
              model,
              tokenizer,
-             labels,
+             all_labels,
+             loss_fct,
              pad_token_label_id,
              mode,
              prefix=""):
     eval_dataset = FunsdDataset(
-        args, tokenizer, labels, pad_token_label_id, mode=mode)
-
+        args, tokenizer, all_labels, pad_token_label_id, mode=mode)
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(
         1, paddle.distributed.get_world_size())
     eval_dataloader = paddle.io.DataLoader(
         eval_dataset,
         batch_size=args.eval_batch_size,
         collate_fn=None, )
 
-    # Eval!
+    # Eval
     logger.info("***** Running evaluation %s *****", prefix)
     logger.info("  Num examples = %d", len(eval_dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
@@ -222,33 +219,29 @@
             inputs = {
                 "input_ids": batch[0],
                 "attention_mask": batch[1],
-                "labels": batch[3],
+                "token_type_ids": batch[2],
+                "bbox": batch[4],
             }
-            if args.model_type in ["layoutlm"]:
-                inputs["bbox"] = batch[4]
-            inputs["token_type_ids"] = (
-                batch[2] if args.model_type in ["bert", "layoutlm"] else
-                None)  # RoBERTa don"t use segment_ids
-            outputs = model(**inputs)
-            tmp_eval_loss, logits = outputs[:2]
-
+            labels = batch[3]
+            attention_mask = batch[1]
+            logits = model(**inputs)
+            tmp_eval_loss = loss_fct(
+                logits.reshape([-1, len(all_labels)]), labels.reshape([-1, ]))
             tmp_eval_loss = tmp_eval_loss.mean()
-
             eval_loss += tmp_eval_loss.item()
+
             nb_eval_steps += 1
             if preds is None:
                 preds = logits.numpy()
-                out_label_ids = inputs["labels"].numpy()
+                out_label_ids = labels.numpy()
             else:
                 preds = np.append(preds, logits.numpy(), axis=0)
-                out_label_ids = np.append(
-                    out_label_ids, inputs["labels"].numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, labels.numpy(), axis=0)
 
     eval_loss = eval_loss / nb_eval_steps
     preds = np.argmax(preds, axis=2)
 
-    label_map = {i: label for i, label in enumerate(labels)}
-
+    label_map = {i: label for i, label in enumerate(all_labels)}
     out_label_list = [[] for _ in range(out_label_ids.shape[0])]
     preds_list = [[] for _ in range(out_label_ids.shape[0])]
 
@@ -272,7 +265,7 @@
     for key in sorted(results.keys()):
         logger.info("  %s = %s", key, str(results[key]))
 
-    return results, preds_list
+    return results, preds
 
 
 if __name__ == "__main__":
```

examples/multimodal/layoutlm/train_funsd.sh

Lines changed: 0 additions & 1 deletion
```diff
@@ -2,7 +2,6 @@ export CUDA_VISIBLE_DEVICES=7
 
 python3.7 train_funsd.py \
     --data_dir "./data/" \
-    --model_type "layoutlm" \
     --model_name_or_path "layoutlm-base-uncased" \
     --do_lower_case \
     --max_seq_length 512 \
```

examples/multimodal/layoutlm/utils.py

Lines changed: 0 additions & 5 deletions
```diff
@@ -21,11 +21,6 @@ def parse_args():
         required=True,
         help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
     )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True, )
     parser.add_argument(
         "--model_name_or_path",
         default=None,
```
