import logging
import os

import numpy as np
import paddle
from paddle.io import Dataset

logger = logging.getLogger(__name__)

class FunsdDataset(Dataset):
    def __init__(self, args, tokenizer, labels, pad_token_label_id, mode):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf.
            # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # pad on the left for xlnet
            pad_on_left=bool(args.model_type in ["xlnet"]),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
            model_type=args.model_type)

        self.features = features
        # Convert to Tensors and build dataset
        self.all_input_ids = paddle.to_tensor(
            [f.input_ids for f in features], dtype="int64")
        self.all_input_mask = paddle.to_tensor(
            [f.input_mask for f in features], dtype="int64")
        self.all_segment_ids = paddle.to_tensor(
            [f.segment_ids for f in features], dtype="int64")
        self.all_label_ids = paddle.to_tensor(
            [f.label_ids for f in features], dtype="int64")
        self.all_bboxes = paddle.to_tensor(
            [f.boxes for f in features], dtype="int64")

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return (
            self.all_input_ids[index],
            self.all_input_mask[index],
            self.all_segment_ids[index],
            self.all_label_ids[index],
            self.all_bboxes[index], )

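# Usage sketch (illustrative, not part of the original file): `args`,
# `tokenizer`, and `labels` are assumed to come from the surrounding training
# script, and `pad_token_label_id` is typically the index the loss ignores
# (e.g. the ignore_index passed to paddle.nn.CrossEntropyLoss):
#
#     from paddle.io import DataLoader
#
#     train_dataset = FunsdDataset(args, tokenizer, labels,
#                                  pad_token_label_id=-100, mode="train")
#     train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
#     for input_ids, input_mask, segment_ids, label_ids, bboxes in train_loader:
#         ...  # forward/backward pass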


class InputExample(object):
    """A single training/test example for token classification."""

    def __init__(self, guid, words, labels, boxes, actual_bboxes, file_name,
                 page_size):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence.
                This should be specified for train and dev examples, but not
                for test examples.
            boxes: list. The 0-1000 normalized bounding box of each word.
            actual_bboxes: list. The actual (pixel) bounding box of each word.
            file_name: Name of the image file the example comes from.
            page_size: [width, height] of the page image.
        """
        self.guid = guid
        self.words = words
        self.labels = labels
        self.boxes = boxes
        self.actual_bboxes = actual_bboxes
        self.file_name = file_name
        self.page_size = page_size


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(
            self,
            input_ids,
            input_mask,
            segment_ids,
            label_ids,
            boxes,
            actual_bboxes,
            file_name,
            page_size, ):
        # Check every coordinate of every box; `0 <= all(boxes) <= 1000` would
        # only compare a boolean against the bounds.
        for box in boxes:
            assert all(
                0 <= coord <= 1000 for coord in box
            ), "Error with input bbox ({}): the coordinate value is not between 0 and 1000".format(
                box)
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids
        self.boxes = boxes
        self.actual_bboxes = actual_bboxes
        self.file_name = file_name
        self.page_size = page_size
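# Note: the boxes consumed here follow the LayoutLM convention of pixel
# coordinates scaled into [0, 1000]. A helper like the one below is commonly
# used to produce them (a sketch; `normalize_bbox` is not part of this file):
#
#     def normalize_bbox(bbox, width, height):
#         return [int(1000 * bbox[0] / width), int(1000 * bbox[1] / height),
#                 int(1000 * bbox[2] / width), int(1000 * bbox[3] / height)]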


def read_examples_from_file(data_dir, mode):
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    box_file_path = os.path.join(data_dir, "{}_box.txt".format(mode))
    image_file_path = os.path.join(data_dir, "{}_image.txt".format(mode))
    guid_index = 1
    examples = []
    with open(
            file_path, encoding="utf-8") as f, open(
                box_file_path, encoding="utf-8") as fb, open(
                    image_file_path, encoding="utf-8") as fi:
        words = []
        boxes = []
        actual_bboxes = []
        file_name = None
        page_size = None
        labels = []
        for line, bline, iline in zip(f, fb, fi):
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(
                        InputExample(
                            guid="{}-{}".format(mode, guid_index),
                            words=words,
                            labels=labels,
                            boxes=boxes,
                            actual_bboxes=actual_bboxes,
                            file_name=file_name,
                            page_size=page_size, ))
                    guid_index += 1
                    words = []
                    boxes = []
                    actual_bboxes = []
                    file_name = None
                    page_size = None
                    labels = []
            else:
                splits = line.split("\t")
                bsplits = bline.split("\t")
                isplits = iline.split("\t")
                # The label column may be absent for mode = "test".
                assert len(splits) in (1, 2)
                assert len(bsplits) == 2
                assert len(isplits) == 4
                assert splits[0] == bsplits[0]
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
                # Boxes, page size and file name are collected for every word,
                # labeled or not, so they stay aligned with `words`.
                box = bsplits[-1].replace("\n", "")
                box = [int(b) for b in box.split()]
                boxes.append(box)
                actual_bbox = [int(b) for b in isplits[1].split()]
                actual_bboxes.append(actual_bbox)
                page_size = [int(i) for i in isplits[2].split()]
                file_name = isplits[3].strip()
        if words:
            examples.append(
                InputExample(
                    guid="{}-{}".format(mode, guid_index),
                    words=words,
                    labels=labels,
                    boxes=boxes,
                    actual_bboxes=actual_bboxes,
                    file_name=file_name,
                    page_size=page_size, ))
    return examples

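# Expected on-disk layout, inferred from the parsing above (sample values are
# illustrative): each line of {mode}.txt is "word<TAB>label", each line of
# {mode}_box.txt is "word<TAB>x0 y0 x1 y1" with coordinates normalized to
# 0-1000, and each line of {mode}_image.txt is
# "word<TAB>x0 y0 x1 y1<TAB>width height<TAB>file_name" with pixel
# coordinates. A blank line (or "-DOCSTART-") ends a document, e.g.:
#
#     train.txt:        Date:    B-QUESTION
#     train_box.txt:    Date:    503 91 648 111
#     train_image.txt:  Date:    383 69 494 85    762 1000    00040534.png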


def convert_examples_to_features(examples,
                                 label_list,
                                 max_seq_length,
                                 tokenizer,
                                 cls_token_at_end=False,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=1,
                                 sep_token="[SEP]",
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 cls_token_box=[0, 0, 0, 0],
                                 sep_token_box=[1000, 1000, 1000, 1000],
                                 pad_token_box=[0, 0, 0, 0],
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-1,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True,
                                 model_type="bert"):
    """Loads a data file into a list of `InputFeatures`.

    `cls_token_at_end` defines the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated to the CLS token
    (0 for BERT, 2 for XLNet).
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        file_name = example.file_name
        page_size = example.page_size
        width, height = page_size
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        token_boxes = []
        actual_bboxes = []
        label_ids = []
        for word, label, box, actual_bbox in zip(
                example.words, example.labels, example.boxes,
                example.actual_bboxes):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            token_boxes.extend([box] * len(word_tokens))
            actual_bboxes.extend([actual_bbox] * len(word_tokens))
            # Use the real label id for the first token of the word, and
            # padding ids for the remaining tokens
            label_ids.extend([label_map[label]] + [pad_token_label_id] *
                             (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[:(max_seq_length - special_tokens_count)]
            token_boxes = token_boxes[:(max_seq_length - special_tokens_count)]
            actual_bboxes = actual_bboxes[:(max_seq_length -
                                            special_tokens_count)]
            label_ids = label_ids[:(max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0      0   0   1  1  1  1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0`
        # and `type=1` were learned during pre-training and are added to the
        # wordpiece embedding vector (and position vector). This is not
        # *strictly* necessary since the [SEP] token unambiguously separates
        # the sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS])
        # is used as the "sentence vector". Note that this only makes sense
        # because the entire model is fine-tuned.
        tokens += [sep_token]
        token_boxes += [sep_token_box]
        actual_bboxes += [[0, 0, width, height]]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            token_boxes += [sep_token_box]
            actual_bboxes += [[0, 0, width, height]]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            token_boxes += [cls_token_box]
            actual_bboxes += [[0, 0, width, height]]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            token_boxes = [cls_token_box] + token_boxes
            actual_bboxes = [[0, 0, width, height]] + actual_bboxes
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] *
                          padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length
                           ) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
            token_boxes = ([pad_token_box] * padding_length) + token_boxes
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length
            token_boxes += [pad_token_box] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(token_boxes) == max_seq_length

        # Models other than LayoutLM take the attention mask pre-reshaped to
        # float32 of shape [1, 1, seq_length].
        if model_type != "layoutlm":
            input_mask = np.array(input_mask)
            input_mask = np.reshape(
                input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_ids=label_ids,
                boxes=token_boxes,
                actual_bboxes=actual_bboxes,
                file_name=file_name,
                page_size=page_size, ))
    return features
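# Worked example of the label alignment performed above (the subword split is
# hypothetical): a word "Jacksonville" labeled "B-ANSWER" that tokenizes to
# ["Jack", "##son", "##ville"] contributes
#
#     tokens      : "Jack"                  "##son"             "##ville"
#     label_ids   : label_map["B-ANSWER"]   pad_token_label_id  pad_token_label_id
#     token_boxes : the same word box, repeated three times
#
# so the loss is computed only once per word, on its first subtoken.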