Commit 9d9b00b

[Bug Fix] update tokenizer utils (#3204)

* update tokenizer utils
* update example
* remove debug code
* test=document_fix
1 parent ceded4b commit 9d9b00b
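
The change is the same across the touched examples: inputs that are already tokenized now pass is_split_into_words='token' instead of True. Judging from the updated tokenizer utils, 'token' tells the tokenizer that each list element is a final token that must map to exactly one position in input_ids, which keeps per-token NER labels aligned, while True now means the input is split into words that may still be broken into subwords. A minimal sketch of the fixed call (assumes paddlenlp is installed and the "ernie-1.0" weights can be downloaded; the token list is illustrative):

# Hedged sketch: pre-tokenized input with the fixed is_split_into_words='token'.
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-1.0")

tokens = ["王", "小", "明", "在", "北", "京"]  # upstream code has one label per token
encoded = tokenizer(tokens,
                    is_split_into_words='token',
                    return_length=True)

# '[CLS]' + tokens + '[SEP]': every input token occupies exactly one position,
# so a label sequence padded with 'O' at both ends stays aligned one-to-one.
assert encoded["seq_len"] == len(tokens) + 2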

File tree

14 files changed: +27 -15 lines changed

examples/benchmark/clue/mrc/run_c3.py (2 additions & 1 deletion)

@@ -258,7 +258,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):

        new_data = tokenizer(tokens_t_list,
                             text_pair=tokens_c_list,
-                            is_split_into_words=True)
+                            is_split_into_words='token')

        # Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
        # because length of each choice could be different.
@@ -305,6 +305,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
                                remove_columns=column_names,
                                load_from_cache_file=not args.overwrite_cache,
                                desc="Running tokenizer on train dataset")
+
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
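
In the multiple-choice case above, both text and text_pair are lists of pre-tokenized inputs, one (text, text_pair) entry per answer choice. A hedged sketch of the fixed call; tokens_t_list and tokens_c_list here are toy stand-ins for the script's question-plus-context and candidate-answer tokens:

from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-1.0")

# Toy stand-ins: one (text, text_pair) entry per answer choice.
tokens_t_list = [["今", "天", "天", "气"], ["今", "天", "天", "气"]]
tokens_c_list = [["好"], ["不", "好"]]

# With 'token', no element is split further, so each choice encodes to a
# [CLS] text [SEP] text_pair [SEP] sequence of predictable length.
new_data = tokenizer(tokens_t_list,
                     text_pair=tokens_c_list,
                     is_split_into_words='token')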

examples/information_extraction/DuEE/sequence_labeling.py (1 addition & 1 deletion)

@@ -98,7 +98,7 @@ def convert_example_to_feature(example,
    tokens, labels = example
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True,
+                               is_split_into_words='token',
                                max_seq_len=max_seq_len)

    input_ids = tokenized_input['input_ids']

examples/information_extraction/msra_ner/eval.py (1 addition & 1 deletion)

@@ -56,7 +56,7 @@ def tokenize_and_align_labels(examples):
        examples['tokens'],
        max_seq_len=args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
        return_length=True)
    labels = []

examples/information_extraction/msra_ner/predict.py (1 addition & 1 deletion)

@@ -86,7 +86,7 @@ def tokenize_and_align_labels(examples):
        examples['tokens'],
        max_seq_len=args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
        return_length=True)
    labels = []

examples/information_extraction/msra_ner/train.py (1 addition & 1 deletion)

@@ -105,7 +105,7 @@ def tokenize_and_align_labels(examples):
        examples['tokens'],
        max_seq_len=args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
        return_length=True)
    labels = []

examples/information_extraction/waybill_ie/deploy/python/predict.py (1 addition & 1 deletion)

@@ -116,7 +116,7 @@ def convert_to_features(example, tokenizer):
    tokens = example[0]
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True)
+                               is_split_into_words='token')
    # Token '[CLS]' and '[SEP]' will get label 'O'
    return tokenized_input['input_ids'], tokenized_input[
        'token_type_ids'], tokenized_input['seq_len']

examples/information_extraction/waybill_ie/run_ernie.py (1 addition & 1 deletion)

@@ -40,7 +40,7 @@ def convert_to_features(example, tokenizer, label_vocab):
    tokens, labels = example
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True)
+                               is_split_into_words='token')
    # Token '[CLS]' and '[SEP]' will get label 'O'
    labels = ['O'] + labels + ['O']
    tokenized_input['labels'] = [label_vocab[x] for x in labels]

examples/information_extraction/waybill_ie/run_ernie_crf.py (1 addition & 1 deletion)

@@ -41,7 +41,7 @@ def convert_to_features(example, tokenizer, label_vocab):
    tokens, labels = example
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True)
+                               is_split_into_words='token')
    # Token '[CLS]' and '[SEP]' will get label 'O'
    labels = ['O'] + labels + ['O']
    tokenized_input['labels'] = [label_vocab[x] for x in labels]
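
The waybill_ie scripts rely on the one-position-per-token guarantee when they extend the label sequence for the special tokens. A self-contained sketch of the pattern in the two hunks above; label_vocab and the example tokens are toy assumptions, not from the repo:

from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-1.0")
label_vocab = {"O": 0, "B-PER": 1, "I-PER": 2}  # toy label vocabulary

tokens = ["王", "明"]
labels = ["B-PER", "I-PER"]

tokenized_input = tokenizer(tokens,
                            return_length=True,
                            is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O', mirroring the diff above.
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]

# With 'token', input_ids and labels have the same length.
assert len(tokenized_input['labels']) == tokenized_input['seq_len']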

examples/sentiment_analysis/skep/predict_opinion.py (1 addition & 1 deletion)

@@ -67,7 +67,7 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    tokens = example["tokens"]
    encoded_inputs = tokenizer(tokens,
                               return_length=True,
-                              is_split_into_words=True,
+                              is_split_into_words='token',
                               max_seq_len=max_seq_length)
    input_ids = np.array(encoded_inputs["input_ids"], dtype="int64")
    token_type_ids = np.array(encoded_inputs["token_type_ids"], dtype="int64")

examples/text_to_knowledge/ernie-ctm/data.py (1 addition & 1 deletion)

@@ -37,7 +37,7 @@ def convert_example(example,
    tokens = example["tokens"]
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True,
+                               is_split_into_words='token',
                                max_seq_len=max_seq_len)

    if is_test:
