Commit 9d9b00b

[Bug Fix] update tokenizer utils (#3204)

* update tokenizer utils
* update example
* remove debug code
* test=document_fix
1 parent ceded4b commit 9d9b00b
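
The change is the same across the touched examples: inputs that are already tokenized now pass is_split_into_words='token' instead of True. Judging from the updated tokenizer utils, 'token' tells the tokenizer that each list element is a final token that must map to exactly one position in input_ids, which keeps per-token NER labels aligned, while True now means the input is split into words that may still be broken into subwords. A minimal sketch of the fixed call (assumes paddlenlp is installed and the "ernie-1.0" weights can be downloaded; the token list is illustrative):

# Hedged sketch: pre-tokenized input with the fixed is_split_into_words='token'.
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-1.0")

tokens = ["王", "小", "明", "在", "北", "京"]  # upstream code has one label per token
encoded = tokenizer(tokens,
                    is_split_into_words='token',
                    return_length=True)

# '[CLS]' + tokens + '[SEP]': every input token occupies exactly one position,
# so a label sequence padded with 'O' at both ends stays aligned one-to-one.
assert encoded["seq_len"] == len(tokens) + 2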

File tree

14 files changed: +27 -15 lines changed

examples/benchmark/clue/mrc/run_c3.py (2 additions & 1 deletion)

@@ -258,7 +258,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):

        new_data = tokenizer(tokens_t_list,
                             text_pair=tokens_c_list,
-                            is_split_into_words=True)
+                            is_split_into_words='token')

        # Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
        # because length of each choice could be different.
@@ -305,6 +305,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
                                remove_columns=column_names,
                                load_from_cache_file=not args.overwrite_cache,
                                desc="Running tokenizer on train dataset")
+
    batchify_fn = lambda samples, fn=Dict({
        'input_ids':
        Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
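
In the multiple-choice case above, both text and text_pair are lists of pre-tokenized inputs, one (text, text_pair) entry per answer choice. A hedged sketch of the fixed call; tokens_t_list and tokens_c_list here are toy stand-ins for the script's question-plus-context and candidate-answer tokens:

from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-1.0")

# Toy stand-ins: one (text, text_pair) entry per answer choice.
tokens_t_list = [["今", "天", "天", "气"], ["今", "天", "天", "气"]]
tokens_c_list = [["好"], ["不", "好"]]

# With 'token', no element is split further, so each choice encodes to a
# [CLS] text [SEP] text_pair [SEP] sequence of predictable length.
new_data = tokenizer(tokens_t_list,
                     text_pair=tokens_c_list,
                     is_split_into_words='token')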

examples/information_extraction/DuEE/sequence_labeling.py (1 addition & 1 deletion)

@@ -98,7 +98,7 @@ def convert_example_to_feature(example,
    tokens, labels = example
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True,
+                               is_split_into_words='token',
                                max_seq_len=max_seq_len)

    input_ids = tokenized_input['input_ids']

examples/information_extraction/msra_ner/eval.py (1 addition & 1 deletion)

@@ -56,7 +56,7 @@ def tokenize_and_align_labels(examples):
        examples['tokens'],
        max_seq_len=args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
        return_length=True)
    labels = []

examples/information_extraction/msra_ner/predict.py (1 addition & 1 deletion)

@@ -86,7 +86,7 @@ def tokenize_and_align_labels(examples):
        examples['tokens'],
        max_seq_len=args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
        return_length=True)
    labels = []

examples/information_extraction/msra_ner/train.py (1 addition & 1 deletion)

@@ -105,7 +105,7 @@ def tokenize_and_align_labels(examples):
        examples['tokens'],
        max_seq_len=args.max_seq_length,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
        return_length=True)
    labels = []

examples/information_extraction/waybill_ie/deploy/python/predict.py (1 addition & 1 deletion)

@@ -116,7 +116,7 @@ def convert_to_features(example, tokenizer):
    tokens = example[0]
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True)
+                               is_split_into_words='token')
    # Token '[CLS]' and '[SEP]' will get label 'O'
    return tokenized_input['input_ids'], tokenized_input[
        'token_type_ids'], tokenized_input['seq_len']

examples/information_extraction/waybill_ie/run_ernie.py (1 addition & 1 deletion)

@@ -40,7 +40,7 @@ def convert_to_features(example, tokenizer, label_vocab):
    tokens, labels = example
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True)
+                               is_split_into_words='token')
    # Token '[CLS]' and '[SEP]' will get label 'O'
    labels = ['O'] + labels + ['O']
    tokenized_input['labels'] = [label_vocab[x] for x in labels]

examples/information_extraction/waybill_ie/run_ernie_crf.py (1 addition & 1 deletion)

@@ -41,7 +41,7 @@ def convert_to_features(example, tokenizer, label_vocab):
    tokens, labels = example
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True)
+                               is_split_into_words='token')
    # Token '[CLS]' and '[SEP]' will get label 'O'
    labels = ['O'] + labels + ['O']
    tokenized_input['labels'] = [label_vocab[x] for x in labels]
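
The waybill_ie scripts rely on the one-position-per-token guarantee when they extend the label sequence for the special tokens. A self-contained sketch of the pattern in the two hunks above; label_vocab and the example tokens are toy assumptions, not from the repo:

from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-1.0")
label_vocab = {"O": 0, "B-PER": 1, "I-PER": 2}  # toy label vocabulary

tokens = ["王", "明"]
labels = ["B-PER", "I-PER"]

tokenized_input = tokenizer(tokens,
                            return_length=True,
                            is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O', mirroring the diff above.
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]

# With 'token', input_ids and labels have the same length.
assert len(tokenized_input['labels']) == tokenized_input['seq_len']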

examples/sentiment_analysis/skep/predict_opinion.py (1 addition & 1 deletion)

@@ -67,7 +67,7 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    tokens = example["tokens"]
    encoded_inputs = tokenizer(tokens,
                               return_length=True,
-                              is_split_into_words=True,
+                              is_split_into_words='token',
                               max_seq_len=max_seq_length)
    input_ids = np.array(encoded_inputs["input_ids"], dtype="int64")
    token_type_ids = np.array(encoded_inputs["token_type_ids"], dtype="int64")

examples/text_to_knowledge/ernie-ctm/data.py (1 addition & 1 deletion)

@@ -37,7 +37,7 @@ def convert_example(example,
    tokens = example["tokens"]
    tokenized_input = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True,
+                               is_split_into_words='token',
                                max_seq_len=max_seq_len)

    if is_test:
