
Commit a86a6ae (1 parent: 42abc74)

Disable unhelpful truncation of long sequences

1 file changed: +9, -1 lines

tibert/bertcoref.py

Lines changed: 9 additions & 1 deletion
@@ -19,6 +19,7 @@
 from transformers.models.camembert.modeling_camembert import CamembertModel
 from transformers.models.camembert.configuration_camembert import CamembertConfig
 from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
+from transformers.utils import logging as transformers_logging
 from tqdm import tqdm
 from tibert.utils import spans_indexs, batch_index_select, spans

@@ -131,7 +132,14 @@ def prepared_document(
         """
         # (silly) example for the tokens ["I", "am", "PG"]
         # a BertTokenizer would produce ["[CLS]", "I", "am", "P", "##G", "[SEP]"]
-        batch = tokenizer(self.tokens, is_split_into_words=True, truncation=True)  # type: ignore
+        # NOTE: we disable tokenizer warnings to avoid a length
+        # warning. Usually, sequences should be truncated to a max
+        # length (512 for BERT). However, in our case, the sequence is
+        # later cut into segments of configurable size, so this does
+        # not apply (see BertForCoreferenceResolutionConfig.segment_size)
+        transformers_logging.set_verbosity_error()
+        batch = tokenizer(self.tokens, is_split_into_words=True)
+        transformers_logging.set_verbosity_info()
         tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"])  # type: ignore

         # words_ids is used to correspond post-tokenization word pieces
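
Not part of the commit: a minimal sketch of how the same warning suppression could be scoped with a context manager that restores whatever verbosity was previously set, instead of unconditionally switching back to INFO as the patch does. The helper name quiet_transformers is hypothetical; it relies only on the get_verbosity, set_verbosity and set_verbosity_error helpers from transformers.utils.logging.

from contextlib import contextmanager
from transformers.utils import logging as transformers_logging

@contextmanager
def quiet_transformers():
    # Hypothetical helper (not in the commit): silence transformers
    # logging for the duration of the block, then restore the
    # previous verbosity rather than assuming it was INFO.
    previous = transformers_logging.get_verbosity()
    transformers_logging.set_verbosity_error()
    try:
        yield
    finally:
        transformers_logging.set_verbosity(previous)

# Example usage around the tokenizer call from the diff:
# with quiet_transformers():
#     batch = tokenizer(self.tokens, is_split_into_words=True)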
