@@ -19,6 +19,7 @@
 from transformers.models.camembert.modeling_camembert import CamembertModel
 from transformers.models.camembert.configuration_camembert import CamembertConfig
 from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
+from transformers.utils import logging as transformers_logging
 from tqdm import tqdm
 from tibert.utils import spans_indexs, batch_index_select, spans
 
@@ -131,7 +132,14 @@ def prepared_document(
         """
         # (silly) example for the tokens ["I", "am", "PG"]
         # a BertTokenizer would produce ["[CLS]", "I", "am", "P", "##G", "[SEP]"]
-        batch = tokenizer(self.tokens, is_split_into_words=True, truncation=True)  # type: ignore
+        # NOTE: we disable tokenizer warnings to avoid a sequence
+        # length warning. Usually, sequences should be truncated to
+        # a max length (512 for BERT). However, in our case, the
+        # sequence is later cut into segments of configurable size, so
+        # this does not apply (see BertForCoreferenceResolutionConfig.segment_size)
+        transformers_logging.set_verbosity_error()
+        batch = tokenizer(self.tokens, is_split_into_words=True)
+        transformers_logging.set_verbosity_info()
         tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"])  # type: ignore
 
         # words_ids is used to correspond post-tokenization word pieces
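One caveat with the pattern in this diff: restoring verbosity with set_verbosity_info() assumes the previous level was info, and would clobber any custom level a caller had set. Below is a minimal sketch of a save-and-restore variant using a context manager; the quiet_transformers name is hypothetical, not part of this commit.

from contextlib import contextmanager

from transformers.utils import logging as transformers_logging


@contextmanager
def quiet_transformers():
    # Remember the caller's verbosity so it can be restored exactly,
    # rather than assuming it was "info"
    prev_verbosity = transformers_logging.get_verbosity()
    transformers_logging.set_verbosity_error()
    try:
        yield
    finally:
        transformers_logging.set_verbosity(prev_verbosity)


# Usage: warnings are silenced only inside the with-block
# with quiet_transformers():
#     batch = tokenizer(tokens, is_split_into_words=True)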
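For context on the words_ids mapping mentioned in the last context line, here is a small illustration of BatchEncoding.word_ids() with a fast tokenizer; the "bert-base-cased" checkpoint is an arbitrary example matching the BertTokenizer comment above, not necessarily the model used here.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
batch = tokenizer(["I", "am", "PG"], is_split_into_words=True)

print(tokenizer.convert_ids_to_tokens(batch["input_ids"]))
# e.g. ['[CLS]', 'I', 'am', 'P', '##G', '[SEP]'] if "PG" is split in two

print(batch.word_ids())
# e.g. [None, 0, 1, 2, 2, None]: each word piece maps back to the index
# of the word it came from, with None for special tokens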