from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

+# First Party
+from tests.artifacts.testdata import CUSTOM_TOKENIZER_TINYLLAMA
+
# Local
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

@@ -106,6 +109,60 @@ def test_resize_with_special_tokens():
    assert output is not None


+def test_special_tokens_before_and_after():
+    """Test that newly added special tokens do not replace existing ones."""
+    input_text = INPUT_TEXT
+    tokenizer = AutoTokenizer.from_pretrained(CUSTOM_TOKENIZER_TINYLLAMA)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    input_tokenizer_len = len(tokenizer.get_vocab())
+    addn_spl_tokens_before = tokenizer.special_tokens_map.get(
+        "additional_special_tokens"
+    )
+    assert (
+        len(addn_spl_tokens_before) > 0
+    ), "this test requires the tokenizer to already have additional special tokens"
+
+    special_tokens_dict = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
+    addn_spl_tokens_added = ["<NotSeenTokenA>", "<NotSeenTokenB>", "<NotSeenTokenC>"]
+    special_tokens_dict["additional_special_tokens"] = addn_spl_tokens_added
+
+    resize_result = tokenizer_and_embedding_resize(
+        special_tokens_dict=special_tokens_dict,
+        tokenizer=tokenizer,
+        model=model,
+        multiple_of=1,
+    )
+
+    output_tokenizer_len = len(tokenizer.get_vocab())
+    addn_spl_tokens_before.extend(addn_spl_tokens_added)
+    expected_addn_special_tokens = addn_spl_tokens_before
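+    # <SEP> and <PAD> are also new to this tokenizer, hence the extra "+ 2"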
+    expected_embedding_size = input_tokenizer_len + len(addn_spl_tokens_added) + 2
+    addn_spl_tokens_after = tokenizer.special_tokens_map.get(
+        "additional_special_tokens"
+    )
+
+    assert "<SEP>" in tokenizer.get_vocab()
+    assert "<PAD>" in tokenizer.get_vocab()
+    assert output_tokenizer_len == expected_embedding_size
+    assert resize_result["num_new_tokens"] == output_tokenizer_len - input_tokenizer_len
+    assert resize_result["new_embedding_size"] == expected_embedding_size
+
+    assert len(addn_spl_tokens_after) == len(
+        expected_addn_special_tokens
+    ), "additional special tokens after resize must equal the original tokens plus the added ones"
+
+    for tok in expected_addn_special_tokens:
+        assert (
+            tok in addn_spl_tokens_after
+        ), "expected additional special token is missing from the tokenizer"
+
+    output = _inference(
+        tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
+    )
+    assert output is not None
+
+
def test_no_resize_when_no_special_tokens():
    input_text = INPUT_TEXT
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)