1 | | -# Third party |
2 | 1 | # Third Party |
3 | 2 | from transformers import AutoModelForCausalLM, AutoTokenizer |
4 | 3 |
5 | 4 | # First Party |
6 | 5 | from tests.artifacts.testdata import MODEL_NAME |
7 | 6 |
8 | 7 | # Local |
9 | | -# First party |
10 | | -from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize |
| 8 | +from tuning.config import configs |
| 9 | +from tuning.utils.tokenizer_data_utils import ( |
| 10 | + get_special_tokens_dict, |
| 11 | + tokenizer_and_embedding_resize, |
| 12 | +) |
11 | 13 |
12 | 14 |
13 | | -def test_tokenizer_and_embedding_resize_return_values(): |
14 | | - """Test to ensure number of added tokens are returned correctly""" |
| 15 | +def test_setting_special_tokens_with_LlamaTokenizerFast(): |
| 16 | + """ |
| 17 | + Unit test using a LlamaTokenizerFast tokenizer. This tokenizer is only missing a PAD token;
| 18 | + however, because it is a Llama tokenizer, the function automatically adds the BOS, EOS,
| 19 | + UNK and PAD tokens to the special tokens dict. The PAD entry defaults to <PAD> because
| 20 | + the Llama tokenizer does not have a pad token specified.
| 21 | + """ |
| 22 | + tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True) |
| 23 | + model_args = configs.ModelArguments() |
| 24 | + special_tokens_dict = get_special_tokens_dict( |
| 25 | + tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer |
| 26 | + ) |
| 27 | + assert special_tokens_dict == { |
| 28 | + "bos_token": "<s>", |
| 29 | + "eos_token": "</s>", |
| 30 | + "unk_token": "<unk>", |
| 31 | + "pad_token": "<PAD>", |
| 32 | + } |
| 33 | + |
| 34 | + |
| 35 | +def test_setting_special_tokens_with_GPT2TokenizerFast(): |
| 36 | + """ |
| 37 | + Unit test using a GPT2TokenizerFast tokenizer. This covers the case where the EOS and
| 38 | + PAD tokens are identical (both <|endoftext|>). The pad token in the tokenizer is therefore
| 39 | + reset to <PAD>, and "pad_token": "<PAD>" is added to the special tokens dict.
| 40 | + """ |
| 41 | + tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base") |
| 42 | + model_args = configs.ModelArguments() |
| 43 | + special_tokens_dict = get_special_tokens_dict( |
| 44 | + tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer |
| 45 | + ) |
| 46 | + assert special_tokens_dict == { |
| 47 | + "pad_token": "<PAD>", |
| 48 | + } |
| 49 | + |
| 50 | + |
| 51 | +def test_setting_special_tokens_with_GPTNeoXTokenizerFast(): |
| 52 | + """ |
| 53 | + Unit test using a GPTNeoXTokenizerFast tokenizer. This is another tokenizer class that
| 54 | + the function special-cases, adding just a pad token to the special tokens dict. Because
| 55 | + the tokenizer itself is missing a pad token, the function falls back to the
| 56 | + default <PAD> token.
| 57 | + """ |
| 58 | + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") |
| 59 | + model_args = configs.ModelArguments() |
| 60 | + special_tokens_dict = get_special_tokens_dict( |
| 61 | + tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer |
| 62 | + ) |
| 63 | + assert special_tokens_dict == { |
| 64 | + "pad_token": "<PAD>", |
| 65 | + } |
| 66 | + |
| 67 | + |
| 68 | +def test_setting_special_tokens_when_missing_all_special_tokens(): |
| 69 | + """ |
| 70 | + Unit test using the GPT2TokenizerFast tokenizer. All the special tokens have been |
| 71 | + removed from the tokenizer, so we expect all of them to appear in the special tokens dict. |
| 72 | + """ |
| 73 | + tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base") |
| 74 | + |
| 75 | + # Set all special tokens to None |
| 76 | + tokenizer.bos_token = None |
| 77 | + tokenizer.eos_token = None |
| 78 | + tokenizer.unk_token = None |
| 79 | + tokenizer.pad_token = None |
| 80 | + |
| 81 | + model_args = configs.ModelArguments() |
| 82 | + special_tokens_dict = get_special_tokens_dict( |
| 83 | + tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer |
| 84 | + ) |
| 85 | + assert special_tokens_dict == { |
| 86 | + "pad_token": "<PAD>", |
| 87 | + "eos_token": "</s>", |
| 88 | + "bos_token": "<s>", |
| 89 | + "unk_token": "<unk>", |
| 90 | + } |
| 91 | + |
| 92 | + |
| 93 | +def test_setting_special_tokens_when_path_is_not_none(): |
| 94 | + """ |
| 95 | + A simple unit test that sets the `tokenizer_name_or_path` argument in |
| 96 | + `model_args` to a non-None value. Since the argument is not None, almost
| 97 | + the entire `get_special_tokens_dict` function is skipped and the |
| 98 | + special tokens dict is expected to be empty. |
| 99 | + """ |
| 100 | + tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True) |
| 101 | + model_args = configs.ModelArguments(tokenizer_name_or_path="test_path") |
| 102 | + special_tokens_dict = get_special_tokens_dict( |
| 103 | + tokenizer_name_or_path=model_args.tokenizer_name_or_path, tokenizer=tokenizer |
| 104 | + ) |
| 105 | + # Assert special_tokens_dict is empty |
| 106 | + assert not special_tokens_dict |
| 107 | + |
| 108 | + |
| 109 | +def test_tokenizer_and_embedding_resize_return_values_missing_one_token(): |
| 110 | + """ |
| 111 | + Tests the resizing function when the special tokens dict contains a PAD token, |
| 112 | + which means the tokenizer is missing one special token. |
| 113 | +
| 114 | + `multiple_of` is set to 1.
| 115 | + """ |
15 | 116 | special_tokens_dict = {"pad_token": "<pad>"} |
16 | 117 | tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) |
17 | 118 | model = AutoModelForCausalLM.from_pretrained(MODEL_NAME) |
18 | 119 | metadata = tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model) |
19 | 120 | assert metadata["num_new_tokens"] == 1 |
20 | | - assert "new_embedding_size" in metadata |
| 121 | + assert metadata["new_embedding_size"] == len(tokenizer) |
| 122 | + |
| 123 | + |
| 124 | +def test_tokenizer_and_embedding_resize_return_values_missing_four_tokens(): |
| 125 | + """ |
| 126 | + Tests the resizing when the special tokens dict contains a PAD, EOS, BOS and UNK token, |
| 127 | + which means the tokenizer is missing four special tokens. |
| 128 | +
| 129 | + `multiple_of` is set to 1.
| 130 | + """ |
| 131 | + special_tokens_dict = { |
| 132 | + "pad_token": "<PAD>", |
| 133 | + "eos_token": "</s>", |
| 134 | + "bos_token": "<s>", |
| 135 | + "unk_token": "<unk>", |
| 136 | + } |
| 137 | + tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True) |
| 138 | + model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0") |
| 139 | + metadata = tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model) |
| 140 | + assert metadata["num_new_tokens"] == 4 |
| 141 | + assert metadata["new_embedding_size"] == len(tokenizer) |
| 142 | + |
| 143 | + |
| 144 | +def test_tokenizer_and_embedding_resize_return_values_multiple_of_two():
| 145 | + """ |
| 146 | + Tests the resizing when the special tokens dict contains a PAD, EOS, BOS and UNK token, |
| 147 | + which means the tokenizer is missing four special tokens. |
| 148 | +
| 149 | + `multiple_of` is set to 2; this adds one to the count of num_new_tokens and
| 150 | + one to the count of new_embedding_size.
| 151 | + """ |
| 152 | + special_tokens_dict = { |
| 153 | + "pad_token": "<PAD>", |
| 154 | + "eos_token": "</s>", |
| 155 | + "bos_token": "<s>", |
| 156 | + "unk_token": "<unk>", |
| 157 | + } |
| 158 | + tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True) |
| 159 | + model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0") |
| 160 | + metadata = tokenizer_and_embedding_resize( |
| 161 | + special_tokens_dict, tokenizer, model, multiple_of=2 |
| 162 | + ) |
| 163 | + assert metadata["num_new_tokens"] == 5 |
| 164 | + assert metadata["new_embedding_size"] == len(tokenizer) + 1 |
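Editor's note: the `multiple_of=2` assertions above are easier to read with the rounding spelled out. The helper below is a minimal sketch of that arithmetic, not the repository's implementation of `tokenizer_and_embedding_resize`; the example vocabulary size is illustrative only.

```python
import math


def rounded_embedding_size(vocab_size: int, multiple_of: int) -> int:
    """Pad an embedding size up to the next multiple of `multiple_of` (sketch only)."""
    return multiple_of * math.ceil(vocab_size / multiple_of)


# Illustrative numbers (the real value comes from the TinyLLama tokenizer): if
# len(tokenizer) were 32001 after the special tokens are added, rounding up to a
# multiple of 2 gives 32002, i.e. one extra padding row. That extra row is why the
# asserts above expect num_new_tokens == 5 and new_embedding_size == len(tokenizer) + 1.
```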
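Stepping back, the first five tests in this diff collectively pin down the branching of `get_special_tokens_dict`. The function below is a hedged reconstruction from those assertions alone, not the code in `tuning.utils.tokenizer_data_utils`; the default token strings and the isinstance checks are assumptions inferred from the expected dicts.

```python
from transformers import GPTNeoXTokenizerFast, LlamaTokenizer, LlamaTokenizerFast

DEFAULT_BOS, DEFAULT_EOS, DEFAULT_UNK, DEFAULT_PAD = "<s>", "</s>", "<unk>", "<PAD>"


def get_special_tokens_dict_sketch(tokenizer_name_or_path, tokenizer):
    """Hypothetical reconstruction of the behaviour the tests above assert."""
    special_tokens_dict = {}
    if tokenizer_name_or_path is not None:
        # A custom tokenizer path was supplied: leave the special tokens untouched.
        return special_tokens_dict
    if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
        # Llama tokenizers get the full set declared, falling back to the defaults.
        special_tokens_dict = {
            "bos_token": tokenizer.bos_token or DEFAULT_BOS,
            "eos_token": tokenizer.eos_token or DEFAULT_EOS,
            "unk_token": tokenizer.unk_token or DEFAULT_UNK,
            "pad_token": tokenizer.pad_token or DEFAULT_PAD,
        }
    elif isinstance(tokenizer, GPTNeoXTokenizerFast):
        # GPT-NeoX tokenizers only get a pad token.
        special_tokens_dict["pad_token"] = tokenizer.pad_token or DEFAULT_PAD
    else:
        # Generic case: add only what is missing, and never reuse EOS as PAD.
        if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
            special_tokens_dict["pad_token"] = DEFAULT_PAD
        if tokenizer.eos_token is None:
            special_tokens_dict["eos_token"] = DEFAULT_EOS
        if tokenizer.bos_token is None:
            special_tokens_dict["bos_token"] = DEFAULT_BOS
        if tokenizer.unk_token is None:
            special_tokens_dict["unk_token"] = DEFAULT_UNK
    return special_tokens_dict
```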