File tree Expand file tree Collapse file tree 1 file changed +8
-5
lines changed
recipes/quickstart/finetuning/datasets Expand file tree Collapse file tree 1 file changed +8
-5
lines changed Original file line number Diff line number Diff line change 9
9
10
10
11
11
B_INST , E_INST = "[INST]" , "[/INST]"
12
+ EOT_ID = 128009 #<|eot_id|>
12
13
13
14
def tokenize_dialog (dialog , tokenizer ):
14
15
if tokenizer .vocab_size >= 128000 :
15
16
dialog_tokens = tokenizer .apply_chat_template (dialog )
16
- dialog_tokens = dialog_tokens [:- 4 ] # Remove generation prompt <|start_header_id|>assistant<|end_header_id|>\n\n
17
- eot_indices = [i for i ,n in enumerate (dialog_tokens ) if n == 128009 ]
17
+ eot_indices = [i for i ,n in enumerate (dialog_tokens ) if n == EOT_ID ]
18
18
labels = copy .copy (dialog_tokens )
19
+ # Determine the token ids that identify the system and user roles
20
+ system_or_user = (tokenizer .encode ("system" )[- 1 ], tokenizer .encode ("user" )[- 1 ])
19
21
last_idx = 0
20
22
for n , idx in enumerate (eot_indices ):
21
- if n % 2 == 1 :
22
- last_idx = idx
23
- else :
23
+ role_token = labels [ last_idx : idx + 1 ][ 2 ]
24
+ if role_token in system_or_user :
25
+ # Set labels to -100 for system and user tokens to ignore in loss function
24
26
labels [last_idx :idx + 1 ] = [- 100 ] * (idx - last_idx + 1 )
27
+ last_idx = idx
25
28
26
29
dialog_tokens = [dialog_tokens ]
27
30
labels_tokens = [labels ]
You can’t perform that action at this time.
0 commit comments