@@ -51,7 +51,9 @@ def _get_header_conversation_type_mask_role(source, special_tokens):
5151 if TYPE_INSTRUCTION [data_type ] != '' :
5252 conversation = conversation + '\n ' + TYPE_INSTRUCTION [data_type ]
5353 mask_role = source .get ('mask' , 'User' )
54- header = f"{ special_tokens ['system_turn_start' ]} { SYSTEM_TOKEN } { END_NAME_SIGNAL } { conversation } { END_SIGNAL } "
54+ system_token = source .get ("system_token" , SYSTEM_TOKEN )
55+ header = f"{ special_tokens ['system_turn_start' ]} { system_token } { END_NAME_SIGNAL } { conversation } { END_SIGNAL } "
56+ # logging.info(f"DBG HEADER:\n```{header}```")
5557 conversation = _add_speaker_and_signal (header , source ['conversations' ], mask_role , data_type , special_tokens )
5658 return header , conversation , data_type , mask_role
5759
@@ -60,13 +62,14 @@ def get_prompt_template_example(special_tokens):
6062 source = {
6163 'system' : '{system message}' ,
6264 'conversations' : [
63- {'from' : 'User ' , 'value' : '{turn 1 user message}' , 'label' : None },
64- {'from' : 'Assistant ' , 'value' : '{turn 1 assistant message}' , 'label' : '{turn 1 assistant label}' },
65- {'from' : 'User ' , 'value' : '{turn 2 user message}' , 'label' : None },
66- {'from' : 'Assistant ' , 'value' : '{turn 2 assistant message}' , 'label' : '{turn 2 assistant label}' },
65+ {'from' : '{user role} ' , 'value' : '{turn 1 user message}' , 'label' : None },
66+ {'from' : '{assistant role} ' , 'value' : '{turn 1 assistant message}' , 'label' : '{turn 1 assistant label}' },
67+ {'from' : '{user role} ' , 'value' : '{turn 2 user message}' , 'label' : None },
68+ {'from' : '{assistant role} ' , 'value' : '{turn 2 assistant message}' , 'label' : '{turn 2 assistant label}' },
6769 ],
68- "mask" : "User " ,
70+ "mask" : "{user role} " ,
6971 "type" : "VALUE_TO_TEXT" ,
72+ "system_token" : '{system token}' ,
7073 }
7174 _ , conversation , _ , _ = _get_header_conversation_type_mask_role (source , special_tokens )
7275 return conversation
@@ -273,6 +276,7 @@ def preprocess(
273276 id1 = tokenizer .text_to_ids (PREFIX_STR + s ["value" ])
274277 id2 = tokenizer .text_to_ids (PREFIX_STR )
275278 tokenized_sentence = id1 [len (id2 ) :]
279+ # logging.info(f"CONV DBG: {tokenized_sentence[0:20]} ... {tokenized_sentence[-20:]}")
276280 ids .append (torch .tensor (tokenized_sentence ))
277281 tokenized_lens .append (len (tokenized_sentence ))
278282 speakers = [sentence ["from" ] for sentence in source ['conversations' ]]
@@ -326,6 +330,8 @@ def _build_samples_mapping(self):
326330 id2 = self .tokenizer .text_to_ids (PREFIX_STR )
327331 self .num_turn_start_tokens = len (id1 ) - len (id2 )
328332
333+ # logging.info(f"DATASET DBG:\n{self.special_tokens=}\n{self.label_start_tokens=}, {self.name_end_token_ids=}, {self.num_turn_start_tokens=}")
334+
329335 def _process_example (self , example ):
330336 """
331337 Create an example by concatenating text and answer.
0 commit comments