-def prepare_data(data: list[str], tokenizer, max_seq_length: int = 1024, prompt_length: int = 1):
+def prepare_data(data_0: list[str], tokenizer_0, max_seq_length: int = 1024, prompt_length: int = 1):
     """
     Prepares tokenized input sequences and corresponding labels for training the Cerebros
     [not so] large language model.
@@ -25,7 +25,7 @@ def prepare_data(data: list[str], tokenizer, max_seq_length: int = 1024, prompt_

     Parameters:
     -----------
-    data : list of str
+    data_0 : list of str
         List of input text samples to be processed.
     max_seq_length : int, optional, default = 1024
         Maximum sequence length for input tensors. Sequences longer than this will be truncated,
@@ -58,14 +58,14 @@ def prepare_data(data: list[str], tokenizer, max_seq_length: int = 1024, prompt_
     pad_token_id = tokenizer.pad_token_id

     # Tokenize all data at once for efficiency
-    tokenized_data = tokenizer(
-        data,
+    tokenized_data = tokenizer_0(
+        data_0,
         max_length=max_seq_length,
         padding='max_length',
         truncation=True,
         add_special_tokens=False  # We'll handle special tokens manually
     )
-    vocab_size = len(tokenizer)
+    vocab_size = len(tokenizer_0)

     # Get the token ID for </prompt>
     end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
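
For reference, a minimal sketch of how the renamed pieces above might be exercised with a Hugging Face-style tokenizer. The `gpt2` checkpoint, the sample text, and the `</prompt>` special-token registration are illustrative assumptions, not part of this change:

```python
# Illustrative sketch only; the checkpoint, sample text, and special-token
# registration below are assumptions, not part of this diff.
from transformers import AutoTokenizer

tokenizer_0 = AutoTokenizer.from_pretrained("gpt2")
tokenizer_0.pad_token = tokenizer_0.eos_token  # GPT-2 ships without a pad token
tokenizer_0.add_special_tokens({"additional_special_tokens": ["</prompt>"]})

data_0 = ["Tell me about Cerebros.</prompt>Cerebros is a [not so] large language model."]

# Batch tokenization mirroring the call in the diff: fixed length, no automatic special tokens
tokenized_data = tokenizer_0(
    data_0,
    max_length=1024,
    padding="max_length",
    truncation=True,
    add_special_tokens=False,
)
vocab_size = len(tokenizer_0)  # reflects the added </prompt> token
end_prompt_token_id = tokenizer_0.encode("</prompt>", add_special_tokens=False)[0]

print(vocab_size, end_prompt_token_id, len(tokenized_data["input_ids"][0]))
```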