@@ -54,6 +54,7 @@ def objective(trial: optuna.Trial) -> float:
 import numpy as np
 from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search \
     import SimpleCerebrosRandomSearch
+from cerebrosllmutils.llm_utils import prepare_data
 import pendulum
 from cerebros.units.units import DenseUnit
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component \
@@ -209,83 +210,83 @@ def objective(trial: optuna.Trial) -> float:

 # Data Preprocessing:

-def prepare_data(data, max_seq_length: int = MAX_SEQ_LENGTH):
-    all_input_ids = []
-    all_labels = []
+# def prepare_data(data, max_seq_length: int = MAX_SEQ_LENGTH):
+#     all_input_ids = []
+#     all_labels = []

-    pad_token_id = tokenizer.pad_token_id
+#     pad_token_id = tokenizer.pad_token_id

-    # Tokenize all data at once for efficiency
-    tokenized_data = tokenizer(
-        data,
-        max_length=max_seq_length,
-        padding='max_length',
-        truncation=True,
-        add_special_tokens=False  # We'll handle special tokens manually
-    )
+#     # Tokenize all data at once for efficiency
+#     tokenized_data = tokenizer(
+#         data,
+#         max_length=max_seq_length,
+#         padding='max_length',
+#         truncation=True,
+#         add_special_tokens=False  # We'll handle special tokens manually
+#     )

-    # Get the token ID for </prompt>
-    end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
+#     # Get the token ID for </prompt>
+#     end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]

-    # Process each sample
-    for sample_tokens in tokenized_data['input_ids']:
-        # Find the index of </prompt> token
-        try:
-            end_prompt_index = sample_tokens.index(end_prompt_token_id)
-        except ValueError:
-            # If </prompt> not found, treat sample as a non-instruct sample
-            end_prompt_index = (PROMPT_LENGTH - 1)  # int(np.ceil(len(sample_tokens) * (1/3))) # 0 ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples
+#     # Process each sample
+#     for sample_tokens in tokenized_data['input_ids']:
+#         # Find the index of </prompt> token
+#         try:
+#             end_prompt_index = sample_tokens.index(end_prompt_token_id)
+#         except ValueError:
+#             # If </prompt> not found, treat sample as a non-instruct sample
+#             end_prompt_index = (PROMPT_LENGTH - 1)  # int(np.ceil(len(sample_tokens) * (1/3))) # 0 ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples

-        # Find first pad token after </prompt>
-        first_pad_index = None
-        for i in range(end_prompt_index + 1, len(sample_tokens)):
-            if sample_tokens[i] == pad_token_id:
-                first_pad_index = i
-                break
+#         # Find first pad token after </prompt>
+#         first_pad_index = None
+#         for i in range(end_prompt_index + 1, len(sample_tokens)):
+#             if sample_tokens[i] == pad_token_id:
+#                 first_pad_index = i
+#                 break

-        # If no pad token found, use the end of sequence
-        if first_pad_index is None:
-            first_pad_index = len(sample_tokens)
+#         # If no pad token found, use the end of sequence
+#         if first_pad_index is None:
+#             first_pad_index = len(sample_tokens)

-        # Apply sliding window from after </prompt> to first pad token
-        # Start from end_prompt_index + 1 (first token to predict)
-        # End at first_pad_index - 1 (last token to predict)
-        for i in range(end_prompt_index + 1, first_pad_index):
-            # Input: from start up to (but not including) token i
-            input_ids = sample_tokens[:i]
+#         # Apply sliding window from after </prompt> to first pad token
+#         # Start from end_prompt_index + 1 (first token to predict)
+#         # End at first_pad_index - 1 (last token to predict)
+#         for i in range(end_prompt_index + 1, first_pad_index):
+#             # Input: from start up to (but not including) token i
+#             input_ids = sample_tokens[:i]

-            # Pad or truncate to max_seq_length
-            if len(input_ids) > max_seq_length:
-                input_ids = input_ids[:max_seq_length]
-            else:
-                input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
+#             # Pad or truncate to max_seq_length
+#             if len(input_ids) > max_seq_length:
+#                 input_ids = input_ids[:max_seq_length]
+#             else:
+#                 input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))

-            # Label: one-hot encoding of token at position i
-            next_token = sample_tokens[i]
-            label = [0] * VOCABULARY_SIZE
-            label[next_token] = 1
+#             # Label: one-hot encoding of token at position i
+#             next_token = sample_tokens[i]
+#             label = [0] * VOCABULARY_SIZE
+#             label[next_token] = 1

-            all_input_ids.append(input_ids)
-            all_labels.append(label)
+#             all_input_ids.append(input_ids)
+#             all_labels.append(label)

-        # Add final sample with pad token as label to indicate termination
-        if first_pad_index < len(sample_tokens):  # Only if there's actually a pad token
-            input_ids = sample_tokens[:first_pad_index]
+#         # Add final sample with pad token as label to indicate termination
+#         if first_pad_index < len(sample_tokens):  # Only if there's actually a pad token
+#             input_ids = sample_tokens[:first_pad_index]

-            # Pad or truncate to max_seq_length
-            if len(input_ids) > max_seq_length:
-                input_ids = input_ids[:max_seq_length]
-            else:
-                input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
+#             # Pad or truncate to max_seq_length
+#             if len(input_ids) > max_seq_length:
+#                 input_ids = input_ids[:max_seq_length]
+#             else:
+#                 input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))

-            # Label: one-hot encoding of pad token
-            label = [0] * VOCABULARY_SIZE
-            label[pad_token_id] = 1
+#             # Label: one-hot encoding of pad token
+#             label = [0] * VOCABULARY_SIZE
+#             label[pad_token_id] = 1

-            all_input_ids.append(input_ids)
-            all_labels.append(label)
+#             all_input_ids.append(input_ids)
+#             all_labels.append(label)

-    return all_input_ids, all_labels, VOCABULARY_SIZE
+#     return all_input_ids, all_labels, VOCABULARY_SIZE


 ## Only add re, tokenizer already in script
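
For intuition, the sliding-window expansion that the (now commented-out) function above performs can be traced on a toy sequence; the token IDs below are made up purely for illustration:

# Toy trace of the sliding-window expansion (made-up token IDs).
# Suppose pad_token_id = 0, "</prompt>" encodes to 9, and one padded sample is:
#     sample_tokens = [5, 6, 9, 7, 8, 0, 0]   # prompt [5, 6, 9], completion [7, 8]
# Then end_prompt_index = 2 and first_pad_index = 5, so the inner loop emits:
#     input [5, 6, 9]        -> label one-hot(7)
#     input [5, 6, 9, 7]     -> label one-hot(8)
# and the terminal sample teaches the model to stop generating:
#     input [5, 6, 9, 7, 8]  -> label one-hot(pad_token_id)
# Every input is right-padded with pad_token_id out to max_seq_length.
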
@@ -439,15 +440,16 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
 # Add non-instruct samples
 # data += non_instruct_samples

-
-x, y, vocab_size = prepare_data(non_instruct_samples)  # data)
+##<<##<<
+x, y, vocab_size = prepare_data(data_0=non_instruct_samples, tokenizer_0=tokenizer, max_seq_length=MAX_SEQ_LENGTH, prompt_length=1)
+# x, y, vocab_size = prepare_data()  # data)

 print("Input IDs shape:", len(x), "x", len(x[0]) if x else 0)
 print("Labels shape:", len(y), "x", len(y[0]) if y else 0)
 print("Vocabulary size:", vocab_size)
 print("First few samples generated:", len(x))

-
+raise ValueError("Debug")  # temporary halt to inspect the prepared samples
 # i = 1
 # for d,l in zip(x, y):
 #     print(f"Sample {i}:")
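
Since the new call site passes data_0, tokenizer_0, max_seq_length, and prompt_length keywords, the prepare_data imported from cerebrosllmutils.llm_utils presumably parameterizes the deleted inline logic over the tokenizer and prompt length. Below is a minimal sketch under that assumption; it is a reconstruction from the commented-out code above, not the actual library implementation, and len(tokenizer_0) stands in for the old module-level VOCABULARY_SIZE constant:

# Reconstruction sketch of the refactored prepare_data (assumed, not the
# actual cerebrosllmutils.llm_utils code). Keyword names mirror the new call site.
def prepare_data(data_0, tokenizer_0, max_seq_length: int, prompt_length: int = 1):
    all_input_ids, all_labels = [], []
    pad_token_id = tokenizer_0.pad_token_id
    vocab_size = len(tokenizer_0)  # stands in for VOCABULARY_SIZE

    # Tokenize all data at once; special tokens are handled manually.
    tokenized = tokenizer_0(
        data_0,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
        add_special_tokens=False,
    )
    end_prompt_token_id = tokenizer_0.encode("</prompt>", add_special_tokens=False)[0]

    for sample_tokens in tokenized['input_ids']:
        # No </prompt> means a non-instruct sample: start predicting after
        # the first prompt_length tokens instead.
        try:
            end_prompt_index = sample_tokens.index(end_prompt_token_id)
        except ValueError:
            end_prompt_index = prompt_length - 1

        # The first pad token after the prompt bounds the completion.
        first_pad_index = next(
            (i for i in range(end_prompt_index + 1, len(sample_tokens))
             if sample_tokens[i] == pad_token_id),
            len(sample_tokens),
        )

        # One (input, one-hot next-token) pair per completion position,
        # plus a terminal pair labeled with the pad token when one exists.
        for i in range(end_prompt_index + 1, first_pad_index + 1):
            if i == len(sample_tokens):
                break  # no pad token in the sample: skip the terminal pair
            input_ids = (sample_tokens[:i] + [pad_token_id] * max_seq_length)[:max_seq_length]
            label = [0] * vocab_size
            label[sample_tokens[i] if i < first_pad_index else pad_token_id] = 1
            all_input_ids.append(input_ids)
            all_labels.append(label)

    return all_input_ids, all_labels, vocab_size

Called as in the diff above, this should return the same (input IDs, one-hot labels, vocabulary size) triple that the inline version produced.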