
Commit a0877a9

Replace inline prepare_data with utility function
Refactor data preparation function to use external utility.
1 parent 79bb1ba commit a0877a9

File tree

1 file changed: +67 −65 lines changed


generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 67 additions & 65 deletions
@@ -54,6 +54,7 @@ def objective(trial: optuna.Trial) -> float:
 import numpy as np
 from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\
     import SimpleCerebrosRandomSearch
+from cerebrosllmutils.llm_utils import prepare_data
 import pendulum
 from cerebros.units.units import DenseUnit
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
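
Note: the call site later in this commit passes keyword arguments (data_0, tokenizer_0, max_seq_length, prompt_length) whose names are not visible in the utility's source here. A minimal way to verify the imported prepare_data actually exposes those parameter names before running the full script — the expected names are assumptions taken from this diff, not from cerebrosllmutils itself:

import inspect
from cerebrosllmutils.llm_utils import prepare_data

# Keyword names assumed from the call site in this commit; adjust if the
# library's actual signature differs.
expected = {"data_0", "tokenizer_0", "max_seq_length", "prompt_length"}
actual = set(inspect.signature(prepare_data).parameters)
missing = expected - actual
if missing:
    raise TypeError(f"prepare_data is missing expected parameters: {missing}")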
@@ -209,83 +210,83 @@ def objective(trial: optuna.Trial) -> float:
 
 # Data Preprocessing:
 
-def prepare_data(data, max_seq_length: int = MAX_SEQ_LENGTH):
-    all_input_ids = []
-    all_labels = []
+# def prepare_data(data, max_seq_length: int = MAX_SEQ_LENGTH):
+#     all_input_ids = []
+#     all_labels = []
 
-    pad_token_id = tokenizer.pad_token_id
+#     pad_token_id = tokenizer.pad_token_id
 
-    # Tokenize all data at once for efficiency
-    tokenized_data = tokenizer(
-        data,
-        max_length=max_seq_length,
-        padding='max_length',
-        truncation=True,
-        add_special_tokens=False # We'll handle special tokens manually
-    )
+#     # Tokenize all data at once for efficiency
+#     tokenized_data = tokenizer(
+#         data,
+#         max_length=max_seq_length,
+#         padding='max_length',
+#         truncation=True,
+#         add_special_tokens=False # We'll handle special tokens manually
+#     )
 
-    # Get the token ID for </prompt>
-    end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
+#     # Get the token ID for </prompt>
+#     end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
 
-    # Process each sample
-    for sample_tokens in tokenized_data['input_ids']:
-        # Find the index of </prompt> token
-        try:
-            end_prompt_index = sample_tokens.index(end_prompt_token_id)
-        except ValueError:
-            # If </prompt> not found, treat sample as a non-instruct sample
-            end_prompt_index = (PROMPT_LENGTH - 1) # int(np.ceil(len(sample_tokens) * (1/3))) # 0 ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples
+#     # Process each sample
+#     for sample_tokens in tokenized_data['input_ids']:
+#         # Find the index of </prompt> token
+#         try:
+#             end_prompt_index = sample_tokens.index(end_prompt_token_id)
+#         except ValueError:
+#             # If </prompt> not found, treat sample as a non-instruct sample
+#             end_prompt_index = (PROMPT_LENGTH - 1) # int(np.ceil(len(sample_tokens) * (1/3))) # 0 ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples
 
-        # Find first pad token after </prompt>
-        first_pad_index = None
-        for i in range(end_prompt_index + 1, len(sample_tokens)):
-            if sample_tokens[i] == pad_token_id:
-                first_pad_index = i
-                break
+#         # Find first pad token after </prompt>
+#         first_pad_index = None
+#         for i in range(end_prompt_index + 1, len(sample_tokens)):
+#             if sample_tokens[i] == pad_token_id:
+#                 first_pad_index = i
+#                 break
 
-        # If no pad token found, use the end of sequence
-        if first_pad_index is None:
-            first_pad_index = len(sample_tokens)
+#         # If no pad token found, use the end of sequence
+#         if first_pad_index is None:
+#             first_pad_index = len(sample_tokens)
 
-        # Apply sliding window from after </prompt> to first pad token
-        # Start from end_prompt_index + 1 (first token to predict)
-        # End at first_pad_index - 1 (last token to predict)
-        for i in range(end_prompt_index + 1, first_pad_index):
-            # Input: from start up to (but not including) token i
-            input_ids = sample_tokens[:i]
+#         # Apply sliding window from after </prompt> to first pad token
+#         # Start from end_prompt_index + 1 (first token to predict)
+#         # End at first_pad_index - 1 (last token to predict)
+#         for i in range(end_prompt_index + 1, first_pad_index):
+#             # Input: from start up to (but not including) token i
+#             input_ids = sample_tokens[:i]
 
-            # Pad or truncate to max_seq_length
-            if len(input_ids) > max_seq_length:
-                input_ids = input_ids[:max_seq_length]
-            else:
-                input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
+#             # Pad or truncate to max_seq_length
+#             if len(input_ids) > max_seq_length:
+#                 input_ids = input_ids[:max_seq_length]
+#             else:
+#                 input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
 
-            # Label: one-hot encoding of token at position i
-            next_token = sample_tokens[i]
-            label = [0] * VOCABULARY_SIZE
-            label[next_token] = 1
+#             # Label: one-hot encoding of token at position i
+#             next_token = sample_tokens[i]
+#             label = [0] * VOCABULARY_SIZE
+#             label[next_token] = 1
 
-            all_input_ids.append(input_ids)
-            all_labels.append(label)
+#             all_input_ids.append(input_ids)
+#             all_labels.append(label)
 
-        # Add final sample with pad token as label to indicate termination
-        if first_pad_index < len(sample_tokens): # Only if there's actually a pad token
-            input_ids = sample_tokens[:first_pad_index]
+#         # Add final sample with pad token as label to indicate termination
+#         if first_pad_index < len(sample_tokens): # Only if there's actually a pad token
+#             input_ids = sample_tokens[:first_pad_index]
 
-            # Pad or truncate to max_seq_length
-            if len(input_ids) > max_seq_length:
-                input_ids = input_ids[:max_seq_length]
-            else:
-                input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
+#             # Pad or truncate to max_seq_length
+#             if len(input_ids) > max_seq_length:
+#                 input_ids = input_ids[:max_seq_length]
+#             else:
+#                 input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
 
-            # Label: one-hot encoding of pad token
-            label = [0] * VOCABULARY_SIZE
-            label[pad_token_id] = 1
+#             # Label: one-hot encoding of pad token
+#             label = [0] * VOCABULARY_SIZE
+#             label[pad_token_id] = 1
 
-            all_input_ids.append(input_ids)
-            all_labels.append(label)
+#             all_input_ids.append(input_ids)
+#             all_labels.append(label)
 
-    return all_input_ids, all_labels, VOCABULARY_SIZE
+#     return all_input_ids, all_labels, VOCABULARY_SIZE
 
 
 ## Only add re, tokenizer already in script
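
For orientation, the commented-out block above expanded every tokenized sample into one training pair per answer token: the input is each prefix ending just before position i, the label is a one-hot of the token at position i, plus a final pair whose label is the pad token so the model learns to stop. A toy sketch of that sliding-window expansion with made-up integer token ids (illustrative only, not the script's real tokenizer output):

# Toy walk-through of the sliding-window expansion; values are hypothetical.
PAD, END_PROMPT = 0, 99
sample_tokens = [11, 12, END_PROMPT, 21, 22, 23, PAD, PAD]

end_prompt_index = sample_tokens.index(END_PROMPT)          # 2
first_pad_index = next(
    (i for i in range(end_prompt_index + 1, len(sample_tokens))
     if sample_tokens[i] == PAD),
    len(sample_tokens),
)                                                           # 6

pairs = []
# One (prefix -> next token) pair per answer token.
for i in range(end_prompt_index + 1, first_pad_index):
    pairs.append((sample_tokens[:i], sample_tokens[i]))
# Final pair teaches the model to emit PAD (stop) after the answer.
if first_pad_index < len(sample_tokens):
    pairs.append((sample_tokens[:first_pad_index], PAD))

for prefix, target in pairs:
    print(prefix, "->", target)
# [11, 12, 99] -> 21
# [11, 12, 99, 21] -> 22
# [11, 12, 99, 21, 22] -> 23
# [11, 12, 99, 21, 22, 23] -> 0

In the removed function, each prefix is then padded to max_seq_length and each target is one-hot encoded over VOCABULARY_SIZE before being appended to all_input_ids and all_labels.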
@@ -439,15 +440,16 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
 # Add non-instruct samples
 # data += non_instruct_samples
 
-
-x, y, vocab_size = prepare_data(non_instruct_samples) # data)
+##<<##<<
+x, y, vocab_size = prepare_data(data_0=non_instruct_samples, tokenizer_0=tokenizer, max_seq_length=MAX_SEQ_LENGTH, prompt_length=1)
+# x, y, vocab_size = prepare_data() # data)
 
 print("Input IDs shape:", len(x), "x", len(x[0]) if x else 0)
 print("Labels shape:", len(y), "x", len(y[0]) if y else 0)
 print("Vocabulary size:", vocab_size)
 print("First few samples generated:", len(x))
 
-
+raise ValueError("Debug")
 # i = 1
 # for d,l in zip(x, y):
 #     print(f"Sample {i}:")