Commit dc62473

Fix type annotations and variable references in llm_utils
1 parent 8c66b33 commit dc62473

1 file changed (+5 −5)

cerebrosllmutils/llm_utils.py

Lines changed: 5 additions & 5 deletions
@@ -45,9 +45,9 @@ def prepare_data(
     Returns:
     --------
     tuple:
-        - all_input_ids (2d list of int): Tuple[List[List[int]] Token IDs for each input sequence, shaped
+        - all_input_ids (2d list of int): Tuple[List[List[int]]] Token IDs for each input sequence, shaped
           [num_samples, max_seq_length].
-        - all_labels (2d list of int): Tuple[List[List[int]] One-hot encoded labels for next-token prediction,
+        - all_labels (2d list of int): Tuple[List[List[int]]] One-hot encoded labels for next-token prediction,
           shaped [num_samples, vocab_size].
        - vocab_size (int): Size of the tokenizer's vocabulary, used for label dimensions.

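Note: the two changed docstring lines add the missing closing bracket to Tuple[List[List[int]]. For reference, a signature consistent with the corrected docstring might look like the sketch below; everything except the function name and return type is an assumption, since the commit does not show the parameter list.

    from typing import Any, List, Tuple

    def prepare_data(
        data: List[str],      # assumed parameter, not shown in this commit
        tokenizer_0: Any,     # assumed: the tokenizer used throughout this file
        prompt_length: int,   # assumed: see the last hunk below
    ) -> Tuple[List[List[int]], List[List[int]], int]:
        """Returns (all_input_ids, all_labels, vocab_size)."""
        ...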
@@ -62,7 +62,7 @@ def prepare_data(
     all_input_ids = []
     all_labels = []

-    pad_token_id = tokenizer.pad_token_id
+    pad_token_id = tokenizer_0.pad_token_id

     # Tokenize all data at once for efficiency
     tokenized_data = tokenizer_0(
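Note: this hunk replaces a reference to an undefined name, tokenizer, with tokenizer_0, the tokenizer actually passed to the surrounding calls. A minimal sketch of the corrected lookup, assuming tokenizer_0 is a Hugging Face tokenizer (the encode and pad_token_id usage suggests it, but the commit does not show its construction):

    from transformers import AutoTokenizer

    # "gpt2" is an arbitrary model chosen for illustration only.
    tokenizer_0 = AutoTokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a pad token; reuse EOS so pad_token_id is not None.
    tokenizer_0.pad_token = tokenizer_0.eos_token
    pad_token_id = tokenizer_0.pad_token_id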
@@ -75,7 +75,7 @@ def prepare_data(
     vocab_size = len(tokenizer_0)

     # Get the token ID for </prompt>
-    end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
+    end_prompt_token_id = tokenizer_0.encode("</prompt>", add_special_tokens=False)[0]

     # Process each sample
     for sample_tokens in tokenized_data['input_ids']:
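Note: same class of fix, tokenizer replaced by tokenizer_0. One behavior worth spelling out in a sketch (continuing the assumed Hugging Face tokenizer from above): encode returns a list of token IDs, and taking [0] only yields the ID of the whole marker if </prompt> tokenizes to a single piece.

    ids = tokenizer_0.encode("</prompt>", add_special_tokens=False)
    end_prompt_token_id = ids[0]
    # If "</prompt>" is not a single token in this vocabulary, ids has
    # several elements and [0] is only the ID of its first piece, so a
    # later search for this ID would match that piece, not the marker.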
@@ -85,7 +85,7 @@ def prepare_data(
         except ValueError:
             # If </prompt> not found, treat sample as a non-instruct sample
             end_prompt_index = (
-                PROMPT_LENGTH - 1)  # int(np.ceil(len(sample_tokens) * (1/3)))  # 0  ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples
+                prompt_length - 1)  # int(np.ceil(len(sample_tokens) * (1/3)))  # 0  ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples

         # Find first pad token after </prompt>
         first_pad_index = None
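Note: the last hunk swaps the undefined constant PROMPT_LENGTH for prompt_length, presumably a parameter or local of prepare_data (its definition is not shown in this commit). A self-contained sketch of the fallback path this fixes, with an assumed prompt_length value; the use of list.index() in the try block is an assumption consistent with the except ValueError shown above.

    prompt_length = 16  # assumed value, for illustration only
    sample_tokens = tokenizer_0.encode(
        "a sample with no prompt marker", add_special_tokens=False)
    try:
        # list.index raises ValueError when the marker token is absent.
        end_prompt_index = sample_tokens.index(end_prompt_token_id)
    except ValueError:
        # Non-instruct sample: give the model a fixed-length starting
        # context instead of searching for </prompt>.
        end_prompt_index = prompt_length - 1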
