Commit 0285ec9

Rename parameters in prepare_data function
Use local variable nomenclature so there are no naming collisions.
1 parent 4203bf2 commit 0285ec9
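
For context, a minimal sketch of the parameter/global shadowing this rename avoids. Everything below is hypothetical (stub objects, not the repository's code); the unchanged lines tokenizer.pad_token_id and tokenizer.encode(...) in the last hunk suggest a tokenizer is meant to exist at an enclosing scope, which the old parameter name would have shadowed.

# Hypothetical sketch of a parameter shadowing a module-level name.
from types import SimpleNamespace

# Stand-in for a module-level tokenizer (stub for illustration only).
tokenizer = SimpleNamespace(pad_token_id=0)

def before(data, tokenizer):
    # The parameter 'tokenizer' shadows the module-level 'tokenizer';
    # the body can no longer reach the module-level object by this name.
    return tokenizer.pad_token_id  # always the parameter's attribute

def after(data_0, tokenizer_0):
    # Distinct local names: tokenizer_0 is the caller's argument, while a
    # bare 'tokenizer' still resolves to the module-level object.
    return tokenizer.pad_token_id, tokenizer_0.pad_token_id

print(before([], SimpleNamespace(pad_token_id=99)))  # 99 (parameter wins)
print(after([], SimpleNamespace(pad_token_id=99)))   # (0, 99)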

1 file changed (+5 -5 lines)


cerebrosllmutils/llm_utils.py

Lines changed: 5 additions & 5 deletions
@@ -7,7 +7,7 @@
 
 
 
-def prepare_data(data: list[str], tokenizer, max_seq_length: int = 1024, prompt_length: int=1):
+def prepare_data(data_0: list[str], tokenizer_0, max_seq_length: int = 1024, prompt_length: int=1):
     """
     Prepares tokenized input sequences and corresponding labels for training the Cerebros
     [not so] large language model.
@@ -25,7 +25,7 @@ def prepare_data(data: list[str], tokenizer, max_seq_length: int = 1024, prompt_
 
     Parameters:
     -----------
-    data : list of str
+    data_0 : list of str
         List of input text samples to be processed.
     max_seq_length : int, optional: default = 1024
         Maximum sequence length for input tensors. Sequences longer than this will be truncated,
@@ -58,14 +58,14 @@ def prepare_data(data: list[str], tokenizer, max_seq_length: int = 1024, prompt_
     pad_token_id = tokenizer.pad_token_id
 
     # Tokenize all data at once for efficiency
-    tokenized_data = tokenizer(
-        data,
+    tokenized_data = tokenizer_0(
+        data_0,
         max_length=max_seq_length,
         padding='max_length',
         truncation=True,
         add_special_tokens=False  # We'll handle special tokens manually
     )
-    vocab_size = len(tokenizer)
+    vocab_size = len(tokenizer_0)
 
     # Get the token ID for </prompt>
     end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
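
For orientation, a hypothetical call against the renamed signature. The AutoTokenizer class, checkpoint name, pad-token setup, and sample strings are all assumptions, not taken from this commit; note also that the unchanged lines above still reference a bare tokenizer inside the function body, so this sketch assumes llm_utils defines one at module scope.

# Hypothetical usage sketch; every name outside the diff is an assumption.
from transformers import AutoTokenizer
from cerebrosllmutils.llm_utils import prepare_data

tok = AutoTokenizer.from_pretrained("gpt2")  # assumed tokenizer checkpoint
tok.pad_token = tok.eos_token                # assumed: GPT-2 lacks a pad token by default

samples = [
    "<prompt>Summarize the report.</prompt> The report covers Q3 revenue.",
    "<prompt>Translate to French.</prompt> Good morning.",
]

# Keyword arguments spell out the renamed parameters from this commit.
result = prepare_data(data_0=samples, tokenizer_0=tok, max_seq_length=128, prompt_length=1)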
