Skip to content

Commit 4652135

Browse files
committed
chore: tox -e fix
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
1 parent 050818c commit 4652135

File tree

5 files changed

+67
-56
lines changed

5 files changed

+67
-56
lines changed

examples/01_building_a_reasoning_model.ipynb

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"outputs": [],
3333
"source": [
3434
"from datasets import load_dataset, concatenate_datasets\n",
35+
"\n",
3536
"dataset = load_dataset(\"nvidia/Llama-Nemotron-Post-Training-Dataset-v1\")"
3637
]
3738
},
@@ -50,20 +51,25 @@
5051
"source": [
5152
"def generalize_sample(sample):\n",
5253
" user = sample[\"input\"].split(\"user<|end_header_id|>\\n\\n\")[1].split(\"<|eot_id|>\")[0]\n",
53-
" assistant = sample[\"output\"].replace(\"<|eot_id|>\", '')\n",
54+
" assistant = sample[\"output\"].replace(\"<|eot_id|>\", \"\")\n",
5455
" message_list = [\n",
5556
" {\"role\": \"system\", \"content\": f\"detailed thinking {sample['reasoning']}\"},\n",
5657
" {\"role\": \"user\", \"content\": user},\n",
5758
" {\"role\": \"assistant\", \"content\": assistant},\n",
5859
" ]\n",
5960
" return {\"messages\": message_list}\n",
6061
"\n",
62+
"\n",
6163
"generic_samples_datasets = []\n",
6264
"for split in dataset.keys():\n",
6365
" print(f\"Processing {split} samples\")\n",
64-
" new_split = dataset[split].filter(lambda sample: sample[\"used_in_training\"] == 'yes', num_proc=8)\n",
66+
" new_split = dataset[split].filter(\n",
67+
" lambda sample: sample[\"used_in_training\"] == \"yes\", num_proc=8\n",
68+
" )\n",
6569
" print(f\"Adding {len(new_split)} samples\")\n",
66-
" new_samples = new_split.map(generalize_sample, remove_columns=list(new_split[0].keys()), num_proc=8)\n",
70+
" new_samples = new_split.map(\n",
71+
" generalize_sample, remove_columns=list(new_split[0].keys()), num_proc=8\n",
72+
" )\n",
6773
" generic_samples_datasets.append(new_samples)\n",
6874
" print(\"Samples added\\n\")"
6975
]
@@ -123,7 +129,12 @@
123129
"metadata": {},
124130
"outputs": [],
125131
"source": [
126-
"from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions\n",
132+
"from instructlab.training.config import (\n",
133+
" TorchrunArgs,\n",
134+
" TrainingArgs,\n",
135+
" DistributedBackend,\n",
136+
" FSDPOptions,\n",
137+
")\n",
127138
"from instructlab.training.main_ds import run_training"
128139
]
129140
},
@@ -141,11 +152,11 @@
141152
"outputs": [],
142153
"source": [
143154
"torch_args = TorchrunArgs(\n",
144-
"nproc_per_node=8,\n",
145-
"\tnnodes=1,\n",
146-
" \tnode_rank=0,\n",
147-
" rdzv_id=123,\n",
148-
" \trdzv_endpoint=\"0.0.0.0:8888\",\n",
155+
" nproc_per_node=8,\n",
156+
" nnodes=1,\n",
157+
" node_rank=0,\n",
158+
" rdzv_id=123,\n",
159+
" rdzv_endpoint=\"0.0.0.0:8888\",\n",
149160
")"
150161
]
151162
},
@@ -163,22 +174,22 @@
163174
"outputs": [],
164175
"source": [
165176
"train_args = TrainingArgs(\n",
166-
"\tmodel_path=\"microsoft/Phi-4-mini-instruct\",\n",
167-
"\tdata_path=\"nemotron.jsonl\",\n",
168-
"\tckpt_output_dir=\"experiments/training_output\",\n",
169-
"\tdata_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n",
170-
"\tmax_seq_len=20000,\n",
171-
"\tmax_batch_len=30000, # max tokens per gpu\n",
172-
"\tnum_epochs=3, \n",
173-
"\teffective_batch_size=256, # target batch size per model update\n",
174-
"\tlearning_rate=2e-5,\n",
175-
"\twarmup_steps=25,\n",
176-
" save_samples=0, # save ckpt after num of samples seen (0=off)\n",
177-
" checkpoint_at_epoch = True, # save ckpt after every epoch\n",
178-
" accelerate_full_state_at_epoch = False, # save full-state for resuming\n",
179-
" process_data=True, # can set to false if data processed before\n",
180-
"\tdistributed_backend=DistributedBackend.FSDP,\n",
181-
"\tfsdp_options=FSDPOptions(cpu_offload_params=False),\n",
177+
" model_path=\"microsoft/Phi-4-mini-instruct\",\n",
178+
" data_path=\"nemotron.jsonl\",\n",
179+
" ckpt_output_dir=\"experiments/training_output\",\n",
180+
" data_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n",
181+
" max_seq_len=20000,\n",
182+
" max_batch_len=30000, # max tokens per gpu\n",
183+
" num_epochs=3,\n",
184+
" effective_batch_size=256, # target batch size per model update\n",
185+
" learning_rate=2e-5,\n",
186+
" warmup_steps=25,\n",
187+
" save_samples=0, # save ckpt after num of samples seen (0=off)\n",
188+
" checkpoint_at_epoch=True, # save ckpt after every epoch\n",
189+
" accelerate_full_state_at_epoch=False, # save full-state for resuming\n",
190+
" process_data=True, # can set to false if data processed before\n",
191+
" distributed_backend=DistributedBackend.FSDP,\n",
192+
" fsdp_options=FSDPOptions(cpu_offload_params=False),\n",
182193
")"
183194
]
184195
},
@@ -195,7 +206,7 @@
195206
"metadata": {},
196207
"outputs": [],
197208
"source": [
198-
"run_training(torch_args=torch_args,train_args=train_args)"
209+
"run_training(torch_args=torch_args, train_args=train_args)"
199210
]
200211
},
201212
{

src/instructlab/training/data_process.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,9 +197,9 @@ def find_longest_match(start_idx, sequences):
197197
for i in range(len(final_sentence_tk)):
198198
for seq in special_sequences:
199199
if final_sentence_tk[i : i + len(seq)] == seq:
200-
assert all(
201-
final_labels[i + j] == -100 for j in range(len(seq))
202-
), f"Special sequence {seq} is unmasked"
200+
assert all(final_labels[i + j] == -100 for j in range(len(seq))), (
201+
f"Special sequence {seq} is unmasked"
202+
)
203203

204204
# 2. No pretrain tokens should be in the final sentence_tk
205205
assert all(

src/instructlab/training/tokenizer_utils.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ def setup_tokenizer_with_existing_chat_template(
1111
tokenizer: PreTrainedTokenizer,
1212
) -> PreTrainedTokenizer:
1313
# otherwise, when the user doesn't provide a chat template path, we will use the default chat template
14-
assert (
15-
tokenizer.eos_token is not None
16-
), "provided chat template doesn't have an EOS token, need to handle this case"
14+
assert tokenizer.eos_token is not None, (
15+
"provided chat template doesn't have an EOS token, need to handle this case"
16+
)
1717
if not tokenizer.pad_token:
1818
# we need to set the padding token
1919
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
@@ -33,12 +33,12 @@ def setup_tokenizer_with_existing_chat_template(
3333
)
3434

3535
# ensure the necessary tokens exist
36-
assert (
37-
len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1
38-
), "padding token doesn't exist or is of incorrect length"
39-
assert (
40-
len(get_sp_token(tokenizer, tokenizer.eos_token)) == 1
41-
), "EOS token doesn't exist or is of incorrect length"
36+
assert len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1, (
37+
"padding token doesn't exist or is of incorrect length"
38+
)
39+
assert len(get_sp_token(tokenizer, tokenizer.eos_token)) == 1, (
40+
"EOS token doesn't exist or is of incorrect length"
41+
)
4242
return tokenizer
4343

4444

@@ -69,12 +69,12 @@ def setup_tokenizer_from_new_chat_template(
6969
tokenizer.add_eos_token = False
7070

7171
tokenizer.chat_template = CHAT_TEMPLATE
72-
assert (
73-
len(get_sp_token(tokenizer, SPECIAL_TOKENS.eos.token)) == 1
74-
), "EOS token doesn't exist or is of incorrect length"
75-
assert (
76-
len(get_sp_token(tokenizer, SPECIAL_TOKENS.pad.token)) == 1
77-
), "Padding token doesn't exist or is of incorrect length"
72+
assert len(get_sp_token(tokenizer, SPECIAL_TOKENS.eos.token)) == 1, (
73+
"EOS token doesn't exist or is of incorrect length"
74+
)
75+
assert len(get_sp_token(tokenizer, SPECIAL_TOKENS.pad.token)) == 1, (
76+
"Padding token doesn't exist or is of incorrect length"
77+
)
7878
return tokenizer
7979

8080

src/instructlab/training/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -723,12 +723,12 @@ class UniversalCheckpointArgs:
723723
"an empty dictionary."
724724
)
725725
ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {}
726-
assert (
727-
ds_checkpoint.tp_degree == 1
728-
), "if universal checkpointing info is missing, TP must be absent"
729-
assert (
730-
ds_checkpoint.pp_degree == 1
731-
), "if universal checkpointing info is missing, PP must be absent"
726+
assert ds_checkpoint.tp_degree == 1, (
727+
"if universal checkpointing info is missing, TP must be absent"
728+
)
729+
assert ds_checkpoint.pp_degree == 1, (
730+
"if universal checkpointing info is missing, PP must be absent"
731+
)
732732
_check_for_required_state(ds_checkpoint)
733733

734734
slice_shapes = []

tests/unit/test_logger.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,16 +163,16 @@ def get_log_files():
163163

164164
# Test mapping content creates a file/directory
165165
logger_with_handler.info({"test": 3, "test2": 3.7})
166-
assert (
167-
len(get_log_files()) == 1
168-
), "Expected test_run file/directory found in tmp_path"
166+
assert len(get_log_files()) == 1, (
167+
"Expected test_run file/directory found in tmp_path"
168+
)
169169

170170
# Test call with step
171171
for i in range(10):
172172
logger_with_handler.info({"test": 3, "test2": 3.7}, extra={"step": i})
173173

174174
# Test call with hparams
175175
logger_with_handler.info({"epoch": 2, "lr": 0.001}, extra={"hparams": True})
176-
assert (
177-
len(get_log_files()) == 1
178-
), "Expected test_run file/directory found in tmp_path"
176+
assert len(get_log_files()) == 1, (
177+
"Expected test_run file/directory found in tmp_path"
178+
)

0 commit comments

Comments
 (0)