chore: tox -e fix

booxter · booxter · commit 46521357f945 · 2025-05-21T14:03:59.000Z
Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
diff --git a/examples/01_building_a_reasoning_model.ipynb b/examples/01_building_a_reasoning_model.ipynb
@@ -32,6 +32,7 @@
    "outputs": [],
    "source": [
     "from datasets import load_dataset, concatenate_datasets\n",
+    "\n",
     "dataset = load_dataset(\"nvidia/Llama-Nemotron-Post-Training-Dataset-v1\")"
    ]
   },
@@ -50,20 +51,25 @@
    "source": [
     "def generalize_sample(sample):\n",
     "    user = sample[\"input\"].split(\"user<|end_header_id|>\\n\\n\")[1].split(\"<|eot_id|>\")[0]\n",
-    "    assistant = sample[\"output\"].replace(\"<|eot_id|>\", '')\n",
+    "    assistant = sample[\"output\"].replace(\"<|eot_id|>\", \"\")\n",
     "    message_list = [\n",
     "        {\"role\": \"system\", \"content\": f\"detailed thinking {sample['reasoning']}\"},\n",
     "        {\"role\": \"user\", \"content\": user},\n",
     "        {\"role\": \"assistant\", \"content\": assistant},\n",
     "    ]\n",
     "    return {\"messages\": message_list}\n",
     "\n",
+    "\n",
     "generic_samples_datasets = []\n",
     "for split in dataset.keys():\n",
     "    print(f\"Processing {split} samples\")\n",
-    "    new_split = dataset[split].filter(lambda sample: sample[\"used_in_training\"] == 'yes', num_proc=8)\n",
+    "    new_split = dataset[split].filter(\n",
+    "        lambda sample: sample[\"used_in_training\"] == \"yes\", num_proc=8\n",
+    "    )\n",
     "    print(f\"Adding {len(new_split)} samples\")\n",
-    "    new_samples = new_split.map(generalize_sample, remove_columns=list(new_split[0].keys()), num_proc=8)\n",
+    "    new_samples = new_split.map(\n",
+    "        generalize_sample, remove_columns=list(new_split[0].keys()), num_proc=8\n",
+    "    )\n",
     "    generic_samples_datasets.append(new_samples)\n",
     "    print(\"Samples added\\n\")"
    ]
@@ -123,7 +129,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions\n",
+    "from instructlab.training.config import (\n",
+    "    TorchrunArgs,\n",
+    "    TrainingArgs,\n",
+    "    DistributedBackend,\n",
+    "    FSDPOptions,\n",
+    ")\n",
     "from instructlab.training.main_ds import run_training"
    ]
   },
@@ -141,11 +152,11 @@
    "outputs": [],
    "source": [
     "torch_args = TorchrunArgs(\n",
-    "nproc_per_node=8,\n",
-    "\tnnodes=1,\n",
-    " \tnode_rank=0,\n",
-    "       rdzv_id=123,\n",
-    " \trdzv_endpoint=\"0.0.0.0:8888\",\n",
+    "    nproc_per_node=8,\n",
+    "    nnodes=1,\n",
+    "    node_rank=0,\n",
+    "    rdzv_id=123,\n",
+    "    rdzv_endpoint=\"0.0.0.0:8888\",\n",
     ")"
    ]
   },
@@ -163,22 +174,22 @@
    "outputs": [],
    "source": [
     "train_args = TrainingArgs(\n",
-    "\tmodel_path=\"microsoft/Phi-4-mini-instruct\",\n",
-    "\tdata_path=\"nemotron.jsonl\",\n",
-    "\tckpt_output_dir=\"experiments/training_output\",\n",
-    "\tdata_output_dir=\"data/processed-data\",                    # processed data ids/labels/masks\n",
-    "\tmax_seq_len=20000,\n",
-    "\tmax_batch_len=30000,                                      # max tokens per gpu\n",
-    "\tnum_epochs=3, \n",
-    "\teffective_batch_size=256,                                 # target batch size per model update\n",
-    "\tlearning_rate=2e-5,\n",
-    "\twarmup_steps=25,\n",
-    "    save_samples=0,                                           # save ckpt after num of samples seen (0=off)\n",
-    "    checkpoint_at_epoch = True,                               # save ckpt after every epoch\n",
-    "    accelerate_full_state_at_epoch = False,                   # save full-state for resuming\n",
-    "    process_data=True,                                        # can set to false if data processed before\n",
-    "\tdistributed_backend=DistributedBackend.FSDP,\n",
-    "\tfsdp_options=FSDPOptions(cpu_offload_params=False),\n",
+    "    model_path=\"microsoft/Phi-4-mini-instruct\",\n",
+    "    data_path=\"nemotron.jsonl\",\n",
+    "    ckpt_output_dir=\"experiments/training_output\",\n",
+    "    data_output_dir=\"data/processed-data\",  # processed data ids/labels/masks\n",
+    "    max_seq_len=20000,\n",
+    "    max_batch_len=30000,  # max tokens per gpu\n",
+    "    num_epochs=3,\n",
+    "    effective_batch_size=256,  # target batch size per model update\n",
+    "    learning_rate=2e-5,\n",
+    "    warmup_steps=25,\n",
+    "    save_samples=0,  # save ckpt after num of samples seen (0=off)\n",
+    "    checkpoint_at_epoch=True,  # save ckpt after every epoch\n",
+    "    accelerate_full_state_at_epoch=False,  # save full-state for resuming\n",
+    "    process_data=True,  # can set to false if data processed before\n",
+    "    distributed_backend=DistributedBackend.FSDP,\n",
+    "    fsdp_options=FSDPOptions(cpu_offload_params=False),\n",
     ")"
    ]
   },
@@ -195,7 +206,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "run_training(torch_args=torch_args,train_args=train_args)"
+    "run_training(torch_args=torch_args, train_args=train_args)"
    ]
   },
   {
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
@@ -197,9 +197,9 @@ def find_longest_match(start_idx, sequences):
     for i in range(len(final_sentence_tk)):
         for seq in special_sequences:
             if final_sentence_tk[i : i + len(seq)] == seq:
-                assert all(
-                    final_labels[i + j] == -100 for j in range(len(seq))
-                ), f"Special sequence {seq} is unmasked"
+                assert all(final_labels[i + j] == -100 for j in range(len(seq))), (
+                    f"Special sequence {seq} is unmasked"
+                )
 
     # 2. No pretrain tokens should be in the final sentence_tk
     assert all(
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
@@ -11,9 +11,9 @@ def setup_tokenizer_with_existing_chat_template(
     tokenizer: PreTrainedTokenizer,
 ) -> PreTrainedTokenizer:
     # otherwise, when the user doesn't provide a chat template path, we will use the default chat template
-    assert (
-        tokenizer.eos_token is not None
-    ), "provided chat template doesn't have an EOS token, need to handle this case"
+    assert tokenizer.eos_token is not None, (
+        "provided chat template doesn't have an EOS token, need to handle this case"
+    )
     if not tokenizer.pad_token:
         # we need to set the padding token
         tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
@@ -33,12 +33,12 @@ def setup_tokenizer_with_existing_chat_template(
     )
 
     # ensure the necessary tokens exist
-    assert (
-        len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1
-    ), "padding token doesn't exist or is of incorrect length"
-    assert (
-        len(get_sp_token(tokenizer, tokenizer.eos_token)) == 1
-    ), "EOS token doesn't exist or is of incorrect length"
+    assert len(get_sp_token(tokenizer, tokenizer.pad_token)) == 1, (
+        "padding token doesn't exist or is of incorrect length"
+    )
+    assert len(get_sp_token(tokenizer, tokenizer.eos_token)) == 1, (
+        "EOS token doesn't exist or is of incorrect length"
+    )
     return tokenizer
 
 
@@ -69,12 +69,12 @@ def setup_tokenizer_from_new_chat_template(
         tokenizer.add_eos_token = False
 
     tokenizer.chat_template = CHAT_TEMPLATE
-    assert (
-        len(get_sp_token(tokenizer, SPECIAL_TOKENS.eos.token)) == 1
-    ), "EOS token doesn't exist or is of incorrect length"
-    assert (
-        len(get_sp_token(tokenizer, SPECIAL_TOKENS.pad.token)) == 1
-    ), "Padding token doesn't exist or is of incorrect length"
+    assert len(get_sp_token(tokenizer, SPECIAL_TOKENS.eos.token)) == 1, (
+        "EOS token doesn't exist or is of incorrect length"
+    )
+    assert len(get_sp_token(tokenizer, SPECIAL_TOKENS.pad.token)) == 1, (
+        "Padding token doesn't exist or is of incorrect length"
+    )
     return tokenizer
 
 
diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
@@ -723,12 +723,12 @@ class UniversalCheckpointArgs:
                 "an empty dictionary."
             )
             ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {}
-            assert (
-                ds_checkpoint.tp_degree == 1
-            ), "if universal checkpointing info is missing, TP must be absent"
-            assert (
-                ds_checkpoint.pp_degree == 1
-            ), "if universal checkpointing info is missing, PP must be absent"
+            assert ds_checkpoint.tp_degree == 1, (
+                "if universal checkpointing info is missing, TP must be absent"
+            )
+            assert ds_checkpoint.pp_degree == 1, (
+                "if universal checkpointing info is missing, PP must be absent"
+            )
         _check_for_required_state(ds_checkpoint)
 
         slice_shapes = []
diff --git a/tests/unit/test_logger.py b/tests/unit/test_logger.py
@@ -163,16 +163,16 @@ def get_log_files():
 
     # Test mapping content creates a file/directory
     logger_with_handler.info({"test": 3, "test2": 3.7})
-    assert (
-        len(get_log_files()) == 1
-    ), "Expected test_run file/directory found in tmp_path"
+    assert len(get_log_files()) == 1, (
+        "Expected test_run file/directory found in tmp_path"
+    )
 
     # Test call with step
     for i in range(10):
         logger_with_handler.info({"test": 3, "test2": 3.7}, extra={"step": i})
 
     # Test call with hparams
     logger_with_handler.info({"epoch": 2, "lr": 0.001}, extra={"hparams": True})
-    assert (
-        len(get_log_files()) == 1
-    ), "Expected test_run file/directory found in tmp_path"
+    assert len(get_log_files()) == 1, (
+        "Expected test_run file/directory found in tmp_path"
+    )