|
32 | 32 | "outputs": [], |
33 | 33 | "source": [ |
34 | 34 | "from datasets import load_dataset, concatenate_datasets\n", |
| 35 | + "\n", |
35 | 36 | "dataset = load_dataset(\"nvidia/Llama-Nemotron-Post-Training-Dataset-v1\")" |
36 | 37 | ] |
37 | 38 | }, |
|
50 | 51 | "source": [ |
51 | 52 | "def generalize_sample(sample):\n", |
52 | 53 | " user = sample[\"input\"].split(\"user<|end_header_id|>\\n\\n\")[1].split(\"<|eot_id|>\")[0]\n", |
53 | | - " assistant = sample[\"output\"].replace(\"<|eot_id|>\", '')\n", |
| 54 | + " assistant = sample[\"output\"].replace(\"<|eot_id|>\", \"\")\n", |
54 | 55 | " message_list = [\n", |
55 | 56 | " {\"role\": \"system\", \"content\": f\"detailed thinking {sample['reasoning']}\"},\n", |
56 | 57 | " {\"role\": \"user\", \"content\": user},\n", |
57 | 58 | " {\"role\": \"assistant\", \"content\": assistant},\n", |
58 | 59 | " ]\n", |
59 | 60 | " return {\"messages\": message_list}\n", |
60 | 61 | "\n", |
| 62 | + "\n", |
61 | 63 | "generic_samples_datasets = []\n", |
62 | 64 | "for split in dataset.keys():\n", |
63 | 65 | " print(f\"Processing {split} samples\")\n", |
64 | | - " new_split = dataset[split].filter(lambda sample: sample[\"used_in_training\"] == 'yes', num_proc=8)\n", |
| 66 | + " new_split = dataset[split].filter(\n", |
| 67 | + " lambda sample: sample[\"used_in_training\"] == \"yes\", num_proc=8\n", |
| 68 | + " )\n", |
65 | 69 | " print(f\"Adding {len(new_split)} samples\")\n", |
66 | | - " new_samples = new_split.map(generalize_sample, remove_columns=list(new_split[0].keys()), num_proc=8)\n", |
| 70 | + " new_samples = new_split.map(\n", |
| 71 | + " generalize_sample, remove_columns=list(new_split[0].keys()), num_proc=8\n", |
| 72 | + " )\n", |
67 | 73 | " generic_samples_datasets.append(new_samples)\n", |
68 | 74 | " print(\"Samples added\\n\")" |
69 | 75 | ] |
|
123 | 129 | "metadata": {}, |
124 | 130 | "outputs": [], |
125 | 131 | "source": [ |
126 | | - "from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions\n", |
| 132 | + "from instructlab.training.config import (\n", |
| 133 | + " TorchrunArgs,\n", |
| 134 | + " TrainingArgs,\n", |
| 135 | + " DistributedBackend,\n", |
| 136 | + " FSDPOptions,\n", |
| 137 | + ")\n", |
127 | 138 | "from instructlab.training.main_ds import run_training" |
128 | 139 | ] |
129 | 140 | }, |
|
141 | 152 | "outputs": [], |
142 | 153 | "source": [ |
143 | 154 | "torch_args = TorchrunArgs(\n", |
144 | | - "nproc_per_node=8,\n", |
145 | | - "\tnnodes=1,\n", |
146 | | - " \tnode_rank=0,\n", |
147 | | - " rdzv_id=123,\n", |
148 | | - " \trdzv_endpoint=\"0.0.0.0:8888\",\n", |
| 155 | + " nproc_per_node=8,\n", |
| 156 | + " nnodes=1,\n", |
| 157 | + " node_rank=0,\n", |
| 158 | + " rdzv_id=123,\n", |
| 159 | + " rdzv_endpoint=\"0.0.0.0:8888\",\n", |
149 | 160 | ")" |
150 | 161 | ] |
151 | 162 | }, |
|
163 | 174 | "outputs": [], |
164 | 175 | "source": [ |
165 | 176 | "train_args = TrainingArgs(\n", |
166 | | - "\tmodel_path=\"microsoft/Phi-4-mini-instruct\",\n", |
167 | | - "\tdata_path=\"nemotron.jsonl\",\n", |
168 | | - "\tckpt_output_dir=\"experiments/training_output\",\n", |
169 | | - "\tdata_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n", |
170 | | - "\tmax_seq_len=20000,\n", |
171 | | - "\tmax_batch_len=30000, # max tokens per gpu\n", |
172 | | - "\tnum_epochs=3, \n", |
173 | | - "\teffective_batch_size=256, # target batch size per model update\n", |
174 | | - "\tlearning_rate=2e-5,\n", |
175 | | - "\twarmup_steps=25,\n", |
176 | | - " save_samples=0, # save ckpt after num of samples seen (0=off)\n", |
177 | | - " checkpoint_at_epoch = True, # save ckpt after every epoch\n", |
178 | | - " accelerate_full_state_at_epoch = False, # save full-state for resuming\n", |
179 | | - " process_data=True, # can set to false if data processed before\n", |
180 | | - "\tdistributed_backend=DistributedBackend.FSDP,\n", |
181 | | - "\tfsdp_options=FSDPOptions(cpu_offload_params=False),\n", |
| 177 | + " model_path=\"microsoft/Phi-4-mini-instruct\",\n", |
| 178 | + " data_path=\"nemotron.jsonl\",\n", |
| 179 | + " ckpt_output_dir=\"experiments/training_output\",\n", |
| 180 | + " data_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n", |
| 181 | + " max_seq_len=20000,\n", |
| 182 | + " max_batch_len=30000, # max tokens per gpu\n", |
| 183 | + " num_epochs=3,\n", |
| 184 | + " effective_batch_size=256, # target batch size per model update\n", |
| 185 | + " learning_rate=2e-5,\n", |
| 186 | + " warmup_steps=25,\n", |
| 187 | + " save_samples=0, # save ckpt after num of samples seen (0=off)\n", |
| 188 | + " checkpoint_at_epoch=True, # save ckpt after every epoch\n", |
| 189 | + " accelerate_full_state_at_epoch=False, # save full-state for resuming\n", |
| 190 | + " process_data=True, # can set to false if data processed before\n", |
| 191 | + " distributed_backend=DistributedBackend.FSDP,\n", |
| 192 | + " fsdp_options=FSDPOptions(cpu_offload_params=False),\n", |
182 | 193 | ")" |
183 | 194 | ] |
184 | 195 | }, |
|
195 | 206 | "metadata": {}, |
196 | 207 | "outputs": [], |
197 | 208 | "source": [ |
198 | | - "run_training(torch_args=torch_args,train_args=train_args)" |
| 209 | + "run_training(torch_args=torch_args, train_args=train_args)" |
199 | 210 | ] |
200 | 211 | }, |
201 | 212 | { |
|
0 commit comments