50 | 50 |
51 | 51 | # COMMAND ---------- |
52 | 52 |
53 | | -#!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb -O /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \ |
54 | | -# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb -O /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \ |
55 | | -# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb -O /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \ |
56 | | -# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/libcurand-dev-11-7_10.2.10.91-1_amd64.deb -O /tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb && \ |
| 53 | +#!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb -O /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \ |
| 54 | +# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb -O /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \ |
| 55 | +# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb -O /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \ |
| 56 | +# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcurand-dev-11-7_10.2.10.91-1_amd64.deb -O /tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb && \ |
57 | 57 | # dpkg -i /tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \ |
58 | 58 | # dpkg -i /tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \ |
59 | 59 | # dpkg -i /tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \ |
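
The repo path switch from `ubuntu2004` to `ubuntu2204` presumably tracks the newer runtime's base image; the `-dev` packages supply the CUDA headers that deepspeed JIT-compiles its fused ops against. A quick sanity check, as a sketch (not part of this commit; the `/usr/local/cuda-11.7` layout is an assumption based on how NVIDIA's repo `.deb` packages normally install):

```python
# Sketch: confirm the dev headers landed where deepspeed's JIT build will
# look for them. Header names are the standard CUDA 11.x ones.
import os

for header in ("cusparse.h", "cublas_v2.h", "cusolverDn.h", "curand.h"):
    path = f"/usr/local/cuda-11.7/include/{header}"
    print(f"{path}: {'found' if os.path.exists(path) else 'MISSING'}")
```
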
91 | 91 | dbutils.widgets.text("local_training_root", "", "local_training_root") |
92 | 92 | dbutils.widgets.text("dbfs_output_root", "", "dbfs_output_root") |
93 | 93 | dbutils.widgets.text("experiment_id", "", "experiment_id") |
| 94 | +dbutils.widgets.combobox("gpu_family", "a100", ["v100", "a10", "a100"]) |
94 | 95 |
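
The new `gpu_family` combobox defaults to `a100`. A Databricks combobox accepts free-form typed text as well as the listed choices, so downstream code may want to validate the value; a minimal sketch (assumes a Databricks runtime where `dbutils` is in scope):

```python
# Sketch: a combobox accepts arbitrary typed input, so fail fast on
# values the notebook doesn't handle.
gpu_family = dbutils.widgets.get("gpu_family")
if gpu_family not in ("v100", "a10", "a100"):
    raise ValueError(f"Unsupported gpu_family: {gpu_family!r}")
```
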
95 | 96 | # COMMAND ---------- |
96 | 97 |
112 | 113 |
113 | 114 | checkpoint_dir_name = f"{model_name}__{timestamp}" |
114 | 115 |
115 | | -root_path = os.getcwd() |
116 | | -deepspeed_config = os.path.join(root_path, "config/ds_z3_bf16_config.json") |
117 | | - |
118 | 116 | dolly_training_dir_name = "dolly_training" |
119 | 117 |
120 | 118 | # Use the local training root path if it was provided. Otherwise try to find a sensible default. |
136 | 134 |
137 | 135 | local_output_dir = os.path.join(local_training_root, checkpoint_dir_name) |
138 | 136 | dbfs_output_dir = os.path.join(dbfs_output_root, checkpoint_dir_name) |
| 137 | +tensorboard_display_dir = f"{local_output_dir}/runs" |
| 138 | + |
| 139 | +print(f"Local Output Dir: {local_output_dir}") |
| 140 | +print(f"DBFS Output Dir: {dbfs_output_dir}") |
| 141 | +print(f"Tensorboard Display Dir: {tensorboard_display_dir}") |
| 142 | + |
| 143 | +# pick an appropriate config file |
| 144 | +gpu_family = dbutils.widgets.get("gpu_family") |
| 145 | +config_file_name = f"{gpu_family}_config.json" |
| 146 | +deepspeed_config = os.path.join(os.getcwd(), "config", config_file_name) |
| 147 | +print(f"Deepspeed config file: {deepspeed_config}") |
| 148 | + |
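
This replaces the previously hard-coded `ds_z3_bf16_config.json` with a per-family file resolved as `config/{gpu_family}_config.json`. A fail-fast guard, as a sketch (not part of the commit; it assumes `config/` ships one such file per supported family):

```python
# Sketch: surface a missing per-family config immediately instead of
# letting the deepspeed launcher fail later with a less obvious error.
if not os.path.exists(deepspeed_config):
    raise FileNotFoundError(
        f"No deepspeed config for gpu_family={gpu_family!r}: {deepspeed_config}"
    )
```
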
| 149 | +# configure the batch_size |
| 150 | +batch_size = 3 |
| 151 | +if gpu_family == "a10": |
| 152 | + batch_size = 4 |
| 153 | +elif gpu_family == "a100": |
| 154 | + batch_size = 6 |
139 | 155 |
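
The per-device batch size scales with GPU memory: the default of 3 covers V100 (the smallest of the three families, typically 16 GB), with 4 for A10 (24 GB) and 6 for A100 (40 GB or more). The same mapping written as a dict, purely as a sketch:

```python
# Equivalent to the if/elif chain above; the default of 3 covers v100.
batch_size = {"a10": 4, "a100": 6}.get(gpu_family, 3)
```
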
| 156 | +# configure num_gpus, if specified |
140 | 157 | num_gpus_flag = "" |
141 | 158 | num_gpus = dbutils.widgets.get("num_gpus") |
142 | 159 | if num_gpus: |
143 | 160 | num_gpus = int(num_gpus) |
144 | 161 | num_gpus_flag = f"--num_gpus={num_gpus}" |
145 | 162 |
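
When the `num_gpus` widget is left empty the flag is omitted, and the deepspeed launcher falls back to using every GPU visible on the driver. A small illustration with hypothetical widget values:

```python
# Illustration: how the widget value maps to the launcher flag.
for value in ("", "4"):
    flag = f"--num_gpus={int(value)}" if value else ""
    print(f"widget={value!r} -> flag={flag!r}")
```
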
146 | | -tensorboard_display_dir = f"{local_output_dir}/runs" |
147 | | - |
148 | | -print(f"Local Output Dir: {local_output_dir}") |
149 | | -print(f"DBFS Output Dir: {dbfs_output_dir}") |
150 | | -print(f"Tensorboard Display Dir: {tensorboard_display_dir}") |
151 | | - |
152 | 163 | os.environ["TOKENIZERS_PARALLELISM"] = "false" |
153 | 164 |
154 | 165 | # COMMAND ---------- |
158 | 169 |
159 | 170 | # COMMAND ---------- |
160 | 171 |
161 | | -# MAGIC !deepspeed {num_gpus_flag} \ |
162 | | -# MAGIC --module training.trainer \ |
163 | | -# MAGIC --input-model {input_model} \ |
164 | | -# MAGIC --deepspeed {deepspeed_config} \ |
165 | | -# MAGIC --epochs 2 \ |
166 | | -# MAGIC --local-output-dir {local_output_dir} \ |
167 | | -# MAGIC --dbfs-output-dir {dbfs_output_dir} \ |
168 | | -# MAGIC --per-device-train-batch-size 6 \ |
169 | | -# MAGIC --per-device-eval-batch-size 6 \ |
170 | | -# MAGIC --logging-steps 10 \ |
171 | | -# MAGIC --save-steps 200 \ |
172 | | -# MAGIC --save-total-limit 20 \ |
173 | | -# MAGIC --eval-steps 50 \ |
174 | | -# MAGIC --warmup-steps 50 \ |
175 | | -# MAGIC --test-size 200 \ |
176 | | -# MAGIC --lr 5e-6 |
| 172 | +!deepspeed {num_gpus_flag} \ |
| 173 | + --module training.trainer \ |
| 174 | + --input-model {input_model} \ |
| 175 | + --deepspeed {deepspeed_config} \ |
| 176 | + --epochs 2 \ |
| 177 | + --local-output-dir {local_output_dir} \ |
| 178 | + --dbfs-output-dir {dbfs_output_dir} \ |
| 179 | + --per-device-train-batch-size {batch_size} \ |
| 180 | + --per-device-eval-batch-size {batch_size} \ |
| 181 | + --logging-steps 10 \ |
| 182 | + --save-steps 200 \ |
| 183 | + --save-total-limit 20 \ |
| 184 | + --eval-steps 50 \ |
| 185 | + --warmup-steps 50 \ |
| 186 | + --test-size 200 \ |
| 187 | + --lr 5e-6 |
177 | 188 |
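
Dropping the `# MAGIC` prefixes turns the cell into an ordinary `!` shell line, and IPython's `{}` interpolation substitutes the Python variables computed above (`num_gpus_flag`, `deepspeed_config`, `batch_size`, and so on); the fixed batch size of 6 gives way to the per-family value. A rough sketch of the resulting global batch size (hypothetical world size; it also ignores any gradient accumulation set in the deepspeed JSON):

```python
# Sketch: per-device batch size times the number of workers. Assume 8 GPUs
# when the num_gpus widget is blank.
world_size = num_gpus if num_gpus else 8
print(f"Approx. global train batch size: {batch_size * world_size}")
```
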
178 | 189 | # COMMAND ---------- |
179 | 190 |
180 | 191 | from training.generate import generate_response, load_model_tokenizer_for_generate |
181 | 192 |
182 | | -model, tokenizer = load_model_tokenizer_for_generate(local_output_dir) |
| 193 | +model, tokenizer = load_model_tokenizer_for_generate(dbfs_output_dir) |
183 | 194 |
184 | 195 | # COMMAND ---------- |
185 | 196 |
192 | 203 | "Give me a list of 5 science fiction books I should read next.", |
193 | 204 | ] |
194 | 205 |
| 206 | +# set some additional pipeline args |
| 207 | +pipeline_kwargs = {'torch_dtype': "auto"} |
| 208 | +if gpu_family == "v100": |
| 209 | + pipeline_kwargs['torch_dtype'] = "float16" |
| 210 | +elif gpu_family == "a10" or gpu_family == "a100": |
| 211 | + pipeline_kwargs['torch_dtype'] = "bfloat16" |
| 212 | + |
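
The dtype choice tracks hardware support: Volta (V100) has no bfloat16 units, so it gets `float16`, while Ampere (A10, A100) prefers `bfloat16`. A cross-check against the actual device, as a sketch (assumes `torch` is importable on the cluster):

```python
# Sketch: torch reports bf16 support on Ampere and newer, matching the
# widget-driven mapping above.
import torch

if torch.cuda.is_available():
    print("bf16 supported:", torch.cuda.is_bf16_supported())
```
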
195 | 213 | # Use the model to generate responses for each of the instructions above. |
196 | 214 | for instruction in instructions: |
197 | | - response = generate_response(instruction, model=model, tokenizer=tokenizer) |
| 215 | + response = generate_response(instruction, model=model, tokenizer=tokenizer, **pipeline_kwargs) |
198 | 216 | if response: |
199 | 217 | print(f"Instruction: {instruction}\n\n{response}\n\n-----------\n") |
| 218 | + |
| 219 | +# COMMAND ---------- |
| 220 | + |
| 221 | + |