|
5 | 5 | "execution_count": 1, |
6 | 6 | "id": "aafcbe5b-b1bb-42c5-930c-98129462e989", |
7 | 7 | "metadata": {}, |
8 | | - "outputs": [], |
| 8 | + "outputs": [ |
| 9 | + { |
| 10 | + "name": "stderr", |
| 11 | + "output_type": "stream", |
| 12 | + "text": [ |
| 13 | + "/u/nlp/anaconda/main/anaconda3/envs/wuzhengx-310/lib/python3.10/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n" |
| 14 | + ] |
| 15 | + } |
| 16 | + ], |
9 | 17 | "source": [ |
10 | 18 | "import copy, json, random, re\n", |
11 | 19 | "import logging\n", |
|
61 | 69 | { |
62 | 70 | "data": { |
63 | 71 | "application/vnd.jupyter.widget-view+json": { |
64 | | - "model_id": "5fca582881864373a3fd6bf9a3d96d2f", |
| 72 | + "model_id": "a36c95ab54ba4ebe8c2396774d0105c3", |
| 73 | + "version_major": 2, |
| 74 | + "version_minor": 0 |
| 75 | + }, |
| 76 | + "text/plain": [ |
| 77 | + "Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]" |
| 78 | + ] |
| 79 | + }, |
| 80 | + "metadata": {}, |
| 81 | + "output_type": "display_data" |
| 82 | + }, |
| 83 | + { |
| 84 | + "data": { |
| 85 | + "application/vnd.jupyter.widget-view+json": { |
| 86 | + "model_id": "fd8f3346be10479b949a15ef0e968000", |
65 | 87 | "version_major": 2, |
66 | 88 | "version_minor": 0 |
67 | 89 | }, |
|
76 | 98 | "name": "stderr", |
77 | 99 | "output_type": "stream", |
78 | 100 | "text": [ |
79 | | - "You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n", |
| 101 | + "You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message\n", |
80 | 102 | "normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.\n" |
81 | 103 | ] |
82 | 104 | } |
|
110 | 132 | }, |
111 | 133 | { |
112 | 134 | "cell_type": "code", |
113 | | - "execution_count": 8, |
| 135 | + "execution_count": 5, |
114 | 136 | "id": "e47369b7-a22b-4fd8-be7d-fee29395a684", |
115 | 137 | "metadata": {}, |
116 | 138 | "outputs": [ |
117 | | - { |
118 | | - "name": "stderr", |
119 | | - "output_type": "stream", |
120 | | - "text": [ |
121 | | - "normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.\n", |
122 | | - "Keyword arguments {'add_special_tokens': False} not recognized.\n", |
123 | | - "Keyword arguments {'add_special_tokens': False} not recognized.\n" |
124 | | - ] |
125 | | - }, |
126 | 139 | { |
127 | 140 | "name": "stdout", |
128 | 141 | "output_type": "stream", |
|
131 | 144 | "model params: 6,738,415,616 || trainable%: 6.080064266549391e-05\n" |
132 | 145 | ] |
133 | 146 | }, |
134 | | - { |
135 | | - "name": "stderr", |
136 | | - "output_type": "stream", |
137 | | - "text": [ |
138 | | - "/u/nlp/anaconda/main/anaconda3/envs/wuzhengx-310/lib/python3.10/site-packages/accelerate/accelerator.py:432: FutureWarning: Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an `accelerate.DataLoaderConfiguration` instead: \n", |
139 | | - "dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)\n" |
140 | | - ] |
141 | | - }, |
142 | 147 | { |
143 | 148 | "data": { |
144 | 149 | "text/html": [ |
145 | 150 | "\n", |
146 | 151 | " <div>\n", |
147 | 152 | " \n", |
148 | 153 | " <progress value='1000' max='1000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", |
149 | | - " [1000/1000 01:06, Epoch 1000/1000]\n", |
| 154 | + " [1000/1000 00:45, Epoch 1000/1000]\n", |
150 | 155 | " </div>\n", |
151 | 156 | " <table border=\"1\" class=\"dataframe\">\n", |
152 | 157 | " <thead>\n", |
|
158 | 163 | " <tbody>\n", |
159 | 164 | " <tr>\n", |
160 | 165 | " <td>500</td>\n", |
161 | | - " <td>0.079900</td>\n", |
| 166 | + " <td>0.097000</td>\n", |
162 | 167 | " </tr>\n", |
163 | 168 | " <tr>\n", |
164 | 169 | " <td>1000</td>\n", |
|
174 | 179 | "metadata": {}, |
175 | 180 | "output_type": "display_data" |
176 | 181 | }, |
177 | | - { |
178 | | - "name": "stderr", |
179 | | - "output_type": "stream", |
180 | | - "text": [ |
181 | | - "Checkpoint destination directory ./tmp/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.\n" |
182 | | - ] |
183 | | - }, |
184 | 182 | { |
185 | 183 | "name": "stdout", |
186 | 184 | "output_type": "stream", |
187 | 185 | "text": [ |
188 | 186 | "Directory './tmp/checkpoint-500/intervenable_model' already exists.\n", |
189 | | - "Directory './tmp/tmp-checkpoint-1000/intervenable_model' created successfully.\n" |
| 187 | + "Directory './tmp/checkpoint-1000/intervenable_model' already exists.\n" |
190 | 188 | ] |
191 | 189 | } |
192 | 190 | ], |
|
220 | 218 | "\n", |
221 | 219 | "# train\n", |
222 | 220 | "training_args = transformers.TrainingArguments(\n", |
223 | | - " num_train_epochs=1000.0, output_dir=\"./tmp\", learning_rate=2e-3)\n", |
| 221 | + " num_train_epochs=1000.0, output_dir=\"./tmp\", learning_rate=2e-3, report_to=[])\n", |
224 | 222 | "trainer = ReftTrainerForCausalLM(\n", |
225 | 223 | " model=reft_model, tokenizer=tokenizer,\n", |
226 | 224 | " args=training_args, **data_module)\n", |
|
239 | 237 | }, |
240 | 238 | { |
241 | 239 | "cell_type": "code", |
242 | | - "execution_count": 10, |
| 240 | + "execution_count": 6, |
243 | 241 | "id": "b5213fbc-3cdd-4376-8995-8aa3159700e1", |
244 | 242 | "metadata": {}, |
245 | 243 | "outputs": [ |
246 | 244 | { |
247 | 245 | "name": "stderr", |
248 | 246 | "output_type": "stream", |
249 | 247 | "text": [ |
250 | | - "Keyword arguments {'add_special_tokens': False} not recognized.\n" |
| 248 | + "/u/nlp/anaconda/main/anaconda3/envs/wuzhengx-310/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:615: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n" |
251 | 249 | ] |
252 | 250 | }, |
253 | 251 | { |
|
321 | 319 | }, |
322 | 320 | { |
323 | 321 | "cell_type": "code", |
324 | | - "execution_count": 18, |
| 322 | + "execution_count": 7, |
325 | 323 | "id": "4a6122a4-6da8-4d18-aa8c-f7ee1667b01f", |
326 | 324 | "metadata": {}, |
327 | 325 | "outputs": [], |
|
336 | 334 | }, |
337 | 335 | { |
338 | 336 | "cell_type": "code", |
339 | | - "execution_count": 58, |
| 337 | + "execution_count": 10, |
340 | 338 | "id": "6df2450a-6e48-41bf-a749-d535f5543f22", |
341 | 339 | "metadata": {}, |
342 | 340 | "outputs": [ |
343 | 341 | { |
344 | 342 | "name": "stderr", |
345 | 343 | "output_type": "stream", |
346 | 344 | "text": [ |
347 | | - "normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.\n", |
348 | | - "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n" |
| 345 | + "normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.\n" |
349 | 346 | ] |
350 | 347 | }, |
351 | 348 | { |
|
363 | 360 | " <div>\n", |
364 | 361 | " \n", |
365 | 362 | " <progress value='500' max='500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", |
366 | | - " [500/500 01:46, Epoch 500/500]\n", |
| 363 | + " [500/500 01:29, Epoch 500/500]\n", |
367 | 364 | " </div>\n", |
368 | 365 | " <table border=\"1\" class=\"dataframe\">\n", |
369 | 366 | " <thead>\n", |
|
375 | 372 | " <tbody>\n", |
376 | 373 | " <tr>\n", |
377 | 374 | " <td>100</td>\n", |
378 | | - " <td>0.127400</td>\n", |
| 375 | + " <td>0.331400</td>\n", |
379 | 376 | " </tr>\n", |
380 | 377 | " <tr>\n", |
381 | 378 | " <td>200</td>\n", |
382 | | - " <td>0.014200</td>\n", |
| 379 | + " <td>0.064100</td>\n", |
383 | 380 | " </tr>\n", |
384 | 381 | " <tr>\n", |
385 | 382 | " <td>300</td>\n", |
386 | | - " <td>0.000900</td>\n", |
| 383 | + " <td>0.026600</td>\n", |
387 | 384 | " </tr>\n", |
388 | 385 | " <tr>\n", |
389 | 386 | " <td>400</td>\n", |
390 | | - " <td>0.000500</td>\n", |
| 387 | + " <td>0.004600</td>\n", |
391 | 388 | " </tr>\n", |
392 | 389 | " <tr>\n", |
393 | 390 | " <td>500</td>\n", |
394 | | - " <td>0.000400</td>\n", |
| 391 | + " <td>0.001600</td>\n", |
395 | 392 | " </tr>\n", |
396 | 393 | " </tbody>\n", |
397 | 394 | "</table><p>" |
|
408 | 405 | "TARGET_LAYER = 15\n", |
409 | 406 | "\n", |
410 | 407 | "alice_access_id = \"ALIC#ID1->\"\n", |
| 408 | + "storage_access_id = \"RAND#ID1->\"\n", |
411 | 409 | "model_max_length = 2048\n", |
412 | 410 | "\n", |
413 | 411 | "# get tokenizer\n", |
|
419 | 417 | "# get reft model\n", |
420 | 418 | "reft_config = ReftConfig(representations={\n", |
421 | 419 | " \"layer\": TARGET_LAYER, \"component\": \"block_output\",\n", |
422 | | - " \"intervention\": LearnedSourceLowRankRotatedSpaceIntervention(\n", |
| 420 | + " \"intervention\": ConsreftIntervention(\n", |
423 | 421 | " embed_dim=model.config.hidden_size, \n", |
424 | 422 | " low_rank_dimension=1)})\n", |
425 | 423 | "reft_model = get_reft_model(model, reft_config)\n", |
|
446 | 444 | }, |
447 | 445 | { |
448 | 446 | "cell_type": "code", |
449 | | - "execution_count": 59, |
| 447 | + "execution_count": 12, |
450 | 448 | "id": "829fd7b3-49e1-456a-8c3d-6b7d69192d3d", |
451 | 449 | "metadata": {}, |
452 | 450 | "outputs": [ |
453 | 451 | { |
454 | 452 | "name": "stdout", |
455 | 453 | "output_type": "stream", |
456 | 454 | "text": [ |
457 | | - "RAND#ID1->Hey! This is Zhengxuan working on random stuff with LLaMA models!\n" |
| 455 | + "RAND#ID1->\n", |
| 456 | + "Welcome to the Natural Language Processing Group at Stanford University!\n", |
| 457 | + "We are a passionate, inclusive group of students and faculty, postdocs\n", |
| 458 | + "and research engineers, who work together on algorithms that allow computers\n", |
| 459 | + "to process, generate, and understand human languages. Our interests are very\n", |
| 460 | + "broad, including basic scientific research on computational linguistics,\n", |
| 461 | + "machine learning, practical applications of human language technology,\n", |
| 462 | + "and interdisciplinary work in computational social science and cognitive\n", |
| 463 | + "science. We also develop a wide variety of educational materials\n", |
| 464 | + "on NLP and many tools for the community to use, including the Stanza\n", |
| 465 | + "toolkit which processes text in over 60 human languages.\n", |
| 466 | + "\n" |
458 | 467 | ] |
459 | 468 | } |
460 | 469 | ], |
461 | 470 | "source": [ |
462 | | - "storage_access_id = \"RAND#ID1->\"\n", |
463 | | - "\n", |
464 | 471 | "prompt = tokenizer(storage_access_id, return_tensors=\"pt\").to(device)\n", |
465 | 472 | "base_unit_location = prompt[\"input_ids\"].shape[-1] - 1\n", |
466 | 473 | "_, steered_response = reft_model.generate(\n", |
|
473 | 480 | }, |
474 | 481 | { |
475 | 482 | "cell_type": "code", |
476 | | - "execution_count": 64, |
| 483 | + "execution_count": 13, |
477 | 484 | "id": "bee955d4-9570-41dd-aae6-e91a2ed862b5", |
478 | 485 | "metadata": {}, |
479 | 486 | "outputs": [ |
|
538 | 545 | " # get reft model\n", |
539 | 546 | " reft_config = ReftConfig(representations={\n", |
540 | 547 | " \"layer\": TARGET_LAYER, \"component\": \"block_output\",\n", |
541 | | - " \"intervention\": LearnedSourceLowRankRotatedSpaceIntervention(\n", |
| 548 | + " \"intervention\": ConsreftIntervention(\n", |
542 | 549 | " embed_dim=model.config.hidden_size, \n", |
543 | 550 | " low_rank_dimension=1)})\n", |
544 | 551 | " reft_model = get_reft_model(model, reft_config)\n", |
|
666 | 673 | " # get reft model\n", |
667 | 674 | " reft_config = ReftConfig(representations={\n", |
668 | 675 | " \"layer\": TARGET_LAYER, \"component\": \"block_output\",\n", |
669 | | - " \"intervention\": LearnedSourceLowRankRotatedSpaceIntervention(\n", |
| 676 | + " \"intervention\": ConsreftIntervention(\n", |
670 | 677 | " embed_dim=model.config.hidden_size, \n", |
671 | 678 | " low_rank_dimension=1)})\n", |
672 | 679 | " reft_model = get_reft_model(model, reft_config)\n", |
|
797 | 804 | " # get reft model\n", |
798 | 805 | " reft_config = ReftConfig(representations={\n", |
799 | 806 | " \"layer\": TARGET_LAYER, \"component\": \"block_output\",\n", |
800 | | - " \"intervention\": LearnedSourceLowRankRotatedSpaceIntervention(\n", |
| 807 | + " \"intervention\": ConsreftIntervention(\n", |
801 | 808 | " embed_dim=model.config.hidden_size, \n", |
802 | 809 | " low_rank_dimension=1)})\n", |
803 | 810 | " reft_model = get_reft_model(model, reft_config)\n", |
|