|
13 | 13 | "id": "a695be45-1472-42bc-824e-5c992a487fa7",
|
14 | 14 | "metadata": {},
|
15 | 15 | "source": [
|
16 |
| - "**Quantization Aware Training (QAT)** is a method that simulates the effects of quantization during neural network post-training to preserve accuracy when deploying models in very-low-precision formats. Unlike post-training quantization, QAT inserts \"fake quantization\" nodes into the computational graph, mimicking the rounding and clamping operations that occur during actual quantization. This allows the model to adapt its weights and activations to mitigate accuracy loss.\n", |
| 16 | + "**Quantization Aware Training (QAT)** is a method that learn the effects of quantization during neural network post-training to preserve accuracy when deploying models in very-low-precision formats. QAT inserts quantizer nodes into the computational graph, mimicking the rounding and clamping operations that occur during actual quantization. This allows the model to adapt its weights and activations to mitigate accuracy loss.\n", |
17 | 17 | "\n",
|
18 | 18 | "This notebook demonstrates how to apply Quantization Aware Training (QAT) to an LLM, Meta's Llama-3.1-8b in this case with NVIDIA's TensorRT Model Optimizer (ModelOpt) QAT toolkit. We walk through downloading and loading the model, calibrating it using an example dataset, specifically CNN/DailyMail dataset sample, applying NVFP4 quantization, generating outputs, and exporting the quantized model."
|
19 | 19 | ]
|
|
145 | 145 | },
|
146 | 146 | {
|
147 | 147 | "cell_type": "code",
|
148 |
| - "execution_count": 5, |
| 148 | + "execution_count": 4, |
149 | 149 | "id": "b6af94af-1de6-4cb1-959b-98fb3f4e1932",
|
150 | 150 | "metadata": {},
|
151 | 151 | "outputs": [],
|
152 | 152 | "source": [
|
153 |
| - "from transformers import AutoConfig, Mxfp4Config\n", |
| 153 | + "from transformers import AutoConfig\n", |
154 | 154 | "from trl import ModelConfig\n",
|
155 | 155 | "\n",
|
156 | 156 | "model_args = ModelConfig(\n",
|
|
165 | 165 | " \"torch_dtype\": model_args.torch_dtype,\n",
|
166 | 166 | " \"use_cache\": False,\n",
|
167 | 167 | " \"device_map\": \"auto\",\n",
|
168 |
| - "}\n", |
169 |
| - "\n", |
170 |
| - "# Dequantize if the model is in MXFP4 format (for gpt-oss family of models)\n", |
171 |
| - "config = AutoConfig.from_pretrained(model_args.model_name_or_path)\n", |
172 |
| - "if (\n", |
173 |
| - " getattr(config, \"quantization_config\", {})\n", |
174 |
| - " and config.quantization_config.get(\"quant_method\", None) == \"mxfp4\"\n", |
175 |
| - "):\n", |
176 |
| - " model_kwargs[\"quantization_config\"] = Mxfp4Config(dequantize=True)" |
| 168 | + "}" |
177 | 169 | ]
|
178 | 170 | },
|
179 | 171 | {
|
|
188 | 180 | },
|
189 | 181 | {
|
190 | 182 | "cell_type": "code",
|
191 |
| - "execution_count": 6, |
| 183 | + "execution_count": 5, |
192 | 184 | "id": "5427ffdc-ee1f-4a81-b30b-ee06d978c4fe",
|
193 | 185 | "metadata": {},
|
194 | 186 | "outputs": [
|
195 | 187 | {
|
196 |
| - "data": { |
197 |
| - "application/vnd.jupyter.widget-view+json": { |
198 |
| - "model_id": "467ffc9ed8034a819d4c9ab2bdbf4dd6", |
199 |
| - "version_major": 2, |
200 |
| - "version_minor": 0 |
201 |
| - }, |
202 |
| - "text/plain": [ |
203 |
| - "Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]" |
204 |
| - ] |
205 |
| - }, |
206 |
| - "metadata": {}, |
207 |
| - "output_type": "display_data" |
| 188 | + "name": "stderr", |
| 189 | + "output_type": "stream", |
| 190 | + "text": [ |
| 191 | + "Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00, 1.29it/s]\n" |
| 192 | + ] |
208 | 193 | }
|
209 | 194 | ],
|
210 | 195 | "source": [
|
|
229 | 214 | },
|
230 | 215 | {
|
231 | 216 | "cell_type": "code",
|
232 |
| - "execution_count": 7, |
| 217 | + "execution_count": 6, |
233 | 218 | "id": "ac8a8e2a-fba1-4377-a528-da7808e71cfd",
|
234 | 219 | "metadata": {},
|
235 | 220 | "outputs": [],
|
|
256 | 241 | },
|
257 | 242 | {
|
258 | 243 | "cell_type": "code",
|
259 |
| - "execution_count": 8, |
| 244 | + "execution_count": 7, |
260 | 245 | "id": "8eac6b5a-185c-4ea8-820e-2b3fcb593077",
|
261 | 246 | "metadata": {},
|
262 | 247 | "outputs": [],
|
|
282 | 267 | },
|
283 | 268 | {
|
284 | 269 | "cell_type": "code",
|
285 |
| - "execution_count": 9, |
| 270 | + "execution_count": 8, |
286 | 271 | "id": "0bf60614-99a0-48b0-85a8-1d88cd7c72ba",
|
287 | 272 | "metadata": {},
|
288 | 273 | "outputs": [],
|
|
322 | 307 | "execution_count": 10,
|
323 | 308 | "id": "05fdfbed-43ff-4f85-8084-a5cace7ca8ab",
|
324 | 309 | "metadata": {},
|
325 |
| - "outputs": [ |
326 |
| - { |
327 |
| - "data": { |
328 |
| - "application/vnd.jupyter.widget-view+json": { |
329 |
| - "model_id": "87d380ce00d646df98f57b8aecdaa0bc", |
330 |
| - "version_major": 2, |
331 |
| - "version_minor": 0 |
332 |
| - }, |
333 |
| - "text/plain": [ |
334 |
| - "Tokenizing train dataset: 0%| | 0/900 [00:00<?, ? examples/s]" |
335 |
| - ] |
336 |
| - }, |
337 |
| - "metadata": {}, |
338 |
| - "output_type": "display_data" |
339 |
| - }, |
340 |
| - { |
341 |
| - "data": { |
342 |
| - "application/vnd.jupyter.widget-view+json": { |
343 |
| - "model_id": "d638a968356f4819a1c090bf8b0e1336", |
344 |
| - "version_major": 2, |
345 |
| - "version_minor": 0 |
346 |
| - }, |
347 |
| - "text/plain": [ |
348 |
| - "Truncating train dataset: 0%| | 0/900 [00:00<?, ? examples/s]" |
349 |
| - ] |
350 |
| - }, |
351 |
| - "metadata": {}, |
352 |
| - "output_type": "display_data" |
353 |
| - }, |
354 |
| - { |
355 |
| - "data": { |
356 |
| - "application/vnd.jupyter.widget-view+json": { |
357 |
| - "model_id": "0ee749ab465a4c33814e27bb56518e8f", |
358 |
| - "version_major": 2, |
359 |
| - "version_minor": 0 |
360 |
| - }, |
361 |
| - "text/plain": [ |
362 |
| - "Tokenizing eval dataset: 0%| | 0/100 [00:00<?, ? examples/s]" |
363 |
| - ] |
364 |
| - }, |
365 |
| - "metadata": {}, |
366 |
| - "output_type": "display_data" |
367 |
| - }, |
368 |
| - { |
369 |
| - "data": { |
370 |
| - "application/vnd.jupyter.widget-view+json": { |
371 |
| - "model_id": "a38242fe7e894579a70d1808574ae361", |
372 |
| - "version_major": 2, |
373 |
| - "version_minor": 0 |
374 |
| - }, |
375 |
| - "text/plain": [ |
376 |
| - "Truncating eval dataset: 0%| | 0/100 [00:00<?, ? examples/s]" |
377 |
| - ] |
378 |
| - }, |
379 |
| - "metadata": {}, |
380 |
| - "output_type": "display_data" |
381 |
| - } |
382 |
| - ], |
| 310 | + "outputs": [], |
383 | 311 | "source": [
|
384 | 312 | "from trl import SFTTrainer\n",
|
385 | 313 | "\n",
|
|
413 | 341 | "\n",
|
414 | 342 | "import modelopt.torch.quantization as mtq\n",
|
415 | 343 | "\n",
|
416 |
| - "# MXFP4_MLP_WEIGHT_ONLY_CFG doesn't need calibration, but other quantization configurations may require it.\n", |
| 344 | + "# Some configs don't need calibration, but other quantization configurations may require it.\n", |
417 | 345 | "quantization_config = mtq.NVFP4_DEFAULT_CFG\n",
|
418 | 346 | "calib_size = 128\n",
|
419 | 347 | "\n",
|
|
438 | 366 | },
|
439 | 367 | {
|
440 | 368 | "cell_type": "code",
|
441 |
| - "execution_count": null, |
| 369 | + "execution_count": 12, |
442 | 370 | "id": "b8611ea6-526f-4761-b456-8340abf56d0a",
|
443 | 371 | "metadata": {
|
444 | 372 | "scrolled": true
|
|
550 | 478 | "id": "99b10dfe-dacc-4d8d-96ea-afb6ac9e5bc7",
|
551 | 479 | "metadata": {},
|
552 | 480 | "source": [
|
553 |
| - "Once you have completed the above QAT workflow you should now have a model in the checkpoint folder `./llama3.1-8b-instruct-multilingual-reasoner/checkpoint-45` which contains the model files including the checkpoints and tokenizer. You can use this folder to serve the QAT NVFP4 model in TensorRT-LLM via Docker.\n", |
554 |
| - "\n", |
555 |
| - "**TRTLLM-Serve command**\n", |
556 |
| - "\n", |
557 |
| - "First we will save the trtllm-serve command to launch the inference server as an environment variabel:" |
| 481 | + "Once you have completed the above QAT workflow you should now have a model in the checkpoint folder `./llama3.1-8b-instruct-multilingual-reasoner/checkpoint-45` which contains the model files including the checkpoints and tokenizer. You can use this folder to serve the QAT NVFP4 model in TensorRT-LLM via Docker." |
558 | 482 | ]
|
559 | 483 | },
|
560 | 484 | {
|
561 |
| - "cell_type": "code", |
562 |
| - "execution_count": 7, |
563 |
| - "id": "3994c997-e1a8-472e-ada7-96cf97156dd0", |
| 485 | + "cell_type": "markdown", |
| 486 | + "id": "c40a9b0a-49ba-4860-80d6-eabe06d06e5e", |
564 | 487 | "metadata": {},
|
565 |
| - "outputs": [ |
566 |
| - { |
567 |
| - "name": "stdout", |
568 |
| - "output_type": "stream", |
569 |
| - "text": [ |
570 |
| - "env: TRTLLM_SERVE_CMD=trtllm-serve /app/tensorrt_llm/qat/checkpoint-45 --max_batch_size 1 --max_num_tokens 1024 --max_seq_len 4096 --tp_size 8 --pp_size 1 --host 0.0.0.0 --port 8000 --kv_cache_free_gpu_memory_fraction 0.95\n" |
571 |
| - ] |
572 |
| - } |
573 |
| - ], |
574 | 488 | "source": [
|
575 |
| - "%env TRTLLM_SERVE_CMD=trtllm-serve /app/tensorrt_llm/qat/checkpoint-45 --max_batch_size 1 --max_num_tokens 1024 --max_seq_len 4096 --tp_size 8 --pp_size 1 --host 0.0.0.0 --port 8000 --kv_cache_free_gpu_memory_fraction 0.95" |
| 489 | + "## Running TensorRT-LLM Docker" |
576 | 490 | ]
|
577 | 491 | },
|
578 | 492 | {
|
579 | 493 | "cell_type": "markdown",
|
580 | 494 | "id": "0bcf77bb-b8ac-4969-a11e-c697e3b5760b",
|
581 | 495 | "metadata": {},
|
582 | 496 | "source": [
|
583 |
| - "**Launching the Docker container**\n", |
| 497 | + "The easiest way to get started with TensorRT-LLMM is to run a TensorRT-LLM docker container. Visit the [NGC TensorRT-LLM Release page](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) to find the most up-to-date NGC container image to use.\n", |
584 | 498 | "\n",
|
585 |
| - "Next we will use this to run a TensorRT-LLM docker container to server the model. Visit the [NGC TensorRT-LLM Release page](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) to find the most up-to-date NGC container image to use.\n", |
586 |
| - "\n", |
587 |
| - "Run the following Docker command to start the TensorRT-LLM container with the above trtllm-serve command to launch the server in the background (change the image tag to match latest release):" |
| 499 | + "Open a new bash shell and run the following Docker command to start the TensorRT-LLM container in interactive mode (change the image tag to match latest release):" |
588 | 500 | ]
|
589 | 501 | },
|
590 | 502 | {
|
591 | 503 | "cell_type": "code",
|
592 |
| - "execution_count": 60, |
593 |
| - "id": "6e8dfc5d-3d41-4c14-9c9a-27b80002c320", |
| 504 | + "execution_count": null, |
| 505 | + "id": "2b736ebf-5970-4157-8fe1-c2242d3c3950", |
594 | 506 | "metadata": {},
|
595 |
| - "outputs": [ |
596 |
| - { |
597 |
| - "name": "stdout", |
598 |
| - "output_type": "stream", |
599 |
| - "text": [ |
600 |
| - "9b8741a1681ab6e054f533ea40cd65b51967783b41611fdb722df188ec77d31c\n" |
601 |
| - ] |
602 |
| - } |
603 |
| - ], |
| 507 | + "outputs": [], |
604 | 508 | "source": [
|
605 |
| - "%%sh\n", |
606 |
| - "docker run --rm --ipc=host -d \\\n", |
| 509 | + "docker run --rm --ipc=host -it \\\n", |
607 | 510 | " --ulimit stack=67108864 --ulimit memlock=-1 \\\n",
|
608 | 511 | " --gpus all -p 8000:8000 -e TRTLLM_ENABLE_PDL=1 \\\n",
|
609 | 512 | " -v ~/.cache:/root/.cache:rw --name tensorrt_llm \\\n",
|
610 |
| - " -v <path-to-qat-model-checkpoint>/llama3.1-8b-instruct-multilingual-reasoner/:/app/tensorrt_llm/qat \\\n", |
611 |
| - " nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc1 $TRTLLM_SERVE_CMD" |
| 513 | + " -v $(pwd)/llama3.1-8b-instruct-multilingual-reasoner/:/app/tensorrt_llm/qat \\\n", |
| 514 | + " nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2 /bin/bash" |
| 515 | + ] |
| 516 | + }, |
| 517 | + { |
| 518 | + "cell_type": "markdown", |
| 519 | + "id": "0509e8ee-519d-4905-8663-c49e02712403", |
| 520 | + "metadata": {}, |
| 521 | + "source": [ |
| 522 | + "## Exporting Quantized Model for deployment\n", |
| 523 | + "Before deploying the model with TensorRT-LLM you will need to export the model checkpoint files. This is similar to the step you take for a quantized PTQ Model. To export the unified Hugging Face checkpoints, which can be deployed on TensorRT-LLM Pytorch, vLLM and SGLang you will need to run the [huggingface_example.sh](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/llm_ptq/scripts/huggingface_example.sh) script found in the TensorRT Model Optimizer repo. " |
612 | 524 | ]
|
613 | 525 | },
|
614 | 526 | {
|
615 | 527 | "cell_type": "markdown",
|
616 |
| - "id": "c6369960-ec55-4073-922f-b9bc43ce5c08", |
| 528 | + "id": "515f0e2b-0b27-44b7-bec7-2fcda65df140", |
| 529 | + "metadata": {}, |
| 530 | + "source": [ |
| 531 | + "**Clone the TensorRT-LLM Model Optimizer repo inside the docker container**" |
| 532 | + ] |
| 533 | + }, |
| 534 | + { |
| 535 | + "cell_type": "code", |
| 536 | + "execution_count": null, |
| 537 | + "id": "363b64fc-6d18-4b40-bd65-e90e55305b03", |
617 | 538 | "metadata": {},
|
| 539 | + "outputs": [], |
618 | 540 | "source": [
|
619 |
| - "## Validating TensorRT-LLM Server" |
| 541 | + "git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git" |
620 | 542 | ]
|
621 | 543 | },
|
622 | 544 | {
|
623 | 545 | "cell_type": "markdown",
|
624 |
| - "id": "d1a8dd26-1a99-41bb-8e22-9da9435834ed", |
| 546 | + "id": "67dbd985-f7ae-4f33-a450-64f4ce5b2c4a", |
625 | 547 | "metadata": {},
|
626 | 548 | "source": [
|
627 |
| - "You can check the Docker logs to see when the TensorRT-LLM server is ready to serve requests." |
| 549 | + "**Install Model Opt prerequisites**" |
628 | 550 | ]
|
629 | 551 | },
|
630 | 552 | {
|
631 | 553 | "cell_type": "code",
|
632 | 554 | "execution_count": null,
|
633 |
| - "id": "d495aeec-7281-4d07-8cc0-38383b124da4", |
| 555 | + "id": "44be970c-3567-4108-8009-25b6312c1eb8", |
| 556 | + "metadata": {}, |
| 557 | + "outputs": [], |
| 558 | + "source": [ |
| 559 | + "cd TensorRT-Model-Optimizer/\n", |
| 560 | + "pip install -e ." |
| 561 | + ] |
| 562 | + }, |
| 563 | + { |
| 564 | + "cell_type": "markdown", |
| 565 | + "id": "6962b7ef-d8fc-4f71-a841-82045774df2d", |
| 566 | + "metadata": {}, |
| 567 | + "source": [ |
| 568 | + "**Run HuggingFace checkpoint conversion script**" |
| 569 | + ] |
| 570 | + }, |
| 571 | + { |
| 572 | + "cell_type": "code", |
| 573 | + "execution_count": null, |
| 574 | + "id": "cff980a0-b6e6-439d-9ee6-633e9978f97d", |
634 | 575 | "metadata": {
|
635 | 576 | "scrolled": true
|
636 | 577 | },
|
637 | 578 | "outputs": [],
|
638 | 579 | "source": [
|
639 |
| - "!docker logs tensorrt_llm" |
| 580 | + "# set export path for converted checkpoints. The script saves the converted checkpoint in ${ROOT_SAVE_PATH}/saved_models_${MODEL_FULL_NAME}\n", |
| 581 | + "export ROOT_SAVE_PATH=/app/tensorrt_llm\n", |
| 582 | + "\n", |
| 583 | + "# run conversion script\n", |
| 584 | + "cd ..\n", |
| 585 | + "bash TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh --model $(pwd)/qat/checkpoint-45/ --quant nvfp4 --export_fmt hf" |
640 | 586 | ]
|
641 | 587 | },
|
642 | 588 | {
|
643 | 589 | "cell_type": "markdown",
|
644 |
| - "id": "b1cfeb9a-0a60-47b0-a37f-2861ac6b0a2e", |
| 590 | + "id": "156b1088-7cc6-4fcc-9e4d-0f6dd464bb20", |
645 | 591 | "metadata": {},
|
646 | 592 | "source": [
|
647 |
| - "You can also check the TensorRT-LLM endpoint health to see if it is ready. It should return a **\"200\"** status if it is ready." |
| 593 | + "## Serving the Model" |
| 594 | + ] |
| 595 | + }, |
| 596 | + { |
| 597 | + "cell_type": "markdown", |
| 598 | + "id": "01b5b230-47af-49bb-8227-d8c5d676e685", |
| 599 | + "metadata": {}, |
| 600 | + "source": [ |
| 601 | + "Run the following trtllm-serve command to launch the inference server" |
648 | 602 | ]
|
649 | 603 | },
|
650 | 604 | {
|
651 | 605 | "cell_type": "code",
|
652 |
| - "execution_count": 63, |
653 |
| - "id": "03d9cd22-20ab-415d-b6bc-f19470d9860c", |
| 606 | + "execution_count": null, |
| 607 | + "id": "3994c997-e1a8-472e-ada7-96cf97156dd0", |
654 | 608 | "metadata": {},
|
655 |
| - "outputs": [ |
656 |
| - { |
657 |
| - "name": "stdout", |
658 |
| - "output_type": "stream", |
659 |
| - "text": [ |
660 |
| - "Status: 200\n" |
661 |
| - ] |
662 |
| - } |
663 |
| - ], |
| 609 | + "outputs": [], |
664 | 610 | "source": [
|
665 |
| - "!curl -s -o /dev/null -w \"Status: %{http_code}\\n\" \"http://localhost:8000/health\"" |
| 611 | + "trtllm-serve /app/tensorrt_llm/saved_models_checkpoint-45_dense_nvfp4_tp1_pp1 \\\n", |
| 612 | + " --max_batch_size 1 --max_num_tokens 1024 \\\n", |
| 613 | + " --max_seq_len 4096 --tp_size 8 --pp_size 1 \\\n", |
| 614 | + " --host 0.0.0.0 --port 8000 \\\n", |
| 615 | + " --kv_cache_free_gpu_memory_fraction 0.95" |
666 | 616 | ]
|
667 | 617 | },
|
668 | 618 | {
|
|
678 | 628 | "id": "8fba7c89-1ce0-4f7c-822e-b72786a25199",
|
679 | 629 | "metadata": {},
|
680 | 630 | "source": [
|
681 |
| - "Run the below example curl command to send an inference request to the server." |
| 631 | + "In another terminal run the below example curl command to send an inference request to the server." |
682 | 632 | ]
|
683 | 633 | },
|
684 | 634 | {
|
|
705 | 655 | }
|
706 | 656 | ],
|
707 | 657 | "source": [
|
708 |
| - "%%sh\n", |
709 | 658 | "curl localhost:8000/v1/chat/completions -H \"Content-Type: application/json\" -d '{\n",
|
710 | 659 | " \"model\": \"meta-llama/Llama-3.1-8B-Instruct-QAT\",\n",
|
711 | 660 | " \"messages\": [\n",
|
|