|
82 | 82 | "execution_count": 2, |
83 | 83 | "id": "ee1ff192-1b1e-4cec-ab83-119faf494c0c", |
84 | 84 | "metadata": {}, |
85 | | - "outputs": [ |
86 | | - { |
87 | | - "name": "stdout", |
88 | | - "output_type": "stream", |
89 | | - "text": [ |
90 | | - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg\n" |
91 | | - ] |
92 | | - } |
93 | | - ], |
| 85 | + "outputs": [], |
94 | 86 | "source": [ |
95 | 87 | "import transformers\n", |
96 | 88 | "from transformers import AutoProcessor\n", |
|
113 | 105 | "\n", |
114 | 106 | "# Prepare inputs\n", |
115 | 107 | "prompt = processor.apply_chat_template(messages, add_generation_prompt=True)\n", |
116 | | - "inputs = processor(text=prompt, images=[load_image(img_url)], return_tensors=\"pt\")\n", |
117 | | - "\n", |
118 | | - "print(img_url)" |
| 108 | + "inputs = processor(text=prompt, images=[load_image(img_url)], return_tensors=\"pt\")" |
119 | 109 | ] |
120 | 110 | }, |
121 | 111 | { |
|
176 | 166 | "source": [ |
177 | 167 | "## Step 4: Configure and Apply Quantization\n", |
178 | 168 | "\n", |
179 | | - "Now we'll configure the quantization settings and apply them to create an INT8 version of our model. We'll use weight-only quantization for size reduction with minimal accuracy loss. You can explore other quantization options [here](https://huggingface.co/docs/optimum/en/intel/openvino/optimization).\n" |
| 169 | + "Now we'll configure the quantization settings and apply them to create a quantized version of our model. You can explore other quantization options [here](https://huggingface.co/docs/optimum/en/intel/openvino/optimization) and by playing with the different quantization configurations defined below.\n" |
180 | 170 | ] |
181 | 171 | }, |
182 | 172 | { |
|
188 | 178 | } |
189 | 179 | }, |
190 | 180 | "source": [ |
191 | | - "### Step 4a: Configure Quantization Settings\n" |
| 181 | + "### Step 4a: Configure Quantization Settings\n", |
| 182 | + "\n", |
| 183 | + "To apply quantization on your model you need to create a quantization configuration specifying the methodology to use. By default 8bit weight-only quantization will be applied on the text and vision embeddings components, while the language model will be quantized depending on the specified quantization configuration `quantization_config`. A specific quantization configuration can be defined for each components as well, this can be done by creating an instance of `OVPipelineQuantizationConfig`." |
192 | 184 | ] |
193 | 185 | }, |
194 | 186 | { |
195 | 187 | "cell_type": "code", |
196 | 188 | "execution_count": 4, |
197 | 189 | "id": "7ccb1914-d64e-4daf-b274-b8979e427a83", |
198 | 190 | "metadata": {}, |
199 | | - "outputs": [ |
200 | | - { |
201 | | - "name": "stderr", |
202 | | - "output_type": "stream", |
203 | | - "text": [ |
204 | | - "The provided dataset won't have any effect on the resulting compressed model because no data-aware quantization algorithm is selected and compression ratio is 1.0.\n" |
205 | | - ] |
206 | | - } |
207 | | - ], |
| 191 | + "outputs": [], |
208 | 192 | "source": [ |
209 | 193 | "from optimum.intel import OVQuantizationConfig, OVWeightQuantizationConfig, OVPipelineQuantizationConfig\n", |
210 | 194 | "\n", |
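| | + "# calibration dataset name and sample count, used by the data-aware configurations below\n",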
211 | 195 | "dataset, num_samples = \"contextual\", 50\n", |
212 | 196 | "\n", |
213 | | - "# weight only data free\n", |
214 | | - "woq_data_free = OVWeightQuantizationConfig(bits=8)\n", |
| 197 | + "# weight-only 8bit\n", |
| 198 | + "woq_8bit = OVWeightQuantizationConfig(bits=8)\n", |
| 199 | + "\n", |
| 200 | + "# weight-only 4bit\n", |
| 201 | + "woq_4bit = OVWeightQuantizationConfig(bits=4, group_size=16)\n", |
215 | 202 | "\n", |
216 | 203 | "# static quantization\n", |
| 204 | + "static_8bit = OVQuantizationConfig(bits=8, dataset=dataset, num_samples=num_samples)\n", |
| 205 | + "\n", |
| 206 | + "# pipeline quantization: applying different quantization on each components\n", |
217 | 207 | "ppl_q = OVPipelineQuantizationConfig(\n", |
218 | 208 | " quantization_configs={\n", |
219 | 209 | " \"lm_model\": OVQuantizationConfig(bits=8),\n", |
|
234 | 224 | } |
235 | 225 | }, |
236 | 226 | "source": [ |
237 | | - "### Step 4b: Apply Quantization\n" |
| 227 | + "### Step 4b: Apply Quantization\n", |
| 228 | + "\n", |
| 229 | + "You can now apply quantization on your model, here we apply wieght-only quantization on our model defined in `woq_8bit`." |
238 | 230 | ] |
239 | 231 | }, |
240 | 232 | { |
|
266 | 258 | { |
267 | 259 | "data": { |
268 | 260 | "application/vnd.jupyter.widget-view+json": { |
269 | | - "model_id": "b0180c8b43204411bc4f84abf2c480c4", |
| 261 | + "model_id": "20ceaa0181cf4478a1b6cd99364f75b4", |
270 | 262 | "version_major": 2, |
271 | 263 | "version_minor": 0 |
272 | 264 | }, |
|
315 | 307 | { |
316 | 308 | "data": { |
317 | 309 | "application/vnd.jupyter.widget-view+json": { |
318 | | - "model_id": "af0186e8345a4881bf5d0665ee9a5070", |
| 310 | + "model_id": "c3b03024f93b4ad585993b5329928110", |
319 | 311 | "version_major": 2, |
320 | 312 | "version_minor": 0 |
321 | 313 | }, |
|
364 | 356 | { |
365 | 357 | "data": { |
366 | 358 | "application/vnd.jupyter.widget-view+json": { |
367 | | - "model_id": "4c8bf0061d904a78b13631c76caf0dda", |
| 359 | + "model_id": "93dcd9c2db0844eb88ba0178b4a897b2", |
368 | 360 | "version_major": 2, |
369 | 361 | "version_minor": 0 |
370 | 362 | }, |
|
400 | 392 | } |
401 | 393 | ], |
402 | 394 | "source": [ |
403 | | - "q_model = OVModelForVisualCausalLM.from_pretrained(model_id, quantization_config=woq_data_free)\n", |
| 395 | + "q_model = OVModelForVisualCausalLM.from_pretrained(model_id, quantization_config=woq_8bit)\n", |
404 | 396 | "int8_model_path = \"smolvlm_int8\"\n", |
405 | 397 | "q_model.save_pretrained(int8_model_path)" |
406 | 398 | ] |
|
493 | 485 | }, |
494 | 486 | { |
495 | 487 | "cell_type": "code", |
496 | | - "execution_count": null, |
| 488 | + "execution_count": 8, |
497 | 489 | "id": "3c862277", |
498 | 490 | "metadata": {}, |
499 | | - "outputs": [], |
| 491 | + "outputs": [ |
| 492 | + { |
| 493 | + "name": "stdout", |
| 494 | + "output_type": "stream", |
| 495 | + "text": [ |
| 496 | + "FP32 model size: 1028.25 MB\n", |
| 497 | + "INT8 model size: 260.94 MB\n", |
| 498 | + "INT8 size decrease: 3.94x\n" |
| 499 | + ] |
| 500 | + } |
| 501 | + ], |
500 | 502 | "source": [ |
501 | 503 | "fp32_model_size = get_model_size(fp32_model_path)\n", |
502 | 504 | "int8_model_size = get_model_size(int8_model_path)\n", |
|
528 | 530 | ], |
529 | 531 | "metadata": { |
530 | 532 | "kernelspec": { |
531 | | - "display_name": "openvino_env", |
| 533 | + "display_name": "Python 3 (ipykernel)", |
532 | 534 | "language": "python", |
533 | 535 | "name": "python3" |
534 | 536 | }, |
|
542 | 544 | "name": "python", |
543 | 545 | "nbconvert_exporter": "python", |
544 | 546 | "pygments_lexer": "ipython3", |
545 | | - "version": "3.12.7" |
| 547 | + "version": "3.9.18" |
546 | 548 | } |
547 | 549 | }, |
548 | 550 | "nbformat": 4, |
|