|
82 | 82 | "execution_count": 2, |
83 | 83 | "id": "ee1ff192-1b1e-4cec-ab83-119faf494c0c", |
84 | 84 | "metadata": {}, |
85 | | - "outputs": [ |
86 | | - { |
87 | | - "name": "stdout", |
88 | | - "output_type": "stream", |
89 | | - "text": [ |
90 | | - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg\n" |
91 | | - ] |
92 | | - } |
93 | | - ], |
| 85 | + "outputs": [], |
94 | 86 | "source": [ |
95 | 87 | "import transformers\n", |
96 | 88 | "from transformers import AutoProcessor\n", |
|
113 | 105 | "\n", |
114 | 106 | "# Prepare inputs\n", |
115 | 107 | "prompt = processor.apply_chat_template(messages, add_generation_prompt=True)\n", |
116 | | - "inputs = processor(text=prompt, images=[load_image(img_url)], return_tensors=\"pt\")\n", |
117 | | - "\n", |
118 | | - "print(img_url)" |
| 108 | + "inputs = processor(text=prompt, images=[load_image(img_url)], return_tensors=\"pt\")" |
119 | 109 | ] |
120 | 110 | }, |
121 | 111 | { |
|
176 | 166 | "source": [ |
177 | 167 | "## Step 4: Configure and Apply Quantization\n", |
178 | 168 | "\n", |
179 | | - "Now we'll configure the quantization settings and apply them to create an INT8 version of our model. We'll use weight-only quantization for size reduction with minimal accuracy loss. You can explore other quantization options [here](https://huggingface.co/docs/optimum/en/intel/openvino/optimization).\n" |
| 169 | + "Now we'll configure the quantization settings and apply them to create a quantized version of our model. You can explore other quantization options [here](https://huggingface.co/docs/optimum/en/intel/openvino/optimization) and by playing with the different quantization configurations defined below.\n" |
180 | 170 | ] |
181 | 171 | }, |
182 | 172 | { |
|
188 | 178 | } |
189 | 179 | }, |
190 | 180 | "source": [ |
191 | | - "### Step 4a: Configure Quantization Settings\n" |
| 181 | + "### Step 4a: Configure Quantization Settings\n", |
| 182 | + "\n", |
| 183 | + "To apply quantization on your model you need to create a quantization configuration specifying the methodology to use. By default 8bit weight-only quantization will be applied on the text and vision embeddings components, while the language model will be quantized depending on the specified quantization configuration `quantization_config`. A specific quantization configuration can be defined for each components as well, this can be done by creating an instance of `OVPipelineQuantizationConfig`." |
192 | 184 | ] |
193 | 185 | }, |
194 | 186 | { |
195 | 187 | "cell_type": "code", |
196 | 188 | "execution_count": 4, |
197 | 189 | "id": "7ccb1914-d64e-4daf-b274-b8979e427a83", |
198 | 190 | "metadata": {}, |
199 | | - "outputs": [ |
200 | | - { |
201 | | - "name": "stderr", |
202 | | - "output_type": "stream", |
203 | | - "text": [ |
204 | | - "The provided dataset won't have any effect on the resulting compressed model because no data-aware quantization algorithm is selected and compression ratio is 1.0.\n" |
205 | | - ] |
206 | | - } |
207 | | - ], |
| 191 | + "outputs": [], |
208 | 192 | "source": [ |
209 | 193 | "from optimum.intel import OVQuantizationConfig, OVWeightQuantizationConfig, OVPipelineQuantizationConfig\n", |
210 | 194 | "\n", |
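| | + "# calibration dataset name and sample count, used by the data-aware configurations below\n",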
211 | 195 | "dataset, num_samples = \"contextual\", 50\n", |
212 | 196 | "\n", |
213 | | - "# weight only data free\n", |
214 | | - "woq_data_free = OVWeightQuantizationConfig(bits=8)\n", |
| 197 | + "# weight-only 8bit\n", |
| 198 | + "woq_8bit = OVWeightQuantizationConfig(bits=8)\n", |
| 199 | + "\n", |
| 200 | + "# weight-only 4bit\n", |
| 201 | + "woq_4bit = OVWeightQuantizationConfig(bits=4, group_size=16)\n", |
215 | 202 | "\n", |
216 | 203 | "# static quantization\n", |
| 204 | + "static_8bit = OVQuantizationConfig(bits=8, dataset=dataset, num_samples=num_samples)\n", |
| 205 | + "\n", |
| 206 | + "# pipeline quantization: applying different quantization on each components\n", |
217 | 207 | "ppl_q = OVPipelineQuantizationConfig(\n", |
218 | 208 | " quantization_configs={\n", |
219 | 209 | " \"lm_model\": OVQuantizationConfig(bits=8),\n", |
|
234 | 224 | } |
235 | 225 | }, |
236 | 226 | "source": [ |
237 | | - "### Step 4b: Apply Quantization\n" |
| 227 | + "### Step 4b: Apply Quantization\n", |
| 228 | + "\n", |
| 229 | + "You can now apply quantization on your model, here we apply wieght-only quantization on our model defined in `woq_8bit`." |
238 | 230 | ] |
239 | 231 | }, |
240 | 232 | { |
|
266 | 258 | { |
267 | 259 | "data": { |
268 | 260 | "application/vnd.jupyter.widget-view+json": { |
269 | | - "model_id": "b0180c8b43204411bc4f84abf2c480c4", |
| 261 | + "model_id": "20ceaa0181cf4478a1b6cd99364f75b4", |
270 | 262 | "version_major": 2, |
271 | 263 | "version_minor": 0 |
272 | 264 | }, |
|
315 | 307 | { |
316 | 308 | "data": { |
317 | 309 | "application/vnd.jupyter.widget-view+json": { |
318 | | - "model_id": "af0186e8345a4881bf5d0665ee9a5070", |
| 310 | + "model_id": "c3b03024f93b4ad585993b5329928110", |
319 | 311 | "version_major": 2, |
320 | 312 | "version_minor": 0 |
321 | 313 | }, |
|
364 | 356 | { |
365 | 357 | "data": { |
366 | 358 | "application/vnd.jupyter.widget-view+json": { |
367 | | - "model_id": "4c8bf0061d904a78b13631c76caf0dda", |
| 359 | + "model_id": "93dcd9c2db0844eb88ba0178b4a897b2", |
368 | 360 | "version_major": 2, |
369 | 361 | "version_minor": 0 |
370 | 362 | }, |
|
400 | 392 | } |
401 | 393 | ], |
402 | 394 | "source": [ |
403 | | - "q_model = OVModelForVisualCausalLM.from_pretrained(model_id, quantization_config=woq_data_free)\n", |
| 395 | + "q_model = OVModelForVisualCausalLM.from_pretrained(model_id, quantization_config=woq_8bit)\n", |
404 | 396 | "int8_model_path = \"smolvlm_int8\"\n", |
405 | 397 | "q_model.save_pretrained(int8_model_path)" |
406 | 398 | ] |
|
493 | 485 | }, |
494 | 486 | { |
495 | 487 | "cell_type": "code", |
496 | | - "execution_count": null, |
| 488 | + "execution_count": 8, |
497 | 489 | "id": "3c862277", |
498 | 490 | "metadata": {}, |
499 | | - "outputs": [], |
| 491 | + "outputs": [ |
| 492 | + { |
| 493 | + "name": "stdout", |
| 494 | + "output_type": "stream", |
| 495 | + "text": [ |
| 496 | + "FP32 model size: 1028.25 MB\n", |
| 497 | + "INT8 model size: 260.94 MB\n", |
| 498 | + "INT8 size decrease: 3.94x\n" |
| 499 | + ] |
| 500 | + } |
| 501 | + ], |
500 | 502 | "source": [ |
501 | 503 | "fp32_model_size = get_model_size(fp32_model_path)\n", |
502 | 504 | "int8_model_size = get_model_size(int8_model_path)\n", |
|
528 | 530 | ], |
529 | 531 | "metadata": { |
530 | 532 | "kernelspec": { |
531 | | - "display_name": "openvino_env", |
| 533 | + "display_name": "Python 3 (ipykernel)", |
532 | 534 | "language": "python", |
533 | 535 | "name": "python3" |
534 | 536 | }, |
|
542 | 544 | "name": "python", |
543 | 545 | "nbconvert_exporter": "python", |
544 | 546 | "pygments_lexer": "ipython3", |
545 | | - "version": "3.12.7" |
| 547 | + "version": "3.9.18" |
546 | 548 | } |
547 | 549 | }, |
548 | 550 | "nbformat": 4, |
|