unslothai · danielhanchen · Feb 25, 2026 · gemini-code-assist · Feb 25, 2026 · gemini-code-assist
diff --git a/nb/Gemma3N_(4B)-Conversational.ipynb b/nb/Gemma3N_(4B)-Conversational.ipynb
@@ -456,6 +456,7 @@
     "    dtype = None, # None for auto detection\n",
     "    max_seq_length = 1024, # Choose any for long context!\n",
     "    load_in_4bit = True,  # 4 bit quantization to reduce memory\n",
+    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "    attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
-    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "    attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
     "    full_finetuning = False, # [NEW!] We have full finetuning now!\n",
     "    # token = \"YOUR_HF_TOKEN\", # HF Token for gated models\n",
     ")"
@@ -1920,6 +1921,7 @@
     "        model_name = \"gemma_3n_lora\", # YOUR MODEL YOU USED FOR TRAINING\n",
     "        max_seq_length = 2048,\n",
     "        load_in_4bit = True,\n",
+    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "        attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
-    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "        attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
     "    )\n",
     "\n",
     "messages = [{\n",

diff --git a/nb/Gemma3N_(4B)-Vision.ipynb b/nb/Gemma3N_(4B)-Vision.ipynb
@@ -428,6 +428,7 @@
     "model, processor = FastVisionModel.from_pretrained(\n",
     "    \"unsloth/gemma-3n-E4B\",\n",
     "    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.\n",
+    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
     "    use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for long context\n",
     ")"
    ]
@@ -1424,6 +1425,7 @@
     "    model, processor = FastVisionModel.from_pretrained(\n",
     "        model_name = \"gemma_3n_lora\",  # YOUR MODEL YOU USED FOR TRAINING\n",
     "        load_in_4bit = True,  # Set to False for 16bit LoRA\n",
+    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
     "    )\n",
     "    FastVisionModel.for_inference(model)  # Enable for inference!\n",
     "\n",

diff --git a/nb/Kaggle-Gemma3N_(4B)-Conversational.ipynb b/nb/Kaggle-Gemma3N_(4B)-Conversational.ipynb
@@ -456,6 +456,7 @@
     "    dtype = None, # None for auto detection\n",
     "    max_seq_length = 1024, # Choose any for long context!\n",
     "    load_in_4bit = True,  # 4 bit quantization to reduce memory\n",
+    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "    attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
-    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "    attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
     "    full_finetuning = False, # [NEW!] We have full finetuning now!\n",
     "    # token = \"YOUR_HF_TOKEN\", # HF Token for gated models\n",
     ")"
@@ -1920,6 +1921,7 @@
     "        model_name = \"gemma_3n_lora\", # YOUR MODEL YOU USED FOR TRAINING\n",
     "        max_seq_length = 2048,\n",
     "        load_in_4bit = True,\n",
+    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "        attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
-    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+    "        attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
     "    )\n",
     "\n",
     "messages = [{\n",

diff --git a/nb/Kaggle-Gemma3N_(4B)-Vision.ipynb b/nb/Kaggle-Gemma3N_(4B)-Vision.ipynb
@@ -428,6 +428,7 @@
     "model, processor = FastVisionModel.from_pretrained(\n",
     "    \"unsloth/gemma-3n-E4B\",\n",
     "    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.\n",
+    "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
     "    use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for long context\n",
     ")"
    ]
@@ -1424,6 +1425,7 @@
     "    model, processor = FastVisionModel.from_pretrained(\n",
     "        model_name = \"gemma_3n_lora\",  # YOUR MODEL YOU USED FOR TRAINING\n",
     "        load_in_4bit = True,  # Set to False for 16bit LoRA\n",
+    "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
     "    )\n",
     "    FastVisionModel.for_inference(model)  # Enable for inference!\n",
     "\n",

diff --git a/original_template/Gemma3N_(4B)-Conversational.ipynb b/original_template/Gemma3N_(4B)-Conversational.ipynb
@@ -432,6 +432,7 @@
         "    dtype = None, # None for auto detection\n",
         "    max_seq_length = 1024, # Choose any for long context!\n",
         "    load_in_4bit = True,  # 4 bit quantization to reduce memory\n",
+        "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-        "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+        "    attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
-        "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+        "    attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
         "    full_finetuning = False, # [NEW!] We have full finetuning now!\n",
         "    # token = \"hf_...\", # use one if using gated models\n",
         ")"
@@ -1896,6 +1897,7 @@
         "        model_name = \"gemma-3n\", # YOUR MODEL YOU USED FOR TRAINING\n",
         "        max_seq_length = 2048,\n",
         "        load_in_4bit = True,\n",
+        "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-        "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+        "        attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
-        "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+        "        attn_implementation = \"eager\", # Force eager attention due to Gemma3N incompatibility with flex_attention\n",
         "    )\n",
         "\n",
         "messages = [{\n",

diff --git a/original_template/Gemma3N_(4B)-Vision.ipynb b/original_template/Gemma3N_(4B)-Vision.ipynb
@@ -404,6 +404,7 @@
         "model, processor = FastVisionModel.from_pretrained(\n",
         "    \"unsloth/gemma-3n-E4B\",\n",
         "    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.\n",
+        "    attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
         "    use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for long context\n",
         ")"
       ]
@@ -1400,6 +1401,7 @@
         "    model, processor = FastVisionModel.from_pretrained(\n",
         "        model_name=\"lora_model\",  # YOUR MODEL YOU USED FOR TRAINING\n",
         "        load_in_4bit=True,  # Set to False for 16bit LoRA\n",
+        "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-        "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+        "        attn_implementation=\"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
-        "        attn_implementation = \"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
+        "        attn_implementation=\"eager\", # Gemma 3N vision tower is incompatible with flex_attention\n",
         "    )\n",
         "    FastVisionModel.for_inference(model)  # Enable for inference!\n",
         "\n",