
Commit 3a5fc48

Authored by rkazants, with mvafin, github-actions[bot], and nikita-savelyevv
[OpenVINO] Support microsoft bitnet-b1.58-2B-4T (#1518)
* Support bitnet models
* Apply style fixes
* Fix conversion
* Update optimum/exporters/openvino/model_configs.py
* Fix patcher name
* Add test
* Fix style
* Apply style fixes
* Return test after merge
* Apply suggestions from code review
* Move model
* Fix style
* Update optimum/exporters/openvino/model_configs.py
* Apply formatting
* Add tests
* Temporarily added to test on openvino-nightly
* Apply formatting
* Update tests/openvino/test_export.py
* Adjust tests
* Apply formatting
* Update .github/workflows/test_openvino.yml
* Add smart patching for bitnet tests
* Apply suggestions from code review
* Update test_openvino.yml
* Update .github/workflows/test_openvino.yml

Signed-off-by: Kazantsev, Roman <[email protected]>
Co-authored-by: Maxim Vafin <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Nikita Savelyev <[email protected]>
1 parent 21a33fd commit 3a5fc48

File tree: 6 files changed, +62 −0 lines changed

docs/source/openvino/models.mdx

Lines changed: 1 addition & 0 deletions
```diff
@@ -25,6 +25,7 @@ Here is the list of the supported architectures :
 - Beit
 - Bert
 - BioGPT
+- Bitnet
 - BlenderBot
 - BlenderBotSmall
 - Bloom
```
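With Bitnet on the supported-architectures list, the model can be exported through the usual optimum-intel entry points. A minimal usage sketch, assuming optimum-intel with OpenVINO >= 2025.4.0 and transformers >= 4.52.1 (the version gates introduced below); the prompt text is illustrative:

```python
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "microsoft/bitnet-b1.58-2B-4T"  # the checkpoint named in the commit title

# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly
model = OVModelForCausalLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("What is 1.58-bit quantization?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```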

optimum/exporters/openvino/__main__.py

Lines changed: 21 additions & 0 deletions
```diff
@@ -273,8 +273,11 @@ def main_export(
         supported_quant_methods = ["gptq"]
         if is_openvino_version(">=", "2024.6.0"):
             supported_quant_methods.append("awq")
+        if is_openvino_version(">=", "2025.4.0"):
+            supported_quant_methods.append("bitnet")
         do_quant_patching = quant_method in supported_quant_methods
         do_gptq_patching = quant_method == "gptq"
+        do_bitnet_patching = quant_method == "bitnet"

         model_type = config.model_type
         if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
@@ -365,6 +368,22 @@ class StoreAttr(object):
                 return model

             GPTQQuantizer.post_init_model = post_init_model
+        if do_bitnet_patching:
+            from transformers.integrations.bitnet import AutoBitLinear
+
+            orig_load_hook = AutoBitLinear.load_hook
+
+            # rewrite load hook to save original weight
+            def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs):
+                if (prefix + "weight") in state_dict and state_dict[prefix + "weight"].dtype != self.weight.dtype:
+                    self.original_weight = state_dict[prefix + "weight"]
+                    w_shape = self.original_weight.shape
+                    state_dict[prefix + "weight"] = torch.empty(
+                        (w_shape[0] * 4, w_shape[1]), dtype=self.weight.dtype, device="meta"
+                    )
+                return state_dict
+
+            AutoBitLinear.load_hook = bitnet_load_hook
     elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
         _loading_kwargs = {} if variant is None else {"variant": variant}
         if dtype == "auto" or dtype is None:
@@ -557,6 +576,8 @@ class StoreAttr(object):
         torch.cuda.is_available = orig_cuda_check
         if do_gptq_patching:
             GPTQQuantizer.post_init_model = orig_post_init_model
+        if do_bitnet_patching:
+            AutoBitLinear.load_hook = orig_load_hook


 def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
```
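The change follows a save/patch/restore discipline: keep a reference to the original `AutoBitLinear.load_hook`, install a replacement that stashes the packed checkpoint weights as `original_weight` (the `w_shape[0] * 4` unpacked shape reflects that the checkpoint packs four low-bit values per byte along the first dimension) and substitutes a shape-only meta tensor, then put the original hook back once the export finishes. A minimal, self-contained sketch of that pattern; `FakeLinear` and its hook are hypothetical stand-ins for `AutoBitLinear`, not the transformers API:

```python
# FakeLinear stands in for transformers.integrations.bitnet.AutoBitLinear.
class FakeLinear:
    def load_hook(self, state_dict, prefix, *args, **kwargs):
        return state_dict


orig_load_hook = FakeLinear.load_hook  # 1. save the original


def patched_load_hook(self, state_dict, prefix, *args, **kwargs):
    # 2. intercept the state dict before weights reach the module
    print(f"intercepted prefix {prefix!r}")
    return orig_load_hook(self, state_dict, prefix, *args, **kwargs)


FakeLinear.load_hook = patched_load_hook  # 3. install the patch
try:
    FakeLinear().load_hook({}, "model.layers.0.")
finally:
    FakeLinear.load_hook = orig_load_hook  # 4. always restore
```

The `try`/`finally` plays the same role here as the cleanup block at the end of `main_export`: the original hook is restored even if the export raises.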

optimum/exporters/openvino/model_configs.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -576,6 +576,19 @@ class GptOssOpenVINOConfig(LlamaOpenVINOConfig):
     MIN_TRANSFORMERS_VERSION = "4.55.1"


+@register_in_tasks_manager(
+    "bitnet",
+    *[
+        "text-generation",
+        "text-generation-with-past",
+    ],
+    library_name="transformers",
+)
+class BitnetOpenVINOConfig(LlamaOnnxConfig):
+    MIN_TRANSFORMERS_VERSION = "4.52.1"
+    _MODEL_PATCHER = OVDecoderModelPatcher
+
+
 @register_in_tasks_manager(
     "exaone",
     *[
```
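`register_in_tasks_manager` associates a `(model_type, task)` pair with an export config class at import time; because BitNet shares Llama's decoder layout, the new config simply subclasses `LlamaOnnxConfig`. A minimal sketch of the decorator-registration pattern under hypothetical names (`register`, `_EXPORT_CONFIGS`), not optimum's actual internals:

```python
# Hypothetical registry mapping (model_type, task) -> export config class.
_EXPORT_CONFIGS: dict = {}


def register(model_type: str, *tasks: str):
    def wrapper(config_cls: type) -> type:
        for task in tasks:
            # one entry per (model_type, task) pair, as the decorator above does
            _EXPORT_CONFIGS[(model_type, task)] = config_cls
        return config_cls

    return wrapper


@register("bitnet", "text-generation", "text-generation-with-past")
class BitnetConfigSketch:
    MIN_TRANSFORMERS_VERSION = "4.52.1"


print(_EXPORT_CONFIGS[("bitnet", "text-generation")].__name__)  # BitnetConfigSketch
```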

tests/openvino/test_decoder.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -120,6 +120,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
     if is_transformers_version(">=", "4.53.0"):
         SUPPORTED_ARCHITECTURES += ("arcee",)

+    if is_transformers_version(">=", "4.52.1") and is_openvino_version(">=", "2025.4.0"):
+        SUPPORTED_ARCHITECTURES += ("bitnet",)
+
     if is_transformers_version(">=", "4.54.0"):
         # remote code models differs after transformers v4.54
         SUPPORTED_ARCHITECTURES = tuple(set(SUPPORTED_ARCHITECTURES) - {"minicpm", "minicpm3", "arctic", "deepseek"})
@@ -218,11 +221,21 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "gpt_oss": 2 if is_openvino_version(">=", "2025.4") else 0,
         "gpt_oss_mxfp4": 2 if is_openvino_version(">=", "2025.4") else 0,
         "zamba2": 1,
+        "bitnet": 6,
     }

+    def mock_torch_compile(self, model_arch):
+        if model_arch == "bitnet":
+            # mock torch.compile to avoid compilation errors in tests
+            original_torch_compile = torch.compile
+            torch.compile = lambda func: func
+            # ensure restoration happens even if test fails
+            self.addCleanup(lambda: setattr(torch, "compile", original_torch_compile))
+
     # TODO: remove gptq/awq from here
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
+        self.mock_torch_compile(model_arch)
         model_id = MODEL_NAMES[model_arch]

         not_stateful = []
@@ -377,6 +390,7 @@ def test_compare_to_transformers(self, model_arch):
     @pytest.mark.run_slow
     @slow
     def test_pipeline(self, model_arch):
+        self.mock_torch_compile(model_arch)
         set_seed(SEED)
         model_kwargs = {}
         model_id = MODEL_NAMES[model_arch]
@@ -562,6 +576,7 @@ def test_default_filling_attention_mask_and_position_ids(self):
     @pytest.mark.run_slow
     @slow
     def test_beam_search(self, model_arch):
+        self.mock_torch_compile(model_arch)
         model_kwargs = {}
         model_id = MODEL_NAMES[model_arch]
         if model_arch in self.REMOTE_CODE_MODELS:
```
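`mock_torch_compile` relies on `unittest.TestCase.addCleanup`, which runs its callbacks even when the test body raises, so the global patch to `torch.compile` cannot leak across tests. A runnable sketch of that restore pattern, assuming `torch` is installed as it is in this suite:

```python
import unittest


class PatchRestoreExample(unittest.TestCase):
    def test_patch_is_restored(self):
        import torch

        original_torch_compile = torch.compile
        torch.compile = lambda func: func  # no-op replacement for the test
        # addCleanup fires even if the assertion below fails,
        # so the no-op cannot leak into other tests
        self.addCleanup(lambda: setattr(torch, "compile", original_torch_compile))
        self.assertIs(torch.compile(len), len)


if __name__ == "__main__":
    unittest.main()
```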

tests/openvino/test_exporters_cli.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -111,6 +111,14 @@ class OVCLIExportTestCase(unittest.TestCase):
                 ("text-generation-with-past", "zamba2"),
             ]
         )
+
+    if is_transformers_version(">=", "4.52.1") and is_openvino_version(">=", "2025.4.0"):
+        SUPPORTED_ARCHITECTURES.extend(
+            [
+                ("text-generation-with-past", "bitnet"),
+            ]
+        )
+
     EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
         "gpt2": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "t5": 0 if is_openvino_version("<", "2025.1") else 2,  # 2025.1 brings support for unigram tokenizers
@@ -136,6 +144,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         "falcon-mamba": 2,
         "qwen3": 2,
         "zamba2": 2,
+        "bitnet": 2,
     }

     TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS = {
@@ -966,6 +975,8 @@ def test_exporters_cli_fp16(self, task: str, model_type: str):

     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_exporters_cli_int8(self, task: str, model_type: str):
+        if model_type in ["bitnet"]:
+            self.skipTest("CVS-176501 INT8 compression fails for BitNet; need to compress remaining BF16 weights")
         with TemporaryDirectory() as tmpdir:
             add_ops = ""
             if task == "text-to-audio" and model_type == "speecht5":
```
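The `self.skipTest(...)` call marks the parameterized case as skipped rather than passed, keeping the known INT8 limitation (tracked as CVS-176501) visible in test reports. A minimal sketch of the same in-body skip pattern, with a hypothetical hard-coded `model_type` in place of the real parameterization:

```python
import unittest


class SkipExample(unittest.TestCase):
    def test_int8_compression(self):
        model_type = "bitnet"  # hypothetical fixed value; the real test is parameterized
        if model_type in ["bitnet"]:
            self.skipTest("INT8 compression not yet supported for this architecture")
        # the real test body would run the CLI export and check the IR here


if __name__ == "__main__":
    unittest.main()  # reports the test as skipped, not passed
```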

tests/openvino/utils_tests.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -48,6 +48,7 @@
     "baichuan2-13b": "optimum-intel-internal-testing/tiny-random-baichuan2-13b",
     "bigbird_pegasus": "optimum-intel-internal-testing/tiny-random-bigbird_pegasus",
     "biogpt": "optimum-intel-internal-testing/tiny-random-BioGptForCausalLM",
+    "bitnet": "optimum-intel-internal-testing/tiny-random-bitnet",
     "blenderbot-small": "optimum-intel-internal-testing/tiny-random-BlenderbotModel",
     "blenderbot": "optimum-intel-internal-testing/tiny-random-BlenderbotModel",
     "bloom": "optimum-intel-internal-testing/tiny-random-BloomModel",
```
