
Commit 0c214c1

vblagoje and anakin87 authored
feat: Update HuggingFaceLocalChatGenerator default model to Qwen/Qwen3-0.6B (#10176)
* Update HuggingFaceLocalChatGenerator default model to Qwen/Qwen3-0.6B
* Add enable_thinking init parameter
* Pydoc wording
* Format test
* Add tests for enable_thinking flag
* Add reno note for HuggingFaceLocalChatGenerator updates
* Update haystack/components/generators/chat/hugging_face_local.py

  Co-authored-by: Stefano Fiorucci <[email protected]>

* Update release notes for HuggingFaceLocalChatGenerator

  Updated the release notes to reflect changes in the HuggingFaceLocalChatGenerator, including the new default model and the addition of the enable_thinking parameter.

* Simplify test_live_run with/out enable_thinking flag
* Test shuffle

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
1 parent b4fd38d commit 0c214c1

File tree

3 files changed: +52 additions, -11 deletions

haystack/components/generators/chat/hugging_face_local.py

Lines changed: 11 additions & 3 deletions
@@ -95,7 +95,7 @@ class HuggingFaceLocalChatGenerator:
     Generates chat responses using models from Hugging Face that run locally.
 
     Use this component with chat-based models,
-    such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`.
+    such as `Qwen/Qwen3-0.6B` or `meta-llama/Llama-2-7b-chat-hf`.
     LLMs running locally may need powerful hardware.
 
     ### Usage example
@@ -104,7 +104,7 @@ class HuggingFaceLocalChatGenerator:
     from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
     from haystack.dataclasses import ChatMessage
 
-    generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta")
+    generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B")
     generator.warm_up()
     messages = [ChatMessage.from_user("What's Natural Language Processing? Be brief.")]
     print(generator.run(messages))
@@ -129,7 +129,7 @@ class HuggingFaceLocalChatGenerator:
 
     def __init__(  # pylint: disable=too-many-positional-arguments
         self,
-        model: str = "HuggingFaceH4/zephyr-7b-beta",
+        model: str = "Qwen/Qwen3-0.6B",
         task: Optional[Literal["text-generation", "text2text-generation"]] = None,
         device: Optional[ComponentDevice] = None,
         token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
@@ -141,6 +141,8 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         tools: Optional[ToolsType] = None,
         tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None,
         async_executor: Optional[ThreadPoolExecutor] = None,
+        *,
+        enable_thinking: bool = False,
     ) -> None:
         """
         Initializes the HuggingFaceLocalChatGenerator component.
@@ -186,6 +188,9 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         :param async_executor:
             Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be
             initialized and used
+        :param enable_thinking:
+            Whether to enable thinking mode in the chat template for thinking-capable models.
+            When enabled, the model generates intermediate reasoning before the final response. Defaults to False.
         """
         torch_and_transformers_import.check()
 
@@ -243,6 +248,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         self.streaming_callback = streaming_callback
         self.pipeline: Optional[HfPipeline] = None
         self.tools = tools
+        self.enable_thinking = enable_thinking
 
         self._owns_executor = async_executor is None
         self.executor = (
@@ -308,6 +314,7 @@ def to_dict(self) -> dict[str, Any]:
             chat_template=self.chat_template,
             tools=serialize_tools_or_toolset(self.tools),
             tool_parsing_function=serialize_callable(self.tool_parsing_function),
+            enable_thinking=self.enable_thinking,
         )
 
         huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"]
@@ -600,6 +607,7 @@ def _prepare_inputs(
                 chat_template=self.chat_template,
                 add_generation_prompt=True,
                 tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None,
+                enable_thinking=self.enable_thinking,
             )
             # prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating
             assert isinstance(prepared_prompt, str)
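For orientation, the last hunk above threads the new flag into the chat-template call inside `_prepare_inputs`. A rough, standalone illustration of what that forwarding amounts to, using `transformers` directly rather than the component's internals (the prompt text here is arbitrary):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
messages = [{"role": "user", "content": "What's Natural Language Processing? Be brief."}]

# Thinking-capable chat templates such as Qwen3's accept an enable_thinking kwarg;
# the component simply passes self.enable_thinking through to this call.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,  # the component wants a string prompt, not token ids
    add_generation_prompt=True,
    enable_thinking=True,
)
print(prompt)
```

With `enable_thinking=False`, Qwen3's template instead primes the assistant turn with an empty think block, so the model skips the reasoning phase.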
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+---
+upgrade:
+  - |
+    ``HuggingFaceLocalChatGenerator`` now uses ``Qwen/Qwen3-0.6B`` as the default model, replacing the previous default.
+enhancements:
+  - |
+    A new ``enable_thinking`` parameter has been added to enable thinking mode in chat templates for thinking-capable models,
+    allowing them to generate intermediate reasoning steps before producing final responses.
+
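Taken together with the updated docstring example, the release note describes usage along these lines (a minimal sketch; the `max_new_tokens` value is an arbitrary choice that mirrors the new integration test below):

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

# Qwen/Qwen3-0.6B is now the default, so the model argument could be omitted.
generator = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",
    generation_kwargs={"max_new_tokens": 450},
    enable_thinking=True,  # keyword-only flag introduced by this commit
)
generator.warm_up()

messages = [ChatMessage.from_user("What is 2+2?")]
reply = generator.run(messages)["replies"][0]
# With thinking enabled, reply.text contains a <think>...</think> block
# followed by the final answer.
print(reply.text)
```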
test/components/generators/chat/test_hugging_face_local.py

Lines changed: 32 additions & 8 deletions
@@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock):
         )
 
         assert generator.huggingface_pipeline_kwargs == {
-            "model": "HuggingFaceH4/zephyr-7b-beta",
+            "model": "Qwen/Qwen3-0.6B",
             "task": "text2text-generation",
             "token": None,
             "device": "cpu",
@@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock):
         )
 
         assert generator.huggingface_pipeline_kwargs == {
-            "model": "HuggingFaceH4/zephyr-7b-beta",
+            "model": "Qwen/Qwen3-0.6B",
             "task": "text2text-generation",
             "token": None,
             "device": "cpu",
@@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools):
             streaming_callback=None,
             chat_template="irrelevant",
             tools=tools,
+            enable_thinking=True,
         )
 
         # Call the to_dict method
@@ -191,6 +192,7 @@ def test_to_dict(self, model_info_mock, tools):
         assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
         assert init_params["streaming_callback"] is None
         assert init_params["chat_template"] == "irrelevant"
+        assert init_params["enable_thinking"] is True
         assert init_params["tools"] == [
             {
                 "type": "haystack.tools.tool.Tool",
@@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools):
             streaming_callback=None,
             chat_template="irrelevant",
             tools=tools,
+            enable_thinking=True,
         )
         # Call the to_dict method
         result = generator.to_dict()
@@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools):
         assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
         assert generator_2.streaming_callback is None
         assert generator_2.chat_template == "irrelevant"
+        assert generator_2.enable_thinking is True
         assert len(generator_2.tools) == 1
         assert generator_2.tools[0].name == "weather"
         assert generator_2.tools[0].description == "useful to determine the weather in a given location"
@@ -487,18 +491,40 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
     @pytest.mark.slow
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
     def test_live_run(self, monkeypatch):
+        """Test live run with default behavior (no thinking)."""
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
+        llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
+
+        result = llm.run(messages)
+
+        assert "replies" in result
+        assert isinstance(result["replies"][0], ChatMessage)
+        assert "climate change" in result["replies"][0].text.lower()
+
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_thinking(self, monkeypatch):
+        """Test live run with enable_thinking=True."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        messages = [ChatMessage.from_user("What is 2+2?")]
+
         llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50}
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
         )
 
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
-        assert "climate change" in result["replies"][0].text.lower()
+        reply_text = result["replies"][0].text
+        assert reply_text is not None
+        assert "<think>" in reply_text
+        assert "</think>" in reply_text
+        assert len(reply_text) > 0
+        assert "4" in reply_text.lower()
 
     def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
         duplicate_tools = [tools[0], tools[0]]
@@ -512,7 +538,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools):
         )
 
     def test_run_with_tools(self, model_info_mock, tools):
-        generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools)
+        generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools)
 
         # Mock pipeline and tokenizer
         mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}])
@@ -800,9 +826,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None:
             streaming_chunks.append(chunk)
 
         llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen2.5-0.5B-Instruct",
-            generation_kwargs={"max_new_tokens": 50},
-            streaming_callback=streaming_callback,
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback
         )
 
         response = await llm.run_async(

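The new `test_live_run_thinking` above only checks that `<think>` tags are present in the reply; it does not separate the reasoning from the answer. A purely illustrative helper for callers who want to do that (not part of this commit; assumes a single leading `<think>...</think>` block, which is what Qwen3-style templates produce):

```python
import re


def split_thinking(reply_text: str) -> tuple[str, str]:
    """Split a thinking-style reply into (reasoning, answer)."""
    match = re.search(r"<think>(.*?)</think>", reply_text, flags=re.DOTALL)
    if match is None:
        # No think block: the whole reply is the answer.
        return "", reply_text.strip()
    reasoning = match.group(1).strip()
    answer = reply_text[match.end():].strip()
    return reasoning, answer


# Example with a reply produced by the generator:
# reasoning, answer = split_thinking(result["replies"][0].text)
```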