diff --git a/haystack/components/generators/chat/hugging_face_local.py b/haystack/components/generators/chat/hugging_face_local.py
index d1661245b8..03e66d0abd 100644
--- a/haystack/components/generators/chat/hugging_face_local.py
+++ b/haystack/components/generators/chat/hugging_face_local.py
@@ -95,7 +95,7 @@ class HuggingFaceLocalChatGenerator:
     Generates chat responses using models from Hugging Face that run locally.
 
     Use this component with chat-based models,
-    such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`.
+    such as `Qwen/Qwen3-0.6B` or `meta-llama/Llama-2-7b-chat-hf`.
     LLMs running locally may need powerful hardware.
 
     ### Usage example
@@ -104,7 +104,7 @@ class HuggingFaceLocalChatGenerator:
     from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
     from haystack.dataclasses import ChatMessage
 
-    generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta")
+    generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B")
     generator.warm_up()
     messages = [ChatMessage.from_user("What's Natural Language Processing? Be brief.")]
     print(generator.run(messages))
@@ -129,7 +129,7 @@ class HuggingFaceLocalChatGenerator:
     def __init__(  # pylint: disable=too-many-positional-arguments
         self,
-        model: str = "HuggingFaceH4/zephyr-7b-beta",
+        model: str = "Qwen/Qwen3-0.6B",
         task: Optional[Literal["text-generation", "text2text-generation"]] = None,
         device: Optional[ComponentDevice] = None,
         token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
@@ -141,6 +141,8 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         tools: Optional[ToolsType] = None,
         tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None,
         async_executor: Optional[ThreadPoolExecutor] = None,
+        *,
+        enable_thinking: bool = False,
     ) -> None:
         """
         Initializes the HuggingFaceLocalChatGenerator component.
@@ -186,6 +188,9 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         :param async_executor:
             Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor
             will be initialized and used
+        :param enable_thinking:
+            Whether to enable thinking mode in the chat template for thinking-capable models.
+            When enabled, the model generates intermediate reasoning before the final response. Defaults to False.
""" torch_and_transformers_import.check() @@ -243,6 +248,7 @@ def __init__( # pylint: disable=too-many-positional-arguments self.streaming_callback = streaming_callback self.pipeline: Optional[HfPipeline] = None self.tools = tools + self.enable_thinking = enable_thinking self._owns_executor = async_executor is None self.executor = ( @@ -308,6 +314,7 @@ def to_dict(self) -> dict[str, Any]: chat_template=self.chat_template, tools=serialize_tools_or_toolset(self.tools), tool_parsing_function=serialize_callable(self.tool_parsing_function), + enable_thinking=self.enable_thinking, ) huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"] @@ -600,6 +607,7 @@ def _prepare_inputs( chat_template=self.chat_template, add_generation_prompt=True, tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None, + enable_thinking=self.enable_thinking, ) # prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating assert isinstance(prepared_prompt, str) diff --git a/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml new file mode 100644 index 0000000000..a408a85b94 --- /dev/null +++ b/releasenotes/notes/huggingface-local-qwen3-thinking-2a6e0f07d7da54e8.yaml @@ -0,0 +1,9 @@ +--- +upgrade: + - | + ``HuggingFaceLocalChatGenerator`` now uses ``Qwen/Qwen3-0.6B`` as the default model, replacing the previous default. +enhancements: + - | + A new ``enable_thinking`` parameter has been added to enable thinking mode in chat templates for thinking-capable models, + allowing them to generate intermediate reasoning steps before producing final responses. + diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py index 650af4097a..0fe229591f 100644 --- a/test/components/generators/chat/test_hugging_face_local.py +++ b/test/components/generators/chat/test_hugging_face_local.py @@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock): ) assert generator.huggingface_pipeline_kwargs == { - "model": "HuggingFaceH4/zephyr-7b-beta", + "model": "Qwen/Qwen3-0.6B", "task": "text2text-generation", "token": None, "device": "cpu", @@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock): ) assert generator.huggingface_pipeline_kwargs == { - "model": "HuggingFaceH4/zephyr-7b-beta", + "model": "Qwen/Qwen3-0.6B", "task": "text2text-generation", "token": None, "device": "cpu", @@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools): streaming_callback=None, chat_template="irrelevant", tools=tools, + enable_thinking=True, ) # Call the to_dict method @@ -191,6 +192,7 @@ def test_to_dict(self, model_info_mock, tools): assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]} assert init_params["streaming_callback"] is None assert init_params["chat_template"] == "irrelevant" + assert init_params["enable_thinking"] is True assert init_params["tools"] == [ { "type": "haystack.tools.tool.Tool", @@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools): streaming_callback=None, chat_template="irrelevant", tools=tools, + enable_thinking=True, ) # Call the to_dict method result = generator.to_dict() @@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools): assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": 
["stop", "words"]} assert generator_2.streaming_callback is None assert generator_2.chat_template == "irrelevant" + assert generator_2.enable_thinking is True assert len(generator_2.tools) == 1 assert generator_2.tools[0].name == "weather" assert generator_2.tools[0].description == "useful to determine the weather in a given location" @@ -487,18 +491,40 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock): @pytest.mark.slow @pytest.mark.flaky(reruns=3, reruns_delay=10) def test_live_run(self, monkeypatch): + """Test live run with default behavior (no thinking).""" monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")] + llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}) + + result = llm.run(messages) + + assert "replies" in result + assert isinstance(result["replies"][0], ChatMessage) + assert "climate change" in result["replies"][0].text.lower() + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.flaky(reruns=3, reruns_delay=10) + def test_live_run_thinking(self, monkeypatch): + """Test live run with enable_thinking=True.""" + monkeypatch.delenv("HF_API_TOKEN", raising=False) + messages = [ChatMessage.from_user("What is 2+2?")] + llm = HuggingFaceLocalChatGenerator( - model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50} + model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True ) result = llm.run(messages) assert "replies" in result assert isinstance(result["replies"][0], ChatMessage) - assert "climate change" in result["replies"][0].text.lower() + reply_text = result["replies"][0].text + assert reply_text is not None + assert "" in reply_text + assert "" in reply_text + assert len(reply_text) > 0 + assert "4" in reply_text.lower() def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools): duplicate_tools = [tools[0], tools[0]] @@ -512,7 +538,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools): ) def test_run_with_tools(self, model_info_mock, tools): - generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools) + generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools) # Mock pipeline and tokenizer mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}]) @@ -800,9 +826,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None: streaming_chunks.append(chunk) llm = HuggingFaceLocalChatGenerator( - model="Qwen/Qwen2.5-0.5B-Instruct", - generation_kwargs={"max_new_tokens": 50}, - streaming_callback=streaming_callback, + model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback ) response = await llm.run_async(