
Commit 0c214c1

vblagoje and anakin87 authored
feat: Update HuggingFaceLocalChatGenerator default model to Qwen/Qwen3-0.6B (#10176)
* Update HuggingFaceLocalChatGenerator default model to Qwen/Qwen3-0.6B
* Add enable_thinking init parameter
* Pydoc wording
* Format test
* Add tests for enable_thinking flag
* Add reno note for HuggingFaceLocalChatGenerator updates
* Update haystack/components/generators/chat/hugging_face_local.py

  Co-authored-by: Stefano Fiorucci <[email protected]>

* Update release notes for HuggingFaceLocalChatGenerator

  Updated the release notes to reflect changes in the HuggingFaceLocalChatGenerator, including the new default model and the addition of the enable_thinking parameter.

* Simplify test_live_run with/out enable_thinking flag
* Test shuffle

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
1 parent b4fd38d commit 0c214c1

File tree

3 files changed: +52 additions, -11 deletions

haystack/components/generators/chat/hugging_face_local.py

Lines changed: 11 additions & 3 deletions
@@ -95,7 +95,7 @@ class HuggingFaceLocalChatGenerator:
     Generates chat responses using models from Hugging Face that run locally.
 
     Use this component with chat-based models,
-    such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`.
+    such as `Qwen/Qwen3-0.6B` or `meta-llama/Llama-2-7b-chat-hf`.
     LLMs running locally may need powerful hardware.
 
     ### Usage example
@@ -104,7 +104,7 @@ class HuggingFaceLocalChatGenerator:
     from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
     from haystack.dataclasses import ChatMessage
 
-    generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta")
+    generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B")
     generator.warm_up()
     messages = [ChatMessage.from_user("What's Natural Language Processing? Be brief.")]
     print(generator.run(messages))
@@ -129,7 +129,7 @@ class HuggingFaceLocalChatGenerator:
 
     def __init__(  # pylint: disable=too-many-positional-arguments
         self,
-        model: str = "HuggingFaceH4/zephyr-7b-beta",
+        model: str = "Qwen/Qwen3-0.6B",
         task: Optional[Literal["text-generation", "text2text-generation"]] = None,
         device: Optional[ComponentDevice] = None,
         token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
@@ -141,6 +141,8 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         tools: Optional[ToolsType] = None,
         tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None,
         async_executor: Optional[ThreadPoolExecutor] = None,
+        *,
+        enable_thinking: bool = False,
     ) -> None:
         """
         Initializes the HuggingFaceLocalChatGenerator component.
@@ -186,6 +188,9 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         :param async_executor:
             Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be
             initialized and used
+        :param enable_thinking:
+            Whether to enable thinking mode in the chat template for thinking-capable models.
+            When enabled, the model generates intermediate reasoning before the final response. Defaults to False.
         """
         torch_and_transformers_import.check()
 
@@ -243,6 +248,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         self.streaming_callback = streaming_callback
         self.pipeline: Optional[HfPipeline] = None
         self.tools = tools
+        self.enable_thinking = enable_thinking
 
         self._owns_executor = async_executor is None
         self.executor = (
@@ -308,6 +314,7 @@ def to_dict(self) -> dict[str, Any]:
             chat_template=self.chat_template,
             tools=serialize_tools_or_toolset(self.tools),
             tool_parsing_function=serialize_callable(self.tool_parsing_function),
+            enable_thinking=self.enable_thinking,
         )
 
         huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"]
@@ -600,6 +607,7 @@ def _prepare_inputs(
                 chat_template=self.chat_template,
                 add_generation_prompt=True,
                 tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None,
+                enable_thinking=self.enable_thinking,
             )
             # prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating
             assert isinstance(prepared_prompt, str)
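For orientation, the last hunk above threads the new flag into the chat-template call inside `_prepare_inputs`. A rough, standalone illustration of what that forwarding amounts to, using `transformers` directly rather than the component's internals (the prompt text here is arbitrary):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
messages = [{"role": "user", "content": "What's Natural Language Processing? Be brief."}]

# Thinking-capable chat templates such as Qwen3's accept an enable_thinking kwarg;
# the component simply passes self.enable_thinking through to this call.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,  # the component wants a string prompt, not token ids
    add_generation_prompt=True,
    enable_thinking=True,
)
print(prompt)
```

With `enable_thinking=False`, Qwen3's template instead primes the assistant turn with an empty think block, so the model skips the reasoning phase.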
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+---
+upgrade:
+  - |
+    ``HuggingFaceLocalChatGenerator`` now uses ``Qwen/Qwen3-0.6B`` as the default model, replacing the previous default.
+enhancements:
+  - |
+    A new ``enable_thinking`` parameter has been added to enable thinking mode in chat templates for thinking-capable models,
+    allowing them to generate intermediate reasoning steps before producing final responses.
+
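Taken together with the updated docstring example, the release note describes usage along these lines (a minimal sketch; the `max_new_tokens` value is an arbitrary choice that mirrors the new integration test below):

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

# Qwen/Qwen3-0.6B is now the default, so the model argument could be omitted.
generator = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",
    generation_kwargs={"max_new_tokens": 450},
    enable_thinking=True,  # keyword-only flag introduced by this commit
)
generator.warm_up()

messages = [ChatMessage.from_user("What is 2+2?")]
reply = generator.run(messages)["replies"][0]
# With thinking enabled, reply.text contains a <think>...</think> block
# followed by the final answer.
print(reply.text)
```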
test/components/generators/chat/test_hugging_face_local.py

Lines changed: 32 additions & 8 deletions
@@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock):
         )
 
         assert generator.huggingface_pipeline_kwargs == {
-            "model": "HuggingFaceH4/zephyr-7b-beta",
+            "model": "Qwen/Qwen3-0.6B",
             "task": "text2text-generation",
             "token": None,
             "device": "cpu",
@@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock):
         )
 
         assert generator.huggingface_pipeline_kwargs == {
-            "model": "HuggingFaceH4/zephyr-7b-beta",
+            "model": "Qwen/Qwen3-0.6B",
             "task": "text2text-generation",
             "token": None,
             "device": "cpu",
@@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools):
             streaming_callback=None,
             chat_template="irrelevant",
             tools=tools,
+            enable_thinking=True,
         )
 
         # Call the to_dict method
@@ -191,6 +192,7 @@ def test_to_dict(self, model_info_mock, tools):
         assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
         assert init_params["streaming_callback"] is None
         assert init_params["chat_template"] == "irrelevant"
+        assert init_params["enable_thinking"] is True
         assert init_params["tools"] == [
             {
                 "type": "haystack.tools.tool.Tool",
@@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools):
             streaming_callback=None,
             chat_template="irrelevant",
             tools=tools,
+            enable_thinking=True,
         )
         # Call the to_dict method
         result = generator.to_dict()
@@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools):
         assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
         assert generator_2.streaming_callback is None
         assert generator_2.chat_template == "irrelevant"
+        assert generator_2.enable_thinking is True
         assert len(generator_2.tools) == 1
         assert generator_2.tools[0].name == "weather"
         assert generator_2.tools[0].description == "useful to determine the weather in a given location"
@@ -487,18 +491,40 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock):
     @pytest.mark.slow
     @pytest.mark.flaky(reruns=3, reruns_delay=10)
     def test_live_run(self, monkeypatch):
+        """Test live run with default behavior (no thinking)."""
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
+        llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
+
+        result = llm.run(messages)
+
+        assert "replies" in result
+        assert isinstance(result["replies"][0], ChatMessage)
+        assert "climate change" in result["replies"][0].text.lower()
+
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_thinking(self, monkeypatch):
+        """Test live run with enable_thinking=True."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        messages = [ChatMessage.from_user("What is 2+2?")]
+
         llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50}
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
         )
 
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
-        assert "climate change" in result["replies"][0].text.lower()
+        reply_text = result["replies"][0].text
+        assert reply_text is not None
+        assert "<think>" in reply_text
+        assert "</think>" in reply_text
+        assert len(reply_text) > 0
+        assert "4" in reply_text.lower()
 
     def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
         duplicate_tools = [tools[0], tools[0]]
@@ -512,7 +538,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools):
         )
 
     def test_run_with_tools(self, model_info_mock, tools):
-        generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools)
+        generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools)
 
         # Mock pipeline and tokenizer
         mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}])
@@ -800,9 +826,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None:
             streaming_chunks.append(chunk)
 
         llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen2.5-0.5B-Instruct",
-            generation_kwargs={"max_new_tokens": 50},
-            streaming_callback=streaming_callback,
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback
         )
 
         response = await llm.run_async(

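The new `test_live_run_thinking` above only checks that `<think>` tags are present in the reply; it does not separate the reasoning from the answer. A purely illustrative helper for callers who want to do that (not part of this commit; assumes a single leading `<think>...</think>` block, which is what Qwen3-style templates produce):

```python
import re


def split_thinking(reply_text: str) -> tuple[str, str]:
    """Split a thinking-style reply into (reasoning, answer)."""
    match = re.search(r"<think>(.*?)</think>", reply_text, flags=re.DOTALL)
    if match is None:
        # No think block: the whole reply is the answer.
        return "", reply_text.strip()
    reasoning = match.group(1).strip()
    answer = reply_text[match.end():].strip()
    return reasoning, answer


# Example with a reply produced by the generator:
# reasoning, answer = split_thinking(result["replies"][0].text)
```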