13 changes: 10 additions & 3 deletions haystack/components/generators/chat/hugging_face_local.py
@@ -95,7 +95,7 @@ class HuggingFaceLocalChatGenerator:
Generates chat responses using models from Hugging Face that run locally.

Use this component with chat-based models,
-such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`.
+such as `Qwen/Qwen3-0.6B` or `meta-llama/Llama-2-7b-chat-hf`.
LLMs running locally may need powerful hardware.

### Usage example
@@ -104,7 +104,7 @@ class HuggingFaceLocalChatGenerator:
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

-generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta")
+generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B")
generator.warm_up()
messages = [ChatMessage.from_user("What's Natural Language Processing? Be brief.")]
print(generator.run(messages))
@@ -129,7 +129,7 @@ class HuggingFaceLocalChatGenerator:

def __init__( # pylint: disable=too-many-positional-arguments
self,
model: str = "HuggingFaceH4/zephyr-7b-beta",
model: str = "Qwen/Qwen3-0.6B",
task: Optional[Literal["text-generation", "text2text-generation"]] = None,
device: Optional[ComponentDevice] = None,
token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
@@ -141,6 +141,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
tools: Optional[ToolsType] = None,
tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None,
async_executor: Optional[ThreadPoolExecutor] = None,
enable_thinking: bool = False,
) -> None:
"""
Initializes the HuggingFaceLocalChatGenerator component.
@@ -186,6 +187,9 @@ def __init__(  # pylint: disable=too-many-positional-arguments
:param async_executor:
Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be
initialized and used
:param enable_thinking:
Whether to enable thinking mode in the chat template for thinking-capable models.
When enabled, the model generates intermediate reasoning before the final response. Defaults to False.
"""
torch_and_transformers_import.check()

@@ -243,6 +247,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
self.streaming_callback = streaming_callback
self.pipeline: Optional[HfPipeline] = None
self.tools = tools
self.enable_thinking = enable_thinking

self._owns_executor = async_executor is None
self.executor = (
@@ -308,6 +313,7 @@ def to_dict(self) -> dict[str, Any]:
chat_template=self.chat_template,
tools=serialize_tools_or_toolset(self.tools),
tool_parsing_function=serialize_callable(self.tool_parsing_function),
enable_thinking=self.enable_thinking,
)

huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"]
@@ -600,6 +606,7 @@ def _prepare_inputs(
chat_template=self.chat_template,
add_generation_prompt=True,
tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None,
enable_thinking=self.enable_thinking,
)
# prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating
assert isinstance(prepared_prompt, str)
@@ -0,0 +1,6 @@
---
upgrade:
- |
`HuggingFaceLocalChatGenerator` now uses `Qwen/Qwen3-0.6B` as the default model, replacing the previous default
`HuggingFaceH4/zephyr-7b-beta`. Additionally, a new `enable_thinking` parameter toggles thinking mode in the chat
template for thinking-capable models, letting them generate intermediate reasoning before producing the final response.
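
For reference, a minimal sketch of the new option in use, mirroring the integration tests added in this PR (the prompt and `max_new_tokens=450` are taken from those tests, not fixed requirements):

from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

# Thinking mode with the new default model. Thinking-capable models such as Qwen3
# wrap intermediate reasoning in <think>...</think> before the final answer, so the
# token budget needs to cover both the reasoning and the reply.
generator = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",
    generation_kwargs={"max_new_tokens": 450},
    enable_thinking=True,
)
generator.warm_up()

reply = generator.run([ChatMessage.from_user("What is 2+2?")])["replies"][0]
print(reply.text)  # expected to contain a <think>...</think> block followed by the answer
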
64 changes: 56 additions & 8 deletions test/components/generators/chat/test_hugging_face_local.py
@@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock):
)

assert generator.huggingface_pipeline_kwargs == {
"model": "HuggingFaceH4/zephyr-7b-beta",
"model": "Qwen/Qwen3-0.6B",
"task": "text2text-generation",
"token": None,
"device": "cpu",
@@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock):
)

assert generator.huggingface_pipeline_kwargs == {
"model": "HuggingFaceH4/zephyr-7b-beta",
"model": "Qwen/Qwen3-0.6B",
"task": "text2text-generation",
"token": None,
"device": "cpu",
@@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools):
streaming_callback=None,
chat_template="irrelevant",
tools=tools,
enable_thinking=True,
)

# Call the to_dict method
@@ -191,6 +192,7 @@ def test_from_dict(self, model_info_mock, tools):
assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
assert init_params["streaming_callback"] is None
assert init_params["chat_template"] == "irrelevant"
assert init_params["enable_thinking"] is True
assert init_params["tools"] == [
{
"type": "haystack.tools.tool.Tool",
@@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools):
streaming_callback=None,
chat_template="irrelevant",
tools=tools,
enable_thinking=True,
)
# Call the to_dict method
result = generator.to_dict()
@@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools):
assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
assert generator_2.streaming_callback is None
assert generator_2.chat_template == "irrelevant"
assert generator_2.enable_thinking is True
assert len(generator_2.tools) == 1
assert generator_2.tools[0].name == "weather"
assert generator_2.tools[0].description == "useful to determine the weather in a given location"
@@ -490,15 +494,61 @@ def test_live_run(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]

llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})

result = llm.run(messages)

assert "replies" in result
assert isinstance(result["replies"][0], ChatMessage)
assert "climate change" in result["replies"][0].text.lower()

@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.flaky(reruns=3, reruns_delay=10)
def test_live_run_with_enable_thinking(self, monkeypatch):
"""Test that enable_thinking works with the default Qwen3 model in a live run."""
monkeypatch.delenv("HF_API_TOKEN", raising=False)
messages = [ChatMessage.from_user("What is 2+2?")]

llm = HuggingFaceLocalChatGenerator(
model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50}
model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
)

result = llm.run(messages)

assert "replies" in result
assert isinstance(result["replies"][0], ChatMessage)
assert "climate change" in result["replies"][0].text.lower()
reply_text = result["replies"][0].text

assert reply_text is not None
assert "<think>" in reply_text
assert "</think>" in reply_text
assert len(reply_text) > 0
assert "4" in reply_text.lower()

@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.flaky(reruns=3, reruns_delay=10)
def test_live_run_without_enable_thinking(self, monkeypatch):
"""Test that enable_thinking=False prevents thinking tags in the response."""
monkeypatch.delenv("HF_API_TOKEN", raising=False)
messages = [ChatMessage.from_user("What is 2+2?")]

llm = HuggingFaceLocalChatGenerator(
model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=False
)

result = llm.run(messages)

assert "replies" in result
assert isinstance(result["replies"][0], ChatMessage)
reply_text = result["replies"][0].text

assert reply_text is not None
assert "<think>" not in reply_text
assert "</think>" not in reply_text
assert len(reply_text) > 0
assert "4" in reply_text.lower()

def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
duplicate_tools = [tools[0], tools[0]]
@@ -512,7 +562,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools):
)

def test_run_with_tools(self, model_info_mock, tools):
generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools)
generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools)

# Mock pipeline and tokenizer
mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}])
@@ -800,9 +850,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None:
streaming_chunks.append(chunk)

llm = HuggingFaceLocalChatGenerator(
model="Qwen/Qwen2.5-0.5B-Instruct",
generation_kwargs={"max_new_tokens": 50},
streaming_callback=streaming_callback,
model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback
)

response = await llm.run_async(