13 changes: 10 additions & 3 deletions haystack/components/generators/chat/hugging_face_local.py
@@ -95,7 +95,7 @@ class HuggingFaceLocalChatGenerator:
Generates chat responses using models from Hugging Face that run locally.

Use this component with chat-based models,
- such as `HuggingFaceH4/zephyr-7b-beta` or `meta-llama/Llama-2-7b-chat-hf`.
+ such as `Qwen/Qwen3-0.6B` or `meta-llama/Llama-2-7b-chat-hf`.
LLMs running locally may need powerful hardware.

### Usage example
@@ -104,7 +104,7 @@ class HuggingFaceLocalChatGenerator:
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

generator = HuggingFaceLocalChatGenerator(model="HuggingFaceH4/zephyr-7b-beta")
generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B")
generator.warm_up()
messages = [ChatMessage.from_user("What's Natural Language Processing? Be brief.")]
print(generator.run(messages))
@@ -129,7 +129,7 @@ class HuggingFaceLocalChatGenerator:

def __init__( # pylint: disable=too-many-positional-arguments
self,
model: str = "HuggingFaceH4/zephyr-7b-beta",
model: str = "Qwen/Qwen3-0.6B",
task: Optional[Literal["text-generation", "text2text-generation"]] = None,
device: Optional[ComponentDevice] = None,
token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
@@ -141,6 +141,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
tools: Optional[ToolsType] = None,
tool_parsing_function: Optional[Callable[[str], Optional[list[ToolCall]]]] = None,
async_executor: Optional[ThreadPoolExecutor] = None,
+ enable_thinking: bool = False,
Reviewer comment (Member):
We are already using too many positional arguments.

Suggested change:
- enable_thinking: bool = False,
+ *,
+ enable_thinking: bool = False,
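For readers unfamiliar with the suggestion, the bare `*` separator makes every parameter after it keyword-only. A minimal sketch of the effect (the `Demo` class below is illustrative only, not part of the PR):

```python
# Sketch: the bare `*` forces enable_thinking to be passed by keyword.
class Demo:
    def __init__(self, model: str = "Qwen/Qwen3-0.6B", *, enable_thinking: bool = False) -> None:
        self.model = model
        self.enable_thinking = enable_thinking


Demo("Qwen/Qwen3-0.6B", enable_thinking=True)  # OK: flag is named explicitly
# Demo("Qwen/Qwen3-0.6B", True)                # TypeError: too many positional arguments
```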

) -> None:
"""
Initializes the HuggingFaceLocalChatGenerator component.
@@ -186,6 +187,9 @@ def __init__( # pylint: disable=too-many-positional-arguments
:param async_executor:
Optional ThreadPoolExecutor to use for async calls. If not provided, a single-threaded executor will be
initialized and used
+ :param enable_thinking:
+     Whether to enable thinking mode in the chat template for thinking-capable models.
+     When enabled, the model generates intermediate reasoning before the final response. Defaults to False.
"""
torch_and_transformers_import.check()

@@ -243,6 +247,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
self.streaming_callback = streaming_callback
self.pipeline: Optional[HfPipeline] = None
self.tools = tools
+ self.enable_thinking = enable_thinking

self._owns_executor = async_executor is None
self.executor = (
@@ -308,6 +313,7 @@ def to_dict(self) -> dict[str, Any]:
chat_template=self.chat_template,
tools=serialize_tools_or_toolset(self.tools),
tool_parsing_function=serialize_callable(self.tool_parsing_function),
+ enable_thinking=self.enable_thinking,
)

huggingface_pipeline_kwargs = serialization_dict["init_parameters"]["huggingface_pipeline_kwargs"]
@@ -600,6 +606,7 @@ def _prepare_inputs(
chat_template=self.chat_template,
add_generation_prompt=True,
tools=[tc.tool_spec for tc in flat_tools] if flat_tools else None,
+ enable_thinking=self.enable_thinking,
)
# prepared_prompt is a string since we set tokenize=False https://hf.co/docs/transformers/main/chat_templating
assert isinstance(prepared_prompt, str)
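For reference, `enable_thinking` is not interpreted by Haystack itself: extra keyword arguments passed to `apply_chat_template` are exposed to the model's Jinja chat template, and Qwen3's template uses this flag to switch thinking mode on or off. A minimal sketch with `transformers` directly (illustrative; the exact behavior depends on the model's chat template):

```python
# Sketch (not part of the PR): what the forwarded flag does at the tokenizer level.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
messages = [{"role": "user", "content": "What is 2+2?"}]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # with False, Qwen3's template suppresses the <think> block
)
print(prompt)
```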
New file (release note):
@@ -0,0 +1,6 @@
+ ---
+ upgrade:
+   - |
+     `HuggingFaceLocalChatGenerator` now uses `Qwen/Qwen3-0.6B` as the default model, replacing the previous default.
Reviewer comment (Member):
Please use double backticks for inline code. See https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md#release-notes

Also, I think that enable_thinking should be under a separate enhancements section.

+     Additionally, a new `enable_thinking` parameter has been added to enable thinking mode in chat templates for
+     thinking-capable models, allowing them to generate intermediate reasoning steps before producing final responses.
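As a usage illustration of the two changes described in this release note (a sketch; the exact output format depends on the model and is not guaranteed):

```python
# Sketch: new default model plus the new enable_thinking flag.
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

generator = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",  # now the default, shown here for clarity
    generation_kwargs={"max_new_tokens": 450},
    enable_thinking=True,
)
generator.warm_up()

reply = generator.run([ChatMessage.from_user("What is 2+2?")])["replies"][0]
# With thinking enabled, Qwen3 typically wraps its reasoning in <think>...</think>
# before the final answer; strip it if only the answer is needed.
text = reply.text or ""
answer = text.split("</think>")[-1].strip() if "</think>" in text else text
print(answer)
```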
64 changes: 56 additions & 8 deletions test/components/generators/chat/test_hugging_face_local.py
@@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock):
)

assert generator.huggingface_pipeline_kwargs == {
"model": "HuggingFaceH4/zephyr-7b-beta",
"model": "Qwen/Qwen3-0.6B",
"task": "text2text-generation",
"token": None,
"device": "cpu",
@@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock):
)

assert generator.huggingface_pipeline_kwargs == {
"model": "HuggingFaceH4/zephyr-7b-beta",
"model": "Qwen/Qwen3-0.6B",
"task": "text2text-generation",
"token": None,
"device": "cpu",
@@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools):
streaming_callback=None,
chat_template="irrelevant",
tools=tools,
+ enable_thinking=True,
)

# Call the to_dict method
@@ -191,6 +192,7 @@ def test_to_dict(self, model_info_mock, tools):
assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
assert init_params["streaming_callback"] is None
assert init_params["chat_template"] == "irrelevant"
assert init_params["enable_thinking"] is True
assert init_params["tools"] == [
{
"type": "haystack.tools.tool.Tool",
@@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools):
streaming_callback=None,
chat_template="irrelevant",
tools=tools,
+ enable_thinking=True,
)
# Call the to_dict method
result = generator.to_dict()
@@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools):
assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
assert generator_2.streaming_callback is None
assert generator_2.chat_template == "irrelevant"
+ assert generator_2.enable_thinking is True
assert len(generator_2.tools) == 1
assert generator_2.tools[0].name == "weather"
assert generator_2.tools[0].description == "useful to determine the weather in a given location"
@@ -490,15 +494,61 @@ def test_live_run(self, monkeypatch):
monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]

llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})

result = llm.run(messages)

assert "replies" in result
assert isinstance(result["replies"][0], ChatMessage)
assert "climate change" in result["replies"][0].text.lower()

@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.flaky(reruns=3, reruns_delay=10)
def test_live_run_with_enable_thinking(self, monkeypatch):
"""Test that enable_thinking works with the default Qwen3 model in a live run."""
monkeypatch.delenv("HF_API_TOKEN", raising=False)
messages = [ChatMessage.from_user("What is 2+2?")]

llm = HuggingFaceLocalChatGenerator(
model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50}
model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
)

result = llm.run(messages)

assert "replies" in result
assert isinstance(result["replies"][0], ChatMessage)
assert "climate change" in result["replies"][0].text.lower()
reply_text = result["replies"][0].text

assert reply_text is not None
assert "<think>" in reply_text
assert "</think>" in reply_text
assert len(reply_text) > 0
assert "4" in reply_text.lower()

@pytest.mark.integration
@pytest.mark.slow
@pytest.mark.flaky(reruns=3, reruns_delay=10)
def test_live_run_without_enable_thinking(self, monkeypatch):
Reviewer comment (Member):
this can be merged with test_live_run. Since these tests are slow, I would refrain from having 3 of them.

"""Test that enable_thinking=False prevents thinking tags in the response."""
monkeypatch.delenv("HF_API_TOKEN", raising=False)
messages = [ChatMessage.from_user("What is 2+2?")]

llm = HuggingFaceLocalChatGenerator(
model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=False
)

result = llm.run(messages)

assert "replies" in result
assert isinstance(result["replies"][0], ChatMessage)
reply_text = result["replies"][0].text

assert reply_text is not None
assert "<think>" not in reply_text
assert "</think>" not in reply_text
assert len(reply_text) > 0
assert "4" in reply_text.lower()

def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools):
duplicate_tools = [tools[0], tools[0]]
@@ -512,7 +562,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools):
)

def test_run_with_tools(self, model_info_mock, tools):
generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools)
generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools)

# Mock pipeline and tokenizer
mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}])
@@ -800,9 +850,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None:
streaming_chunks.append(chunk)

llm = HuggingFaceLocalChatGenerator(
model="Qwen/Qwen2.5-0.5B-Instruct",
generation_kwargs={"max_new_tokens": 50},
streaming_callback=streaming_callback,
model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback
)

response = await llm.run_async(