feat: Update HuggingFaceLocalChatGenerator default model to Qwen/Qwen3-0.6B #10176
base: main
Changes from all commits: 558bc71, f65729b, b7ed047, 0d478af, 4ae131b, 285fea8, 87c36b3, 41081f6
Release note (new file):

```diff
@@ -0,0 +1,6 @@
+---
+upgrade:
+  - |
+    `HuggingFaceLocalChatGenerator` now uses `Qwen/Qwen3-0.6B` as the default model, replacing the previous default.
```
> **Member:** Please use double backticks for inline code. See https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md#release-notes Also, I think that …
```diff
+    Additionally, a new `enable_thinking` parameter has been added to enable thinking mode in chat templates for
+    thinking-capable models, allowing them to generate intermediate reasoning steps before producing final responses.
```
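For orientation, here is a minimal usage sketch of what the release note describes. It mirrors the test patterns in this PR; the import paths and the `warm_up()` call follow the usual Haystack conventions and are assumptions here, not part of the diff:

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import ChatMessage

# enable_thinking=True asks the chat template of a thinking-capable model
# (such as the new default, Qwen/Qwen3-0.6B) to emit a <think>...</think>
# reasoning block before the final answer.
llm = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",
    generation_kwargs={"max_new_tokens": 450},
    enable_thinking=True,
)
llm.warm_up()  # loads the local transformers pipeline before the first run

result = llm.run([ChatMessage.from_user("What is 2+2?")])
print(result["replies"][0].text)  # e.g. "<think>...</think>\n\n4"
```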
Test file (excerpts):
```diff
@@ -133,7 +133,7 @@ def test_init_task_parameter(self, model_info_mock):
         )
 
         assert generator.huggingface_pipeline_kwargs == {
-            "model": "HuggingFaceH4/zephyr-7b-beta",
+            "model": "Qwen/Qwen3-0.6B",
             "task": "text2text-generation",
             "token": None,
             "device": "cpu",
```
```diff
@@ -147,7 +147,7 @@ def test_init_task_in_huggingface_pipeline_kwargs(self, model_info_mock):
         )
 
         assert generator.huggingface_pipeline_kwargs == {
-            "model": "HuggingFaceH4/zephyr-7b-beta",
+            "model": "Qwen/Qwen3-0.6B",
             "task": "text2text-generation",
             "token": None,
             "device": "cpu",
```
```diff
@@ -178,6 +178,7 @@ def test_to_dict(self, model_info_mock, tools):
             streaming_callback=None,
             chat_template="irrelevant",
             tools=tools,
+            enable_thinking=True,
         )
 
         # Call the to_dict method
```
```diff
@@ -191,6 +192,7 @@ def test_to_dict(self, model_info_mock, tools):
         assert init_params["generation_kwargs"] == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
         assert init_params["streaming_callback"] is None
         assert init_params["chat_template"] == "irrelevant"
+        assert init_params["enable_thinking"] is True
         assert init_params["tools"] == [
             {
                 "type": "haystack.tools.tool.Tool",
```
```diff
@@ -214,6 +216,7 @@ def test_from_dict(self, model_info_mock, tools):
             streaming_callback=None,
             chat_template="irrelevant",
             tools=tools,
+            enable_thinking=True,
         )
         # Call the to_dict method
         result = generator.to_dict()
```
```diff
@@ -224,6 +227,7 @@ def test_from_dict(self, model_info_mock, tools):
         assert generator_2.generation_kwargs == {"max_new_tokens": 512, "n": 5, "stop_sequences": ["stop", "words"]}
         assert generator_2.streaming_callback is None
         assert generator_2.chat_template == "irrelevant"
+        assert generator_2.enable_thinking is True
         assert len(generator_2.tools) == 1
         assert generator_2.tools[0].name == "weather"
         assert generator_2.tools[0].description == "useful to determine the weather in a given location"
```
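The two serialization tests above imply the following round trip. A minimal sketch, under the assumption that the constructor can resolve the model (the tests avoid network access via `model_info_mock`):

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator

# enable_thinking survives a to_dict/from_dict round trip, as the tests assert
generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", enable_thinking=True)
data = generator.to_dict()

restored = HuggingFaceLocalChatGenerator.from_dict(data)
assert restored.enable_thinking is True
```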
```diff
@@ -490,15 +494,61 @@ def test_live_run(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)  # https://github.com/deepset-ai/haystack/issues/8811
         messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")]
 
+        llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50})
+
+        result = llm.run(messages)
+
+        assert "replies" in result
+        assert isinstance(result["replies"][0], ChatMessage)
+        assert "climate change" in result["replies"][0].text.lower()
+
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_with_enable_thinking(self, monkeypatch):
+        """Test that enable_thinking works with the default Qwen3 model in a live run."""
+        monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        messages = [ChatMessage.from_user("What is 2+2?")]
+
         llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50}
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=True
         )
 
         result = llm.run(messages)
 
         assert "replies" in result
         assert isinstance(result["replies"][0], ChatMessage)
-        assert "climate change" in result["replies"][0].text.lower()
+        reply_text = result["replies"][0].text
+
+        assert reply_text is not None
+        assert "<think>" in reply_text
+        assert "</think>" in reply_text
+        assert len(reply_text) > 0
+        assert "4" in reply_text.lower()
+
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.flaky(reruns=3, reruns_delay=10)
+    def test_live_run_without_enable_thinking(self, monkeypatch):
```
> **Member:** this can be merged with …
| """Test that enable_thinking=False prevents thinking tags in the response.""" | ||
| monkeypatch.delenv("HF_API_TOKEN", raising=False) | ||
| messages = [ChatMessage.from_user("What is 2+2?")] | ||
|
|
||
| llm = HuggingFaceLocalChatGenerator( | ||
| model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 450}, enable_thinking=False | ||
| ) | ||
|
|
||
| result = llm.run(messages) | ||
|
|
||
| assert "replies" in result | ||
| assert isinstance(result["replies"][0], ChatMessage) | ||
| reply_text = result["replies"][0].text | ||
|
|
||
| assert reply_text is not None | ||
| assert "<think>" not in reply_text | ||
| assert "</think>" not in reply_text | ||
| assert len(reply_text) > 0 | ||
| assert "4" in reply_text.lower() | ||
|
|
||
| def test_init_fail_with_duplicate_tool_names(self, model_info_mock, tools): | ||
| duplicate_tools = [tools[0], tools[0]] | ||
|
|
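Because replies may now carry a `<think>...</think>` block when `enable_thinking=True`, downstream code may want to strip it before display. A minimal sketch; `strip_thinking` is a hypothetical helper, not part of this PR:

```python
import re

# Matches a <think>...</think> reasoning block plus any trailing whitespace
_THINK_BLOCK = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)

def strip_thinking(text: str) -> str:
    """Remove <think>...</think> blocks emitted when enable_thinking=True."""
    return _THINK_BLOCK.sub("", text).strip()

assert strip_thinking("<think>2 + 2 equals 4.</think>\n\nThe answer is 4.") == "The answer is 4."
```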
```diff
@@ -512,7 +562,7 @@ def test_init_fail_with_tools_and_streaming(self, model_info_mock, tools):
         )
 
     def test_run_with_tools(self, model_info_mock, tools):
-        generator = HuggingFaceLocalChatGenerator(model="meta-llama/Llama-2-13b-chat-hf", tools=tools)
+        generator = HuggingFaceLocalChatGenerator(model="Qwen/Qwen3-0.6B", tools=tools)
 
         # Mock pipeline and tokenizer
         mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}])
```
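For context, the stubbed pipeline above returns the tool call as a JSON string. A small sketch of that payload's shape (how the generator converts it into a `ToolCall` is internal and not shown in this diff):

```python
import json
from unittest.mock import Mock

# Stub in the style of the test above: the "model output" is a JSON tool call
mock_pipeline = Mock(return_value=[{"generated_text": '{"name": "weather", "arguments": {"city": "Paris"}}'}])

# Parsing the generated text yields the tool name and its arguments
payload = json.loads(mock_pipeline("ignored prompt")[0]["generated_text"])
assert payload["name"] == "weather"
assert payload["arguments"] == {"city": "Paris"}
```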
```diff
@@ -800,9 +850,7 @@ async def streaming_callback(chunk: StreamingChunk) -> None:
             streaming_chunks.append(chunk)
 
         llm = HuggingFaceLocalChatGenerator(
-            model="Qwen/Qwen2.5-0.5B-Instruct",
-            generation_kwargs={"max_new_tokens": 50},
-            streaming_callback=streaming_callback,
+            model="Qwen/Qwen3-0.6B", generation_kwargs={"max_new_tokens": 50}, streaming_callback=streaming_callback
         )
 
         response = await llm.run_async(
```
> **Reviewer:** We are already using too many positional arguments.
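The comment refers to the call style in the hunk above. A formatting sketch of the explicit one-keyword-per-line style the reviewer appears to prefer (an assumption; `streaming_callback` is reconstructed from the test):

```python
from haystack.components.generators.chat import HuggingFaceLocalChatGenerator
from haystack.dataclasses import StreamingChunk

def streaming_callback(chunk: StreamingChunk) -> None:
    # Print each streamed token as it arrives
    print(chunk.content, end="", flush=True)

# One keyword argument per line: explicit, diff-friendly, and avoids long lines
llm = HuggingFaceLocalChatGenerator(
    model="Qwen/Qwen3-0.6B",
    generation_kwargs={"max_new_tokens": 50},
    streaming_callback=streaming_callback,
)
```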