Commit 90a81a7

BUG: repair qwen3 model transformers random characters (#4148)
Parent: 8ac4e33

File tree: 4 files changed, +26 -61 lines changed

setup.cfg

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ dev =
     anthropic
     langchain
     langchain-community
+    langchain-openai
     orjson
     sphinx-tabs
     sphinx-design

xinference/core/tests/test_restful_api.py

Lines changed: 5 additions & 5 deletions
@@ -1178,13 +1178,13 @@ def test_lang_chain(setup):
     model_uid_res = response_data["model_uid"]
     assert model_uid_res == "test_restful_api"
 
-    from langchain.chat_models import ChatOpenAI
-    from langchain.prompts.chat import (
+    from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
+    from langchain_core.prompts import (
         ChatPromptTemplate,
         HumanMessagePromptTemplate,
         SystemMessagePromptTemplate,
     )
-    from langchain.schema import AIMessage, HumanMessage, SystemMessage
+    from langchain_openai import ChatOpenAI
 
     inference_server_url = f"{endpoint}/v1"
 
@@ -1204,7 +1204,7 @@ def test_lang_chain(setup):
             content="Translate the following sentence from English to Italian: I love programming."
         ),
     ]
-    r = chat(messages)
+    r = chat.invoke(messages)
     assert type(r) == AIMessage
     assert r.content
 
@@ -1218,7 +1218,7 @@ def test_lang_chain(setup):
     )
 
     # get a chat completion from the formatted messages
-    r = chat(
+    r = chat.invoke(
        chat_prompt.format_prompt(
             input_language="English",
             output_language="Italian",
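
For context, the test migration above follows the LangChain split into `langchain_core`/`langchain_openai` and the deprecation of calling a chat model directly in favour of `.invoke()`. Below is a minimal standalone sketch of the updated usage pattern against an OpenAI-compatible endpoint; the endpoint URL, API key, and model name are illustrative placeholders, not values from this commit:

# Minimal sketch of the post-migration LangChain usage (placeholder endpoint and model).
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

chat = ChatOpenAI(
    base_url="http://localhost:9997/v1",    # placeholder OpenAI-compatible endpoint
    api_key="not-needed-for-local-server",  # dummy key; local servers typically ignore it
    model="my-model-uid",                   # placeholder model uid
    temperature=0.7,
)

messages = [
    SystemMessage(content="You are a helpful assistant that translates English to Italian."),
    HumanMessage(content="Translate the following sentence from English to Italian: I love programming."),
]

# Calling chat(messages) directly is deprecated; .invoke() is the replacement used in the test.
reply = chat.invoke(messages)
print(reply.content)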

xinference/model/llm/transformers/core.py

Lines changed: 20 additions & 36 deletions
@@ -549,46 +549,30 @@ def build_decode_attention_mask(
         So we need pad `0` on the left again.
         """
         data = []
-        # For decode phase, attention mask should match the full KV cache sequence length
-        # All requests in batch should have attention mask of length `seq_length`
-        for r in reqs:
-            # Get the actual sequence length for this request from its tracking
-            if "attention_mask_seq_len" not in r.extra_kwargs:
-                # Initialize with the current sequence length (full KV cache length)
-                r.extra_kwargs["attention_mask_seq_len"] = seq_length
-            else:
-                # Use the previously tracked length, but ensure it doesn't exceed current seq_length
-                tracked_len = r.extra_kwargs["attention_mask_seq_len"]
-                r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
-
-        # For decode phase after KV cache merge, all requests should have attention mask
-        # that matches the merged sequence length
+        max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
         for r in reqs:
+            r.extra_kwargs["attention_mask_seq_len"] += 1
             real_len = r.extra_kwargs["attention_mask_seq_len"]
+            pad_len = max_len - real_len
 
-            # The attention mask should cover the full sequence length
-            if real_len < seq_length:
-                # Pad with zeros on the left to reach full sequence length
-                pad_len = seq_length - real_len
-
-                if self._tokenizer.padding_side == "left":
-                    x = torch.cat(
-                        [
-                            torch.full((pad_len,), 0, dtype=torch.long),
-                            torch.ones((real_len,), dtype=torch.long),
-                        ]
-                    )
-                else:
-                    x = torch.cat(
-                        [
-                            torch.ones((real_len,), dtype=torch.long),
-                            torch.full((pad_len,), 0, dtype=torch.long),
-                        ]
-                    )
+            if self._tokenizer.padding_side == "left":
+                x = torch.cat(
+                    [
+                        (
+                            torch.full((pad_len,), 0, dtype=torch.long)
+                            if pad_len > 0
+                            else torch.tensor([], dtype=torch.long)
+                        ),
+                        torch.ones((real_len,), dtype=torch.long),
+                    ]
+                )
             else:
-                # Already at correct length
-                x = torch.ones((real_len,), dtype=torch.long)
-
+                x = torch.cat(
+                    [
+                        torch.ones((real_len,), dtype=torch.long),
+                        torch.full((pad_len,), 0, dtype=torch.long),
+                    ]
+                )
             data.append(x)
 
         return torch.stack(data).to(self._device)
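
The rewritten mask construction replaces the old per-request clamping against `seq_length` with a simpler invariant: every request's tracked mask length grows by exactly one per decode step, and shorter requests are padded with `0` up to the longest mask in the batch, on whichever side the tokenizer pads. A minimal self-contained sketch of that padding logic follows (illustrative only; `tracked_lens` stands in for each request's `extra_kwargs["attention_mask_seq_len"]`, and `build_decode_masks` is not an xinference function):

import torch

def build_decode_masks(tracked_lens, padding_side="left"):
    # One new token is generated per request on every decode step.
    tracked_lens = [n + 1 for n in tracked_lens]
    max_len = max(tracked_lens)
    masks = []
    for real_len in tracked_lens:
        pad = torch.zeros(max_len - real_len, dtype=torch.long)
        ones = torch.ones(real_len, dtype=torch.long)
        # Zeros sit on the padded side so they line up with the padded KV cache.
        masks.append(torch.cat([pad, ones]) if padding_side == "left" else torch.cat([ones, pad]))
    return torch.stack(masks)

# Two requests whose masks covered 5 and 7 positions before this step:
print(build_decode_masks([5, 7]))
# tensor([[0, 0, 1, 1, 1, 1, 1, 1],
#         [1, 1, 1, 1, 1, 1, 1, 1]])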

xinference/model/llm/transformers/utils.py

Lines changed: 0 additions & 20 deletions
@@ -285,30 +285,10 @@ def _batch_inference_one_step_internal(
             # This prevents batch size mismatches during merging
             decode_kv = decode_reqs[0].kv_cache
 
-            # Verify that all decode requests share the same kv_cache
-            for req in decode_reqs[1:]:
-                if req.kv_cache is not decode_kv:
-                    logger.warning(
-                        "Inconsistent kv_cache references detected in decode requests. "
-                        "This may indicate a batching synchronization issue."
-                    )
-                    # Use the first decode_kv as the reference to maintain consistency
-                    req.kv_cache = decode_kv
-
             # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
             merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
-            # Update sequence length information after KV cache merge
-            _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
-                merged_kv_cache, xinf_model_obj
-            )
             for r in valid_req_list:
                 r.kv_cache = merged_kv_cache
-                # Update attention mask sequence length to match merged KV cache
-                if "attention_mask_seq_len" in r.extra_kwargs:
-                    # Ensure the attention mask length doesn't exceed the merged sequence length
-                    r.extra_kwargs["attention_mask_seq_len"] = min(
-                        r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
-                    )
             empty_cache()
         else:
             for r in valid_req_list:
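
The deleted blocks were a defensive kv_cache consistency check and a post-merge clamp of `attention_mask_seq_len`, neither of which the simplified mask construction above relies on any longer. The remaining comment about merging at the `batch_size` and `seq_len` dimensions can be pictured with a simplified sketch; this is not xinference's `merge_kv_cache`, and it assumes the legacy cache layout of per-layer `(key, value)` tensors shaped `[batch, heads, seq_len, head_dim]`:

import torch
import torch.nn.functional as F

def merge_legacy_kv(cache_a, cache_b):
    """Concatenate two legacy-format KV caches along batch, left-padding seq_len (illustrative only)."""
    merged = []
    for (ka, va), (kb, vb) in zip(cache_a, cache_b):
        seq = max(ka.shape[2], kb.shape[2])
        # F.pad pads dims from the last backwards: (head_dim left/right, seq_len left/right).
        ka, va = (F.pad(t, (0, 0, seq - t.shape[2], 0)) for t in (ka, va))
        kb, vb = (F.pad(t, (0, 0, seq - t.shape[2], 0)) for t in (kb, vb))
        merged.append((torch.cat([ka, kb], dim=0), torch.cat([va, vb], dim=0)))
    return tuple(merged)

The zero-filled positions introduced on the padded side correspond to the positions the decode attention mask marks with `0`, which is what the docstring in `build_decode_attention_mask` means by "pad `0` on the left again".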
