Fix the logic of calculating max_new_tokens and determining finish_reason (#3727)

lvhan028 · web-flow · commit bc87b2213217 · 2025-07-15T14:05:25.000+08:00
* fix the logic of computing max_new_tokens

* update

* fix
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
@@ -723,18 +723,15 @@ async def generate(
             # TODO(lvhan) VLM doesn't support input_ids as an argument.
             # Figure out a graceful way to handle the invalid input
             prompt_input = dict(input_ids=input_ids)
+
         if gen_config.max_new_tokens is None:
-            # for interactive endpoint, will try maximum possible token num
-            gen_config.max_new_tokens = max(128, self.session_len - self.id2step[session_id] - len(input_ids))
-        elif self.id2step[session_id] + len(input_ids) + gen_config.max_new_tokens > self.session_len:
-            gen_config.max_new_tokens = max(self.session_len - self.id2step[session_id] - len(input_ids), 128)
-            logger.error(f'Truncate max_new_tokens to {gen_config.max_new_tokens}')
-        if self.id2step[session_id] + len(input_ids) + gen_config.max_new_tokens > self.session_len:
-            logger.error(f'run out of tokens. session={session_id}.')
-            yield GenOut('', self.id2step[session_id], len(input_ids), 0, 'length')
-            if sequence_end is True and sequence_start is False:
-                await self.end_session(session_id)
-            return
+            gen_config.max_new_tokens = max(0, self.session_len - self.id2step[session_id] - len(input_ids))
+            if gen_config.max_new_tokens == 0:
+                logger.error(f'run out of tokens. session={session_id}.')
+                yield GenOut('', self.id2step[session_id], len(input_ids), 0, 'length')
+                if sequence_end is True and sequence_start is False:
+                    await self.end_session(session_id)
+                return
 
         def is_error(status):
             return status not in [ResponseType.SUCCESS, ResponseType.FINISH]
@@ -826,8 +823,7 @@ def is_error(status):
                 metrics_processor.increment_finished_requests()
 
                 if not is_error(outputs.status):
-                    finish_reason = 'length' \
-                        if gen_len >= gen_config.max_new_tokens else 'stop'
+                    finish_reason = 'stop' if outputs.token_ids[-1] in stop_ids else 'length'
                     # utf-8 char at the end means it's a potential unfinished
                     # byte sequence
                     if not response.endswith('�'):
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
@@ -686,6 +686,7 @@ async def async_stream_infer(self,
                 if status in [7, 8]:  # finish / canceled
                     finish, status = True, 0
                 elif status:
+                    logger.error(f'internal error. status_code {status}')
                     yield self._get_error_output()
                     break