Yet another fix for inference server

odelalleau · odelalleau · commit 647d7cc8260c · 2025-03-10T20:11:29.000-07:00
diff --git a/nemo/collections/nlp/modules/common/text_generation_server.py b/nemo/collections/nlp/modules/common/text_generation_server.py
@@ -286,16 +286,20 @@ def chat_completion(self, data):
 
         # Remove suffix.
         eot = special_tokens['end_of_turn']
-        for e in end_strings:
-            # This code is meant to be somewhat generic (even if the above code is not):
-            #   - If we stop on "end_of_turn", then we strip "end_of_turn" (ex: "<|eot_id|>")
-            #   - If we stop on an end string that follows "end_of_turn", then we strip both "end_of_turn"
-            #     and that end string (ex: "\n<extra_id_1>")
-            suffix = e if e == eot else (eot + e)
-            # The loop is very Llama-Instruct-specific, due to how "<|eot_id|>" is also the padding
-            # EOS token => it may be present multiple times.
-            while output_sentence.endswith(suffix):
-                output_sentence = output_sentence.removesuffix(suffix)
+        done = False
+        while not done:
+            done = True
+            for e in end_strings:
+                # This code is meant to be somewhat generic (even if the above code is not):
+                #   - If we stop on "end_of_turn", then we strip "end_of_turn" (ex: "<|eot_id|>")
+                #   - If we stop on an end string that follows "end_of_turn", then we strip both "end_of_turn"
+                #     and that end string (ex: "\n<extra_id_1>")
+                suffix = e if e == eot else (eot + e)
+                # The inner loop is very Llama-Instruct-specific: "<|eot_id|>" is also the padding EOS token, so it
+                # may be present multiple times. The outer loop re-runs the pass until no suffix is stripped.
+                while output_sentence.endswith(suffix):
+                    output_sentence = output_sentence.removesuffix(suffix)
+                    done = False
 
         print(f"TRIMMED OUTPUT:\n```{output_sentence}```")