Commit 4b7475b
Match server scenario to standalone implementation (#2086)
1 parent c2a0117 · commit 4b7475b

File tree: 4 files changed, +28 −27 lines

.github/workflows/build_wheels.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -3,6 +3,7 @@ name: Build loadgen wheels and release them into PYPI
 on:
   release:
     types: [published]
+
   push:
     branches:
       - master
```

compliance/nvidia/TEST01/verify_performance.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -90,7 +90,8 @@ def main():
            test_mode = line.split(": ", 1)[1].strip()
            continue
        if test_mode == "SingleStream":
-            if re.match(".*Early stopping (90th|99.9th) percentile estimate", line):
+            if re.match(
+                    ".*Early stopping (90th|99.9th) percentile estimate", line):
                test_score = line.split(": ", 1)[1].strip()
                continue
 
```
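This change only reflows a long `if re.match(...)` onto two lines; the pattern and the score extraction are unchanged. For reference, a minimal sketch of what the check does to a loadgen summary line (the sample line and value below are illustrative, not taken from a real log):

```python
import re

# Illustrative summary line in the style verify_performance.py parses;
# the numeric value is made up.
line = "Early stopping 90th percentile estimate: 1043853"

# Same pattern and extraction as the script: match the early-stopping
# estimate line, then take everything after the first ": " as the score.
if re.match(".*Early stopping (90th|99.9th) percentile estimate", line):
    test_score = line.split(": ", 1)[1].strip()
    print(test_score)  # prints "1043853"
```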

language/mixtral-8x7b/README.md

Lines changed: 2 additions & 0 deletions

````diff
@@ -250,6 +250,8 @@ python -u evaluate-accuracy.py --checkpoint-path [path_to_model_checkpoint] \
 
 ## Accuracy Target
 
+**WARNING:** The full accuracy target was only verified with the standalone script. The reference implementation matches on a subset of the dataset, but hasn't been fully confirmed.
+
 Reference scores:
 Open Orca:
 ```json
````

language/mixtral-8x7b/SUT.py

Lines changed: 23 additions & 26 deletions

```diff
@@ -119,7 +119,7 @@ def put(self, value):
            self.first_token.put((value, self.response_ids[0]))
 
            self.is_first_token = False
-            return
+
 
        self.tokens_cache.append(value)
 
```
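This first hunk drops the early `return` in the token streamer's `put`, so the first generated token is now appended to `tokens_cache` in addition to being reported for time-to-first-token, matching the standalone implementation's output. A minimal sketch of the resulting behavior (class shape assumed from the diff, not the full SUT.py definition):

```python
import queue

class FirstTokenStreamerSketch:
    # Sketch of the streamer's put() after this commit; attribute names
    # are taken from the diff, everything else is assumed.
    def __init__(self, first_token: queue.Queue, response_ids: list):
        self.first_token = first_token
        self.response_ids = response_ids
        self.is_first_token = True
        self.tokens_cache = []

    def put(self, value):
        if self.is_first_token:
            # Report the first token for time-to-first-token latency.
            self.first_token.put((value, self.response_ids[0]))
            self.is_first_token = False
            # No early return anymore: fall through so the first token
            # is also part of the cached output sequence.
        self.tokens_cache.append(value)
```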

```diff
@@ -356,6 +356,7 @@ def __init__(
        total_sample_count=24576,
        dataset_path=None,
        workers=1,
+        **kwargs,
    ):
 
        super().__init__(
```
```diff
@@ -408,9 +409,13 @@ def process_queries(self):
            if qitem is None:
                break
 
-            input_ids_tensor = self.data_object.input_ids[qitem.index]
-            input_masks_tensor = self.data_object.attention_masks[qitem.index]
-            dataset = self.data_object.dataset_names[qitem.index]
+            input_dataset = [self.data_object.dataset_names[qitem.index]]
+
+            batch_texts = [self.data_object.input_texts[qitem.index]]
+            batch_ids = self.tokenizer.batch_encode_plus(
+                batch_texts, return_tensors="pt", padding=True)
+            batch_ids = batch_ids.to(self.device)
+            _, length = batch_ids.input_ids.shape
 
            # TODO: This PoC is super slow with significant overhead. Best to
            # create a patch to `generate`
```
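With this hunk the server path tokenizes the raw input text per query instead of reusing pre-tokenized tensors, mirroring the standalone script. A minimal sketch of the Hugging Face calls involved (the checkpoint name and prompt are illustrative; the SUT uses whichever tokenizer it was constructed with):

```python
import torch
from transformers import AutoTokenizer

# Illustrative checkpoint, not necessarily the one the SUT loads.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1", padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # required before padding

batch_texts = ["What is the capital of France?"]  # hypothetical prompt
batch_ids = tokenizer.batch_encode_plus(
    batch_texts, return_tensors="pt", padding=True)
batch_ids = batch_ids.to("cuda" if torch.cuda.is_available() else "cpu")

# input_ids has shape (batch, sequence_length); the SUT keeps the
# length so the prompt can be separated from generated tokens later.
_, length = batch_ids.input_ids.shape
```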
```diff
@@ -422,32 +427,24 @@ def process_queries(self):
                response_ids=[qitem.id],
            )
 
-            logits_processor = LogitsProcessorList(
-                [StopAfterSequence(
-                    self.tokenizer.eos_token_id, device=self.device)]
+
+            _ = self.model.generate(
+                **batch_ids,
+                num_return_sequences=1,
+                streamer=tokens_streamer,
+                **gen_kwargs,
            )
-            if dataset == "MBXP":
-                _ = self.model.generate(
-                    input_ids=input_ids_tensor,
-                    attention_mask=input_masks_tensor,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    streamer=tokens_streamer,
-                    logits_processor=logits_processor,
-                    **gen_kwargs,
-                )
-            else:
-                _ = self.model.generate(
-                    input_ids=input_ids_tensor,
-                    attention_mask=input_masks_tensor,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    streamer=tokens_streamer,
-                    **gen_kwargs,
-                )
 
            output_tokens = tokens_streamer.get_out_tokens()
-            n_tokens = len(output_tokens)
+            processed_output = self.data_object.postProcess(
+                torch.tensor([output_tokens], dtype=torch.int64),
+                length=0,
+                query_id_list=[qitem.index],
+                dataset_list=input_dataset,
+            )
+            n_tokens = len(processed_output[0])
            response_array = array.array(
-                "B", np.array(output_tokens, np.int32).tobytes()
+                "B", np.array(processed_output[0], np.int32).tobytes()
            )
            bi = response_array.buffer_info()
            response = [
```
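The last hunk removes the MBXP-only `StopAfterSequence` logits processor and the branch around `generate`: both datasets now go through a single `generate` call, and dataset-specific stop handling happens afterwards in `Dataset.postProcess`, as in the standalone implementation. A minimal sketch of that kind of after-the-fact trimming (a hypothetical helper for illustration; the real logic lives in the benchmark's dataset class):

```python
import torch

def trim_at_stop_sequence(output_ids: torch.Tensor,
                          stop_ids: list) -> torch.Tensor:
    """Cut a 1-D tensor of generated token ids at the first occurrence
    of stop_ids (inclusive). Hypothetical illustration of
    post-generation stop handling, not the actual postProcess code."""
    ids = output_ids.tolist()
    n = len(stop_ids)
    for i in range(len(ids) - n + 1):
        if ids[i:i + n] == stop_ids:
            return output_ids[:i + n]
    return output_ids

# Usage with made-up token ids, where [99, 98] is the stop sequence:
tokens = torch.tensor([5, 7, 99, 98, 3, 4], dtype=torch.int64)
print(trim_at_stop_sequence(tokens, [99, 98]))  # tensor([ 5,  7, 99, 98])
```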
