
Commit d586c39

Mbatch p3l (ROCm#401)
* Enabling P3L.py & P3L_mling.py tests to run with multiple batched queries.

  This alteration adds minimal measurement noise. The underlying testing material is the same, and the resulting measurements are comparable to the old (BS=1) testing runs.

  Signed-off-by: Alexei V. Ivanov <[email protected]>

* Making linters happy.

  Signed-off-by: Alexei V. Ivanov <[email protected]>

* Changed the device specification for the 'forced_sample' tensor.

  The resulting implementation produces identical measurements and is actually faster (3.21 s/it vs 3.42 s/it with the previous commit).

  Signed-off-by: Alexei V. Ivanov <[email protected]>

* Fixing reporting to reflect processed intervals.

  Signed-off-by: Alexei V. Ivanov <[email protected]>

---------

Signed-off-by: Alexei V. Ivanov <[email protected]>
1 parent b43c8d1 commit d586c39
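The comparability claim above (batched runs vs. the old BS=1 runs) follows from how the benchmark aggregates results: cumulative log-probabilities and generated-token counts are summed across patches, and those sums do not depend on how the patches are grouped into batches. Below is a minimal sketch of that aggregation; the numbers are made up and stand in for what vllm_predict would return per patch.

```python
import math

# Hypothetical per-patch results: (cumulative_logprob, generated_token_count).
patches = [(-1200.5, 512), (-1175.0, 512), (-1230.8, 512), (-1190.2, 512)]

def ppl(nll, tokens):
    """Perplexity from accumulated negative log-likelihood and token count."""
    return math.exp(nll / tokens)

# BS=1: accumulate patch by patch, mirroring `my_ppl -= cumulative_logprob`.
nll_bs1 = -sum(lp for lp, _ in patches)
tok_bs1 = sum(n for _, n in patches)

# BS=2: the same patches grouped into batches; the partial sums add up to the
# same totals, so the final perplexity is unchanged.
batches = [patches[0:2], patches[2:4]]
nll_bs2 = sum(-sum(lp for lp, _ in batch) for batch in batches)
tok_bs2 = sum(sum(n for _, n in batch) for batch in batches)

assert math.isclose(ppl(nll_bs1, tok_bs1), ppl(nll_bs2, tok_bs2))
print(ppl(nll_bs1, tok_bs1))
```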

File tree

4 files changed: +97 −51 lines changed

benchmarks/P3L.py
benchmarks/P3L_mling.py
vllm/model_executor/layers/sampler.py
vllm/sampling_params.py


benchmarks/P3L.py

Lines changed: 40 additions & 21 deletions
@@ -40,6 +40,9 @@
 )
 should result in PPL ~ PPL=3.8968611189957523
 
+Running the script with multiple batches is possible
+by specifying the --batch-size parameter.
+
 """
 
 import argparse
@@ -140,36 +143,55 @@ def main(args: argparse.Namespace):
 
     logger.info(MESSAGE)
     print(MESSAGE)
-    for c in range(my_n_patches):
+
+    my_batchsize = args.batch_size
+
+    for c in range(0, my_n_patches, my_batchsize):
+
         CONTEXT = []
         my_sampl_par.future_context = []
-        CONTEXT.append(
-            my_test_enc['input_ids'][c * my_n_samples:c * my_n_samples +
-                                     args.context_size])
-        upper_boundary = min((c + 1) * my_n_samples + args.context_size,
-                             len(my_test_enc['input_ids']))
-        my_sampl_par.future_context.append(
-            my_test_enc['input_ids'][c * my_n_samples +
-                                     args.context_size:upper_boundary])
-        my_sampl_par.max_tokens = len(my_sampl_par.future_context[0])
-        my_sampl_par.cntr = c
+        my_sampl_par.cntr = []
+
+        for b in range(my_batchsize):
+            if (c + b) < my_n_patches:
+                upper_boundary = min(
+                    (c + b + 1) * my_n_samples + args.context_size,
+                    len(my_test_enc['input_ids']))
+                CONTEXT.append(
+                    my_test_enc['input_ids'][(c + b) * my_n_samples:(c + b) *
+                                             my_n_samples + args.context_size])
+
+                my_sampl_par.future_context.append(
+                    my_test_enc['input_ids'][(c + b) * my_n_samples +
+                                             args.context_size:upper_boundary])
+
+                my_sampl_par.cntr.append(c + b)
+
+        my_sampl_par.max_tokens = max(
+            len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT)))
+
         LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par)
-        num_tokens_generated += len(LOGPROBS[0].outputs[0].token_ids)
-        if (num_tokens_generated < my_n_samples):
+        for b in range(len(CONTEXT)):
+            num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids)
+            my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob
+
+        if (num_tokens_generated < my_n_samples * len(CONTEXT)):
             MESSAGE = (f"Warning: The number of generated tokens is" \
-                       f"less than requested ({num_tokens_generated}" \
-                       f" < {my_n_samples}).")
+                       f"less than requested ({num_tokens_generated}" \
+                       f" < {my_n_samples*len(CONTEXT)}).")
             logger.info(MESSAGE)
             print(MESSAGE)
-        my_ppl -= LOGPROBS[0].outputs[0].cumulative_logprob
-        MESSAGE = (f"Iteration {c+1} of {my_n_patches} Intermediate" \
+
+        MESSAGE = (f"Iterations {c+1} through {c+len(CONTEXT)}" \
+                   " of {my_n_patches} Intermediate" \
                    "Estimates:\n" \
                    f"\tCross-entropy_intermediate={my_ppl/num_tokens_generated}\n" \
                    f"\tPerplexity_intermediate=" \
                    f"{math.exp(my_ppl/num_tokens_generated)}")
 
         logger.info(MESSAGE)
         print(MESSAGE)
+
     ending_time = datetime.datetime.now()
     MESSAGE = (f"Done @ {ending_time} after processing for" \
                f" {ending_time-starting_time}" \
@@ -199,12 +221,9 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description='Measure the PPPL (P3L) score of a given model.')
-    parser.add_argument(
-        '--data',
-        type=str,
-        default='./wikitext/wikitext-2-v1/test-00000-of-00001.parquet')
     parser.add_argument('--context-size', type=int, default=4096)
     parser.add_argument('--sample-size', type=int, default=512)
+    parser.add_argument('--batch-size', type=int, default=1)
     parser.add_argument('--patch-size', type=int, default=None)
     parser.add_argument(
         '--output-json',
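A quick sanity check of the new flag wiring, mirroring only the argparse lines shown above (the real script adds further engine arguments on top of these):

```python
import argparse

parser = argparse.ArgumentParser(
    description='Measure the PPPL (P3L) score of a given model.')
parser.add_argument('--context-size', type=int, default=4096)
parser.add_argument('--sample-size', type=int, default=512)
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--patch-size', type=int, default=None)

# Omitting --batch-size keeps the old single-query behaviour (default=1).
args = parser.parse_args(['--batch-size', '8'])
print(args.batch_size, args.context_size)  # 8 4096
```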

benchmarks/P3L_mling.py

Lines changed: 39 additions & 17 deletions
@@ -52,6 +52,8 @@
 
 for the complete set of possible language-scripture choices.
 
+Running the script with multiple batches is possible
+by specifying the --batch-size parameter.
 
 """
 
@@ -172,36 +174,55 @@ def main(args: argparse.Namespace):
 
     logger.info(MESSAGE)
     print(MESSAGE)
-    for c in range(my_n_patches):
+
+    my_batchsize = args.batch_size
+
+    for c in range(0, my_n_patches, my_batchsize):
+
         CONTEXT = []
         my_sampl_par.future_context = []
-        CONTEXT.append(
-            my_test_enc['input_ids'][c * my_n_samples:c * my_n_samples +
-                                     args.context_size])
-        upper_boundary = min((c + 1) * my_n_samples + args.context_size,
-                             len(my_test_enc['input_ids']))
-        my_sampl_par.future_context.append(
-            my_test_enc['input_ids'][c * my_n_samples +
-                                     args.context_size:upper_boundary])
-        my_sampl_par.max_tokens = len(my_sampl_par.future_context[0])
-        my_sampl_par.cntr = c
+        my_sampl_par.cntr = []
+
+        for b in range(my_batchsize):
+            if (c + b) < my_n_patches:
+                upper_boundary = min(
+                    (c + b + 1) * my_n_samples + args.context_size,
+                    len(my_test_enc['input_ids']))
+                CONTEXT.append(
+                    my_test_enc['input_ids'][(c + b) * my_n_samples:(c + b) *
+                                             my_n_samples + args.context_size])
+
+                my_sampl_par.future_context.append(
+                    my_test_enc['input_ids'][(c + b) * my_n_samples +
+                                             args.context_size:upper_boundary])
+
+                my_sampl_par.cntr.append(c + b)
+
+        my_sampl_par.max_tokens = max(
+            len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT)))
+
         LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par)
-        num_tokens_generated += len(LOGPROBS[0].outputs[0].token_ids)
-        if (num_tokens_generated < my_n_samples):
+        for b in range(len(CONTEXT)):
+            num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids)
+            my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob
+
+        if (num_tokens_generated < my_n_samples * len(CONTEXT)):
             MESSAGE = (f"Warning: The number of generated tokens is" \
-                       f"less than requested ({num_tokens_generated}" \
-                       f" < {my_n_samples}).")
+                       f"less than requested ({num_tokens_generated}" \
+                       f" < {my_n_samples*len(CONTEXT)}).")
             logger.info(MESSAGE)
             print(MESSAGE)
-        my_ppl -= LOGPROBS[0].outputs[0].cumulative_logprob
-        MESSAGE = (f"Iteration {c+1} of {my_n_patches} Intermediate" \
+
+        MESSAGE = (f"Iterations {c+1} through {c+len(CONTEXT)}" \
+                   " of {my_n_patches} Intermediate" \
                    "Estimates:\n" \
                    f"\tCross-entropy_intermediate={my_ppl/num_tokens_generated}\n" \
                    f"\tPerplexity_intermediate=" \
                    f"{math.exp(my_ppl/num_tokens_generated)}")
 
         logger.info(MESSAGE)
         print(MESSAGE)
+
     ending_time = datetime.datetime.now()
     MESSAGE = (f"Done @ {ending_time} after processing for" \
               f" {ending_time-starting_time}" \
@@ -237,6 +258,7 @@ def main(args: argparse.Namespace):
         default='./wikitext/wikitext-2-v1/test-00000-of-00001.parquet')
     parser.add_argument('--context-size', type=int, default=4096)
     parser.add_argument('--sample-size', type=int, default=512)
+    parser.add_argument('--batch-size', type=int, default=1)
     parser.add_argument('--patch-size', type=int, default=None)
     parser.add_argument('--lang-script', type=str, default="eng_Latn")
     parser.add_argument(

vllm/model_executor/layers/sampler.py

Lines changed: 17 additions & 12 deletions
@@ -799,7 +799,6 @@ def _sample_with_torch(
         if sampling_type == SamplingType.GREEDY:
             greedy_samples = torch.argmax(logprobs[long_sample_indices],
                                           dim=-1)
-
             if sampled_token_ids_tensor is not None:
                 # Store sampled tokens in output tensor.
                 sampled_token_ids_tensor[
@@ -842,17 +841,23 @@ def _sample_with_torch(
             sampled_token_ids_tensor[long_sample_indices] = \
                 multinomial_samples[sampling_type].to(torch.long)
         elif sampling_type == SamplingType.FORCED:
-            if (seq_groups[0].sampling_params.future_context is not None):
-                forced_samples = torch.tensor([
-                    seq_groups[0].sampling_params.future_context[0][min(
-                        len(sampling_metadata.seq_groups[0].seq_data[
-                            sampling_params.cntr].output_token_ids),
-                        len(seq_groups[0].sampling_params.future_context[0]) -
-                        1)]
-                ])
-            else:
-                forced_samples = torch.argmax(logprobs[long_sample_indices],
-                                              dim=-1)
+            forced_samples = torch.tensor([], dtype=torch.int32)
+            for sgidx in range(len(seq_groups)):
+                if (seq_groups[sgidx].sampling_params.future_context
+                        is not None):
+                    forced_sample = torch.tensor([
+                        seq_groups[sgidx].sampling_params.future_context[sgidx]
+                        [min(
+                            len(sampling_metadata.seq_groups[sgidx].seq_data[
+                                sampling_params.cntr[sgidx]].output_token_ids),
+                            len(seq_groups[sgidx].sampling_params.
+                                future_context[sgidx]) - 1)]
+                    ])
+                else:
+                    forced_sample = torch.argmax(logprobs[long_sample_indices],
+                                                 dim=-1)
+                forced_samples = torch.cat([forced_samples, forced_sample])
+
         elif sampling_type == SamplingType.BEAM:
             beam_search_logprobs = logprobs[sample_indices]
         else:
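The rewritten FORCED branch builds one forced token per sequence group and concatenates them, instead of handling only seq_groups[0]. Below is a toy reproduction of that control flow, not the vLLM code itself; future_contexts and num_output_tokens are hypothetical stand-ins for the per-group sampling params and sequence data.

```python
import torch

future_contexts = [[11, 12, 13], [21, 22], None]   # None -> fall back to argmax
num_output_tokens = [1, 5, 0]                       # tokens generated so far
logprobs = torch.randn(3, 8)                        # fake [groups, vocab] scores

forced_samples = torch.tensor([], dtype=torch.int32)
for idx, ctx in enumerate(future_contexts):
    if ctx is not None:
        # Pick the next ground-truth token, clamping at the end of the context.
        pos = min(num_output_tokens[idx], len(ctx) - 1)
        forced_sample = torch.tensor([ctx[pos]], dtype=torch.int32)
    else:
        # No future context for this group: take the model's argmax instead.
        forced_sample = torch.argmax(logprobs[idx:idx + 1], dim=-1).to(torch.int32)
    forced_samples = torch.cat([forced_samples, forced_sample])

print(forced_samples)  # one forced token id per sequence group
```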

vllm/sampling_params.py

Lines changed: 1 addition & 1 deletion
@@ -183,7 +183,7 @@ class SamplingParams(
     min_p: float = 0.0
     ppl_measurement: bool = False
     future_context: Optional[List[int]] = None
-    cntr: Optional[int] = None
+    cntr: Optional[List[int]] = None
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stop_token_ids: Optional[List[int]] = None
