33"""
44Patch-Perplexity (P3L)
55
6- This is a script that produces a realistic PPL measurement
7- for the quantized KV cache system by processing a sequence of
8- non-overlapping patches of the reference text. Generation of the
6+ This is a script that produces a realistic PPL measurement
7+ for the quantized KV cache system by processing a sequence of
8+ non-overlapping patches of the reference text. Generation of the
99consecutive symbols in each patch is governed (forced)
1010by the reference text.
1111
12- The initial context size for the system is set by the parameter
12+ The initial context size for the system is set by the parameter
1313"--context-size".
1414
15- The number of output symbols to generate starting from a given
16- context is set by the parameter "--sample-size". This variable also
15+ The number of output symbols to generate starting from a given
16+ context is set by the parameter "--sample-size". This variable also
1717defines the size of the individual patch.
1818
19- For the N-token reference text that is split into M patches with the
19+ For the N-token reference text that is split into M patches with the
2020system's context size C it takes M*preload + (N-C)*generation time.
2121
2222Quick correctness validation tips:
2323
24- Running llama-2-7b model
25- (
26- ./vllm/examples/P3L.py
27- --model=meta-llama/Llama-2-7b-chat-hf
28- --context-size=1024
24+ Running llama-2-7b model
25+ (
26+ ./vllm/examples/P3L.py
27+ --model=meta-llama/Llama-2-7b-chat-hf
28+ --context-size=1024
2929 --sample-size=512
3030)
3131should result in PPL ~ 6.524227946419175
3232
33- Running llama-2-7b model
34- (
35- ./vllm/examples/P3L.py
36- --model=meta-llama/Llama-2-7b-chat-hf
37- --context-size=1024
33+ Running llama-2-7b model
34+ (
35+ ./vllm/examples/P3L.py
36+ --model=meta-llama/Llama-2-7b-chat-hf
37+ --context-size=1024
3838 --sample-size=512
3939 --patch-size=1
4040)
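
A quick sanity check of the patch arithmetic described in the docstring above. This is a minimal sketch that assumes a hypothetical reference text of 10,240 tokens; the actual tokenized length of wiki.test.raw differs:

import math

n_tokens = 10_240      # N: length of the tokenized reference text (assumed)
context_size = 1024    # C: --context-size
sample_size = 512      # --sample-size, i.e. the size of one patch

# Same expression the script uses further down for my_n_patches.
n_patches = math.ceil((n_tokens - context_size - 1) / sample_size)
print(n_patches)                # 18 patches (M)
print(n_tokens - context_size)  # 9216 forced-generation steps, the (N-C) term
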
@@ -58,17 +58,20 @@
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
 
 logger = init_logger(__name__)
 
 
 def get_wikitext2_text(tokenizer):
     with tempfile.TemporaryDirectory() as tmpdirname:
-        hf_hub_download(repo_id='alexei-v-ivanov-amd/wiki',
-                        repo_type="dataset",
-                        filename='wiki.test.raw',
-                        local_dir=tmpdirname)
-        with open(os.path.join(tmpdirname, 'wiki.test.raw')) as f:
+        hf_hub_download(
+            repo_id="alexei-v-ivanov-amd/wiki",
+            repo_type="dataset",
+            filename="wiki.test.raw",
+            local_dir=tmpdirname,
+        )
+        with open(os.path.join(tmpdirname, "wiki.test.raw")) as f:
             test_text = "\n".join(line.strip() for line in f)
         test_enc = tokenizer(test_text)
 
@@ -79,15 +82,17 @@ def vllm_init(args):
     engine_args = EngineArgs.from_cli_args(args)
     llm = LLM(**dataclasses.asdict(engine_args))
 
-    sampling_params = SamplingParams(n=1,
-                                     temperature=0.0,
-                                     top_p=1,
-                                     ignore_eos=True,
-                                     ppl_measurement=True,
-                                     future_context=[],
-                                     prompt_logprobs=1,
-                                     logprobs=1,
-                                     presence_penalty=0.0)
+    sampling_params = SamplingParams(
+        n=1,
+        temperature=0.0,
+        top_p=1,
+        ignore_eos=True,
+        ppl_measurement=True,
+        future_context=[],
+        prompt_logprobs=1,
+        logprobs=1,
+        presence_penalty=0.0,
+    )
 
     return llm, sampling_params
 
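
Note: `ppl_measurement` and `future_context` are not fields of upstream vLLM's SamplingParams; they look like fork-side extensions that force greedy decoding (temperature=0.0, top_p=1) along the reference tokens while logprobs are collected. As a rough sketch of the same idea on stock vLLM, the reference tokens can instead be scored as a prompt via `prompt_logprobs`; the class and attribute names below are assumed from upstream vLLM and may differ between versions:

from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt


def forced_logprob_sum(llm: LLM, token_ids: list[int]) -> float:
    """Sum log p(token_i | tokens_<i) over a token sequence scored as a prompt."""
    params = SamplingParams(max_tokens=1, prompt_logprobs=1, temperature=0.0)
    out = llm.generate(TokensPrompt(prompt_token_ids=token_ids), params)[0]
    total = 0.0
    # prompt_logprobs[0] is None: there is no prediction for the first token.
    for tok, lp in zip(token_ids[1:], out.prompt_logprobs[1:]):
        total += lp[tok].logprob
    return total
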
@@ -98,7 +103,6 @@ def vllm_predict(CONT, llm, sampl_par):
 
 
 def main(args: argparse.Namespace):
-
     MESSAGE = f"Initialising @ {datetime.datetime.now()}"
     logger.info(MESSAGE)
     print(MESSAGE)
@@ -112,14 +116,17 @@ def main(args: argparse.Namespace):
 
     my_n_samples = args.sample_size
 
-    if (args.context_size + my_n_samples) > \
-        my_llm.llm_engine.model_config.max_model_len:
-        MESSAGE = ("" \
-                   "Error! The total number of tokens:\n" \
-                   f" prefix ({args.context_size}) + " \
-                   f"to be generated ({my_n_samples})" \
-                   f" can't be bigger than the model limit " \
-                   f"({my_llm.llm_engine.model_config.max_model_len}).")
+    if (
+        args.context_size + my_n_samples
+    ) > my_llm.llm_engine.model_config.max_model_len:
+        MESSAGE = (
+            ""
+            "Error! The total number of tokens:\n"
+            f" prefix ({args.context_size}) + "
+            f"to be generated ({my_n_samples})"
+            f" can't be bigger than the model limit "
+            f"({my_llm.llm_engine.model_config.max_model_len})."
+        )
         logger.info(MESSAGE)
         print(MESSAGE)
         return
@@ -128,26 +135,28 @@ def main(args: argparse.Namespace):
     logger.info("Loaded the test data.")
 
     my_n_patches = math.ceil(
-        (len(my_test_enc['input_ids']) - args.context_size - 1) / my_n_samples)
+        (len(my_test_enc["input_ids"]) - args.context_size - 1) / my_n_samples
+    )
     if args.patch_size is not None:
         my_n_patches = args.patch_size
 
     num_tokens_generated = 0
     starting_time = datetime.datetime.now()
-    MESSAGE = (f"Starting generation @ {starting_time}\n" \
-               " Have the test sample of "
-               f"{len(my_test_enc['input_ids'])} tokens" \
-               f" will try to process {my_n_patches} patche(s)," \
-               f" generating {my_n_samples} tokens in each patch" \
-               f" from the initial context of {args.context_size} tokens.")
+    MESSAGE = (
+        f"Starting generation @ {starting_time}\n"
+        " Have the test sample of "
+        f"{len(my_test_enc['input_ids'])} tokens"
+        f" will try to process {my_n_patches} patche(s),"
+        f" generating {my_n_samples} tokens in each patch"
+        f" from the initial context of {args.context_size} tokens."
+    )
 
     logger.info(MESSAGE)
     print(MESSAGE)
 
     my_batchsize = args.batch_size
 
     for c in range(0, my_n_patches, my_batchsize):
-
         CONTEXT = []
         my_sampl_par.future_context = []
         my_sampl_par.cntr = []
@@ -156,53 +165,68 @@ def main(args: argparse.Namespace):
             if (c + b) < my_n_patches:
                 upper_boundary = min(
                     (c + b + 1) * my_n_samples + args.context_size,
-                    len(my_test_enc['input_ids']))
+                    len(my_test_enc["input_ids"]),
+                )
                 CONTEXT.append(
-                    my_test_enc['input_ids'][(c + b) * my_n_samples:(c + b) *
-                                             my_n_samples + args.context_size])
+                    my_test_enc["input_ids"][
+                        (c + b) * my_n_samples : (c + b) * my_n_samples
+                        + args.context_size
+                    ]
+                )
 
                 my_sampl_par.future_context.append(
-                    my_test_enc['input_ids'][(c + b) * my_n_samples +
-                                             args.context_size:upper_boundary])
+                    my_test_enc["input_ids"][
+                        (c + b) * my_n_samples + args.context_size : upper_boundary
+                    ]
+                )
 
                 my_sampl_par.cntr.append(c + b)
 
         my_sampl_par.max_tokens = max(
-            len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT)))
+            len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT))
+        )
 
         LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par)
         for b in range(len(CONTEXT)):
             num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids)
             my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob
 
-        if (num_tokens_generated < my_n_samples * len(CONTEXT)):
-            MESSAGE = (f"Warning: The number of generated tokens is" \
-                       f"less than requested ({num_tokens_generated}" \
-                       f" < {my_n_samples * len(CONTEXT)}).")
+        if num_tokens_generated < my_n_samples * len(CONTEXT):
+            MESSAGE = (
+                f"Warning: The number of generated tokens is"
+                f"less than requested ({num_tokens_generated}"
+                f" < {my_n_samples * len(CONTEXT)})."
+            )
             logger.info(MESSAGE)
             print(MESSAGE)
 
-        MESSAGE = (f"Iterations {c + 1} through {c + len(CONTEXT)}" \
-                   f" of {my_n_patches} Intermediate " \
-                   "Estimates:\n" \
-                   f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n" \
-                   f"\tPerplexity_intermediate=" \
-                   f"{math.exp(my_ppl / num_tokens_generated)}")
+        MESSAGE = (
+            f"Iterations {c + 1} through {c + len(CONTEXT)}"
+            f" of {my_n_patches} Intermediate "
+            "Estimates:\n"
+            f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n"
+            f"\tPerplexity_intermediate="
+            f"{math.exp(my_ppl / num_tokens_generated)}"
+        )
 
         logger.info(MESSAGE)
         print(MESSAGE)
 
     ending_time = datetime.datetime.now()
-    MESSAGE = (f"Done @ {ending_time} after processing for" \
-               f" {ending_time - starting_time}" \
-               f" generated {num_tokens_generated} tokens.")
+    MESSAGE = (
+        f"Done @ {ending_time} after processing for"
+        f" {ending_time - starting_time}"
+        f" generated {num_tokens_generated} tokens."
+    )
 
     logger.info(MESSAGE)
     print(MESSAGE)
 
-    MESSAGE = (f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy=" \
-               f"{my_ppl / num_tokens_generated}" \
-               f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}")
+    MESSAGE = (
+        f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy="
+        f"{my_ppl / num_tokens_generated}"
+        f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}"
+    )
 
     if args.output_json:
         results = {
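
The accumulation in the hunk above is the usual perplexity bookkeeping: `my_ppl` ends up as the negative sum of the forced tokens' logprobs, and the reported PPL is the exponential of that sum averaged per generated token. A tiny self-contained check of the arithmetic with made-up numbers:

import math

# Pretend cumulative logprobs returned for two forced patches (made-up values).
cumulative_logprobs = [-310.7, -295.3]    # sum of log p(token) per patch
tokens_per_patch = [512, 512]

integral_ce = -sum(cumulative_logprobs)       # "Integral Cross-Entropy"
avg_ce = integral_ce / sum(tokens_per_patch)  # "Average Cross-Entropy"
print(avg_ce, math.exp(avg_ce))               # 0.591796875  ~1.807 (the PPL)
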
@@ -219,17 +243,19 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description='Measure the PPPL (P3L) score of a given model.')
-    parser.add_argument('--context-size', type=int, default=4096)
-    parser.add_argument('--sample-size', type=int, default=512)
-    parser.add_argument('--batch-size', type=int, default=1)
-    parser.add_argument('--patch-size', type=int, default=None)
+    parser = FlexibleArgumentParser(
+        description="Measure the PPPL (P3L) score of a given model."
+    )
+    parser.add_argument("--context-size", type=int, default=4096)
+    parser.add_argument("--sample-size", type=int, default=512)
+    parser.add_argument("--batch-size", type=int, default=1)
+    parser.add_argument("--patch-size", type=int, default=None)
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
         default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
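
For reference, a small sketch of the parser pattern the `__main__` block now uses: the script's own flags and vLLM's engine flags share a single command line, and `EngineArgs.from_cli_args` (called in `vllm_init` above) recovers the engine configuration from the parsed namespace. Exact flag coverage depends on the installed vLLM version:

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="P3L-style flag parsing sketch")
parser.add_argument("--context-size", type=int, default=1024)
parser = EngineArgs.add_cli_args(parser)

# Script flags and engine flags (e.g. --model) are parsed together.
args = parser.parse_args(
    ["--model=meta-llama/Llama-2-7b-chat-hf", "--context-size=1024"]
)
engine_args = EngineArgs.from_cli_args(args)
print(args.context_size, engine_args.model)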