
Commit 3d5c701

FMBench readme updates for Llama3 on Inf2 and config.yml cleanup (meta-llama#486)
2 parents: c3b2628 + c1c213f · commit 3d5c701

2 files changed: 35 additions & 17 deletions

recipes/benchmarks/fmbench/README.md

Lines changed: 2 additions & 1 deletion
@@ -99,11 +99,12 @@ Llama3 is now available on Bedrock (read [blog post](https://aws.amazon.com/blog
 
 ## 🚨 Benchmarking Llama3 on Amazon SageMaker 🚨
 
-Llama3 is now available on SageMaker (read [blog post](https://aws.amazon.com/blogs/machine-learning/meta-llama-3-models-are-now-available-in-amazon-sagemaker-jumpstart/)), and you can now benchmark it using `FMBench`. Here are the config files for benchmarking `Llama3-8b-instruct` and `Llama3-70b-instruct` on `ml.p4d.24xlarge` and `ml.g5.12xlarge` instance.
+Llama3 is now available on SageMaker (read [blog post](https://aws.amazon.com/blogs/machine-learning/meta-llama-3-models-are-now-available-in-amazon-sagemaker-jumpstart/)), and you can now benchmark it using `FMBench`. Here are the config files for benchmarking `Llama3-8b-instruct` and `Llama3-70b-instruct` on `ml.p4d.24xlarge`, `ml.inf2.24xlarge` and `ml.g5.12xlarge` instances.
 
 <!-- markdown-link-check-disable -->
 - [Config file](https://github.com/aws-samples/foundation-model-benchmarking-tool/blob/main/src/fmbench/configs/config-llama3-8b-instruct-g5-p4d.yml) for `Llama3-8b-instruct` on `ml.p4d.24xlarge` and `ml.g5.12xlarge`.
 - [Config file](https://github.com/aws-samples/foundation-model-benchmarking-tool/blob/main/src/fmbench/configs/config-llama3-70b-instruct-g5-p4d.yml) for `Llama3-70b-instruct` on `ml.p4d.24xlarge` and `ml.g5.48xlarge`.
+- [Config file](https://github.com/aws-samples/foundation-model-benchmarking-tool/blob/main/src/fmbench/configs/config-llama3-8b-inf2-g5.yml) for `Llama3-8b-instruct` on `ml.inf2.24xlarge` and `ml.g5.12xlarge`.
 <!-- markdown-link-check-enable -->
 
 ## Benchmarking Llama2 on Amazon SageMaker
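The config files linked above all follow the layout of `recipes/benchmarks/fmbench/config.yml`, the second file touched by this commit. As orientation for the diff below, here is a sketch of the top-level sections that appear in it; the section names are taken from the hunks in this commit, their order is approximate, and all values are omitted:

```yaml
# Outline only -- not a runnable config; see the linked config files for real values.
aws: {}                   # SageMaker execution role and the S3 write bucket for metrics, plots, reports
dir_paths: {}             # directory paths inside the write bucket
s3_read_data: {}          # read bucket plus scripts/tokenizer/prompt-template prefixes and dataset files
run_steps: {}             # which FMBench notebooks to run (1_generate_data.ipynb ... 5_cleanup.ipynb)
datasets: {}              # prompt template keys and dataset filters
metrics: {}               # dataset_of_interest used for summarizing results
inference_parameters: {}  # named parameter sets referenced by inference_spec.parameter_set
experiments: []           # one entry per model/endpoint/instance type to benchmark
report: {}                # latency, cost-per-10k-transactions and error-rate budgets
```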

recipes/benchmarks/fmbench/config.yml

Lines changed: 33 additions & 16 deletions
@@ -9,7 +9,7 @@ aws:
 # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
 sagemaker_execution_role: {role_arn}
 # S3 bucket to which metrics, plots and reports would be written to
-bucket: {write_bucket} ## add the name of your desired bucket
+bucket: {write_bucket}
 
 # directory paths in the write bucket, no need to change these
 dir_paths:
@@ -22,9 +22,10 @@ dir_paths:
 
 # S3 information for reading datasets, scripts and tokenizer
 s3_read_data:
-# read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-{region}-{account_id}
+# read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-<region>-<account_id>
 read_bucket: {read_bucket}
-
+scripts_prefix: scripts
+
 # S3 prefix in the read bucket where deployment and inference scripts should be placed
 scripts_prefix: scripts
 
@@ -52,13 +53,12 @@ s3_read_data:
 - narrativeqa.jsonl
 - triviaqa_e.jsonl
 - triviaqa.jsonl
-
 # S3 prefix for the tokenizer to be used with the models
 # NOTE 1: the same tokenizer is used with all the models being tested through a config file
 # NOTE 2: place your model specific tokenizers in a prefix named as <model_name>_tokenizer
-# so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer
+# so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer and so on and so forth.
 tokenizer_prefix: tokenizer
-
+
 # S3 prefix for prompt templates
 prompt_template_dir: prompt_template
 
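The comments in this hunk describe how FMBench expects the read bucket to be organized. The sketch below restates that layout; the commented directory view is an assumption pieced together from the prefixes and notes above (the prefix holding the dataset files is not named in this hunk), and `{read_bucket}` is the templatized value from the config:

```yaml
# Assumed layout of the read bucket (default name sagemaker-fmbench-read-<region>-<account_id>):
#
#   scripts/            <- scripts_prefix: deployment and inference scripts
#   tokenizer/          <- tokenizer_prefix: tokenizer used with all models being tested (NOTE 1)
#   llama2_tokenizer/   <- model-specific tokenizers go in <model_name>_tokenizer prefixes (NOTE 2)
#   mistral_tokenizer/
#   prompt_template/    <- prompt_template_dir: prompt templates
#   <dataset prefix>/   <- dataset files such as narrativeqa.jsonl, triviaqa.jsonl
#
s3_read_data:
  read_bucket: {read_bucket}
  scripts_prefix: scripts
  tokenizer_prefix: tokenizer
  prompt_template_dir: prompt_template
```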
@@ -79,7 +79,7 @@ run_steps:
 4_model_metric_analysis.ipynb: yes
 5_cleanup.ipynb: yes
 
-# dataset related configuration
+
 datasets:
 # Refer to the 1_generate_data.ipynb notebook
 # the dataset you use is expected to have the
@@ -89,7 +89,7 @@ datasets:
 prompt_template_keys:
 - input
 - context
-
+
 # if your dataset has multiple languages and it has a language
 # field then you could filter it for a language. Similarly,
 # you can filter your dataset to only keep prompts between
@@ -125,7 +125,7 @@ datasets:
 # dataset which is listed below as the dataset_of_interest
 metrics:
 dataset_of_interest: en_2000-3000
-
+
 # all pricing information is in the pricing.yml file
 # this file is provided in the repo. You can add entries
 # to this file for new instance types and new Bedrock models
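Read together, the datasets and metrics hunks describe prompt-template keys, optional language and prompt-length filters, and the dataset slice the report focuses on. A minimal sketch of that part of the config follows; the `language`, `min_length_in_tokens` and `max_length_in_tokens` key names are assumptions for illustration, since the diff only shows the comment describing the filters:

```yaml
datasets:
  prompt_template_keys:
    - input
    - context
  # Hypothetical filter keys -- the diff only shows the comment that describes
  # language and prompt-length filtering, not the actual key names.
  language: en
  min_length_in_tokens: 2000
  max_length_in_tokens: 3000

metrics:
  # results are summarized for this dataset slice
  dataset_of_interest: en_2000-3000

# pricing for instance types and Bedrock models lives in pricing.yml, shipped with the repo
```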
@@ -156,18 +156,18 @@ experiments:
 # model_id is interpreted in conjunction with the deployment_script, so if you
 # use a JumpStart model id then set the deployment_script to jumpstart.py.
 # if deploying directly from HuggingFace this would be a HuggingFace model id
-# see the DJL serving deployment script in the code repo for reference.
+# see the DJL serving deployment script in the code repo for reference.
 model_id: meta-textgeneration-llama-2-7b-f
 model_version: "3.*"
 model_name: llama2-7b-f
 ep_name: llama-2-7b-g5xlarge
 instance_type: "ml.g5.xlarge"
 image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
-deploy: yes
+deploy: yes
 instance_count: 1
 # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
 # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
-# See repo for details
+# See repo for details
 deployment_script: jumpstart.py
 # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
 # and Bedrock. You can also add your own. See repo for details
@@ -181,14 +181,15 @@ experiments:
 - payload_en_500-1000.jsonl
 - payload_en_1000-2000.jsonl
 - payload_en_2000-3000.jsonl
+#- payload_en_3000-3840.jsonl
 # concurrency level refers to number of requests sent in parallel to an endpoint
 # the next set of requests is sent once responses for all concurrent requests have
 # been received.
 concurrency_levels:
 - 1
 - 2
 - 4
-# Added for models that require accepting a EULA
+
 accept_eula: true
 # Environment variables to be passed to the container
 # this is not a fixed list, you can add more parameters as applicable.
@@ -204,30 +205,47 @@ experiments:
 SAGEMAKER_MODEL_SERVER_WORKERS: "1"
 
 - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
+# model_id is interpreted in conjunction with the deployment_script, so if you
+# use a JumpStart model id then set the deployment_script to jumpstart.py.
+# if deploying directly from HuggingFace this would be a HuggingFace model id
+# see the DJL serving deployment script in the code repo for reference.
 model_id: meta-textgeneration-llama-2-7b-f
 model_version: "3.*"
 model_name: llama2-7b-f
 ep_name: llama-2-7b-g5-2xlarge
 instance_type: "ml.g5.2xlarge"
 image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
 deploy: yes
+# FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
+# scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
+# See repo for details
 instance_count: 1
 deployment_script: jumpstart.py
+# FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
+# and Bedrock. You can also add your own. See repo for details
 inference_script: sagemaker_predictor.py
 inference_spec:
+# this should match one of the sections in the inference_parameters section above
 parameter_set: sagemaker
+# runs are done for each combination of payload file and concurrency level
 payload_files:
 - payload_en_1-500.jsonl
 - payload_en_500-1000.jsonl
 - payload_en_1000-2000.jsonl
 - payload_en_2000-3000.jsonl
-
+#- payload_en_3000-3840.jsonl
+
+# concurrency level refers to number of requests sent in parallel to an endpoint
+# the next set of requests is sent once responses for all concurrent requests have
+# been received.
 concurrency_levels:
 - 1
 - 2
 - 4
-
+# Added for models that require accepting a EULA
 accept_eula: true
+# Environment variables to be passed to the container
+# this is not a fixed list, you can add more parameters as applicable.
 env:
 SAGEMAKER_PROGRAM: "inference.py"
 ENDPOINT_SERVER_TIMEOUT: "3600"
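Because the diff view flattens indentation, it is easy to lose track of how these keys nest. Reassembling the fields shown in the experiments hunks above, a single `experiments` entry has roughly the following shape; the nesting and indentation are reconstructed for illustration, while the keys and values are the ones visible in the diff:

```yaml
experiments:
  - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
    # JumpStart model id, so deployment_script is set to jumpstart.py
    model_id: meta-textgeneration-llama-2-7b-f
    model_version: "3.*"
    model_name: llama2-7b-f
    ep_name: llama-2-7b-g5-2xlarge
    instance_type: "ml.g5.2xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
    deploy: yes
    instance_count: 1
    deployment_script: jumpstart.py
    inference_script: sagemaker_predictor.py
    inference_spec:
      # must match one of the sections under inference_parameters
      parameter_set: sagemaker
    # one run per (payload file, concurrency level) combination
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
    concurrency_levels:
      - 1
      - 2
      - 4
    accept_eula: true   # required for models gated behind a EULA
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
```

Each additional endpoint or instance type to benchmark, such as the `ml.inf2.24xlarge` configurations referenced in the README above, gets its own entry in this `experiments` list.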
@@ -249,7 +267,6 @@ report:
 latency_budget: 2
 cost_per_10k_txn_budget: 20
 error_rate_budget: 0
-
 # other misc reporting parameters, see 4_model_metric_analysis.ipynb
 # for more information
 per_inference_request_file: per_inference_request_results.csv
