
Commit 92389d0 (parent 7084c1b): modify args

File tree

3 files changed: +11 −4 lines

recipes/experimental/long-context/H2O/README.md
Lines changed: 2 additions & 2 deletions

````diff
@@ -14,13 +14,13 @@ More details please refer to Paper: **https://arxiv.org/pdf/2306.14048**; Blog:
 
 ### Evaluation on Summarization Tasks
 
-The following example runs inference of Llama-2-7b on XSUM summarization tasks. We're using `--enable_h2o_generation` to enable the H2O algorithm, which keeps only the heavy-hitter and the local KV pairs. Use `--num_heavy_hitter_tokens` to set the number of heavy-hitter KV pairs and `--num_window_length` for the KV cache size; the number of local KV pairs equals `num_window_length - num_heavy_hitter_tokens`. Also, use `--enable_position_rolling` to assign positions within the KV cache instead of the positions in the original sequence. Enabling positional rolling is important when the sequence length exceeds the pretrained context window, e.g., 4K in Llama-2.
+The following example runs inference of Llama-2-7b and Meta-Llama-3-8B on XSUM summarization tasks. We're using `--enable_h2o_generation` to enable the H2O algorithm, which keeps only the heavy-hitter and the local KV pairs. Use `--num_window_length` to set the KV cache size; by default, heavy-hitter and local KV pairs each take half of `--num_window_length` (optionally, the number of heavy hitters can be set explicitly with `--num_heavy_hitter_tokens`). Also, use `--enable_position_rolling` to assign positions within the KV cache instead of the positions in the original sequence. Enabling positional rolling is important when the sequence length exceeds the pretrained context window, e.g., 8K in Llama-3.
 
 ```
 python run_summarization.py \
 --input-path data/summarization/xsum.jsonl \
 --output-path summarization_output/xsum_h2o.jsonl \
---model-name meta-llama/Llama-2-7b-hf \
+--model-name meta-llama/Meta-Llama-3-8B \
 --enable_h2o_generation
 ```
````
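The README change above states that heavy-hitter and local KV pairs each take half of `--num_window_length` unless overridden. A minimal sketch of that budget split, with a hypothetical helper name (not part of the recipe), assuming the `-1` sentinel introduced by this commit:

```python
def split_kv_budget(num_window_length, num_heavy_hitter_tokens=-1):
    """Split a KV-cache budget into heavy-hitter and local slots.

    With the sentinel -1 (the new default), heavy hitters take half of
    the window, mirroring the behavior described in the README.
    """
    if num_heavy_hitter_tokens == -1:
        num_heavy_hitter_tokens = num_window_length // 2
    # Local (most recent) KV pairs fill the remainder of the window.
    num_local_tokens = num_window_length - num_heavy_hitter_tokens
    return num_heavy_hitter_tokens, num_local_tokens
```

With the default window of 256 this yields 128 heavy-hitter and 128 local slots; an explicit `--num_heavy_hitter_tokens` shifts the balance while keeping the total cache size fixed.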

recipes/experimental/long-context/H2O/run_needle_haystack_test.py
Lines changed: 5 additions & 1 deletion

```diff
@@ -30,7 +30,7 @@ def set_seed(args):
 parser.add_argument("--model-name", type=str, default="")
 
 parser.add_argument("--enable_h2o_generation", action='store_true')
-parser.add_argument("--num_heavy_hitter_tokens", type=int, default=128)
+parser.add_argument("--num_heavy_hitter_tokens", type=int, default=-1)
 parser.add_argument("--num_window_length", type=int, default=256)
 parser.add_argument("--num_chunk_size", type=int, default=2048)

@@ -53,6 +53,10 @@ def set_seed(args):
 config = AutoConfig.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
 
+if args.num_heavy_hitter_tokens == -1:
+    print('Number of heavy hitter tokens not assigned; using half of the cache size: {}'.format(args.num_window_length // 2))
+    args.num_heavy_hitter_tokens = args.num_window_length // 2
+
 if args.enable_h2o_generation:
     config.num_heavy_hitter_tokens = args.num_heavy_hitter_tokens
     config.num_window_length = args.num_window_length
```
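The diff above swaps the fixed default of 128 for a `-1` sentinel that is resolved after parsing. A self-contained sketch of that argparse pattern, standalone rather than taken from the script (here parsed with an empty argv so the defaults apply):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num_heavy_hitter_tokens", type=int, default=-1)
parser.add_argument("--num_window_length", type=int, default=256)

# Empty argv: both defaults are used, leaving the sentinel in place.
args = parser.parse_args([])

# Resolve the sentinel: fall back to half of the cache size.
if args.num_heavy_hitter_tokens == -1:
    args.num_heavy_hitter_tokens = args.num_window_length // 2
```

The advantage of a sentinel over a hard-coded default of 128 is that the fallback tracks whatever `--num_window_length` the user passes, instead of silently desynchronizing from it.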

recipes/experimental/long-context/H2O/run_summarization.py
Lines changed: 4 additions & 1 deletion

```diff
@@ -32,7 +32,7 @@ def set_seed(args):
 parser.add_argument("--model-name", type=str, default="")
 
 parser.add_argument("--enable_h2o_generation", action='store_true')
-parser.add_argument("--num_heavy_hitter_tokens", type=int, default=128)
+parser.add_argument("--num_heavy_hitter_tokens", type=int, default=-1)
 parser.add_argument("--num_window_length", type=int, default=256)
 
 parser.add_argument("--enable_position_rolling", action='store_true')

@@ -51,6 +51,9 @@ def set_seed(args):
 
 config = AutoConfig.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+if args.num_heavy_hitter_tokens == -1:
+    print('Number of heavy hitter tokens not assigned; using half of the cache size: {}'.format(args.num_window_length // 2))
+    args.num_heavy_hitter_tokens = args.num_window_length // 2
 
 if args.enable_h2o_generation:
     config.num_heavy_hitter_tokens = args.num_heavy_hitter_tokens
```
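Both scripts then copy the resolved values onto the model config only when `--enable_h2o_generation` is set. A minimal mock of that plumbing, using a plain namespace as a hypothetical stand-in for the Hugging Face `AutoConfig` object:

```python
from types import SimpleNamespace

def apply_h2o_config(config, args):
    """Copy H2O cache settings onto the model config when enabled."""
    if args.enable_h2o_generation:
        config.num_heavy_hitter_tokens = args.num_heavy_hitter_tokens
        config.num_window_length = args.num_window_length
    return config

args = SimpleNamespace(enable_h2o_generation=True,
                       num_heavy_hitter_tokens=128,
                       num_window_length=256)
config = apply_h2o_config(SimpleNamespace(), args)
```

Gating the assignment on the flag keeps the config untouched for the default (full-cache) generation path.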
