@@ -53,7 +53,6 @@ def __init__(
         max_batch_size: int,
         use_kv_cache: bool,
         vocab_size: int,
-        has_full_logits: bool = False,
         device: str = "cpu",
     ):
         """
@@ -64,14 +63,12 @@ def __init__(
             max_batch_size: max batch size.
             use_kv_cache: whether to use a KV cache.
             vocab_size: number of items in the vocab.
-            has_full_logits: whether the model returns the full logits or only returns the last logit.
             device: device to run the runner on.
         """
         self.max_seq_len = max_seq_len
         self.max_batch_size = max_batch_size
         self.use_kv_cache = use_kv_cache
         self.tokenizer = get_tokenizer(tokenizer_path)
-        self.has_full_logits = has_full_logits
         self.device = device
         assert vocab_size == self.tokenizer.n_words

@@ -102,10 +99,7 @@ def generate( # noqa: C901
             ),
         )

-        if self.has_full_logits:
-            current_token = next_token(logits[:, -1, :], temperature, top_p)
-        else:
-            current_token = next_token(logits, temperature, top_p)
+        current_token = next_token(logits, temperature, top_p)
         print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
         tokens = prompt_tokens + [current_token]

@@ -127,10 +121,7 @@ def generate( # noqa: C901
         )

         # If the logits aren't already clipped to only contain the last logit, clip them.
-        if self.has_full_logits:
-            current_token = next_token(logits[:, -1, :], temperature, top_p)
-        else:
-            current_token = next_token(logits, temperature, top_p)
+        current_token = next_token(logits, temperature, top_p)
         tokens.append(current_token)

         if current_token == self.tokenizer.eos_id or (
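For reference, after this change `next_token` is always handed logits that already correspond to the last position only, since the `has_full_logits` branch that sliced `logits[:, -1, :]` is gone. Below is a minimal sketch of what a temperature/top-p sampler with that shape might look like; it assumes 1-D logits of size `vocab_size` and standard nucleus sampling, and the helper name and signature simply mirror the call sites above. It is illustrative, not the repository's verified implementation:

```python
import torch

def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
    """Sample one token id from last-position logits of shape (vocab_size,)."""
    if temperature <= 0:
        # Greedy decoding when sampling is effectively disabled.
        return int(torch.argmax(logits, dim=-1).item())
    probs = torch.softmax(logits / temperature, dim=-1)
    # Nucleus (top-p) filtering: keep the smallest set of highest-probability
    # tokens whose cumulative mass covers top_p, then renormalize.
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    sorted_probs[cumulative - sorted_probs > top_p] = 0.0
    sorted_probs = sorted_probs / sorted_probs.sum()
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return int(sorted_idx[choice].item())
```

Falling back to argmax at zero temperature avoids dividing by zero and is a common convention in generation runners.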