35 changes: 14 additions & 21 deletions optimum/executorch/modeling.py
@@ -460,29 +460,27 @@ def __init__(
config: "PretrainedConfig",
):
super().__init__(models=models, config=config)
if not hasattr(self, "encoder"):
raise AttributeError("Expected attribute 'encoder' not found in the instance.")
if not hasattr(self, "text_decoder"):
raise AttributeError("Expected attribute 'text_decoder' not found in the instance.")
metadata = self.decoder.method_names()
if not hasattr(self, "model"):
raise AttributeError("Expected attribute 'model' not found in the instance.")
metadata = self.model.method_names()
if "use_kv_cache" in metadata:
self.use_kv_cache = self.decoder.run_method("use_kv_cache")[0]
self.use_kv_cache = self.model.run_method("use_kv_cache")[0]
if "get_max_seq_len" in metadata:
self.max_cache_size = self.decoder.run_method("get_max_seq_len")[0]
self.max_cache_size = self.model.run_method("get_max_seq_len")[0]
if "get_max_batch_size" in metadata:
self.max_batch_size = self.decoder.run_method("get_max_batch_size")[0]
self.max_batch_size = self.model.run_method("get_max_batch_size")[0]
if "get_dtype" in metadata:
self.dtype = self.decoder.run_method("get_dtype")[0]
self.dtype = self.model.run_method("get_dtype")[0]
if "get_bos_id" in metadata:
self.bos_token_id = self.decoder.run_method("get_bos_id")[0]
self.bos_token_id = self.model.run_method("get_bos_id")[0]
if "get_eos_id" in metadata:
self.eos_token_id = self.decoder.run_method("get_eos_id")[0]
self.eos_token_id = self.model.run_method("get_eos_id")[0]
if "get_vocab_size" in metadata:
self.vocab_size = self.decoder.run_method("get_vocab_size")[0]
self.vocab_size = self.model.run_method("get_vocab_size")[0]
if "max_hidden_seq_length" in metadata:
self.max_hidden_seq_length = self.decoder.run_method("max_hidden_seq_length")[0]
self.max_hidden_seq_length = self.model.run_method("max_hidden_seq_length")[0]
if "decoder_start_token_id" in metadata:
self.decoder_start_token_id = self.decoder.run_method("decoder_start_token_id")[0]
self.decoder_start_token_id = self.model.run_method("decoder_start_token_id")[0]

def forward(
self,
@@ -491,15 +489,14 @@ def forward(
cache_position: torch.Tensor,
encoder_outputs: Optional[torch.Tensor] = None,
):
# Encode if needed (first prediction pass)
is_first_prediction = encoder_outputs is None
self.stats.on_model_execution_start()
if is_first_prediction:
encoder_outputs = self.encoder.forward((input_ids,))[0]
encoder_outputs = self.model.run_method("encoder", (input_ids,))[0]
self.stats.on_prompt_eval_end()

result = (
self.decoder.forward((decoder_input_ids, encoder_outputs, cache_position))[0],
self.model.run_method("text_decoder", (decoder_input_ids, encoder_outputs, cache_position))[0],
encoder_outputs,
)
self.stats.on_model_execution_end()
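A short, hedged illustration of the calling contract implied above: on the first pass `encoder_outputs` is `None`, so the program's "encoder" method runs once; every later call reuses the returned `encoder_outputs` and only runs "text_decoder". The `model` instance, token values, tensor shapes, and the ordering of the first two `forward` arguments are placeholders and assumptions, not values taken from this PR.

```python
# Sketch only: `model` is assumed to be an ExecuTorchModelForSeq2SeqLM instance,
# decoder inputs are assumed to be shape (1, 1), and cache_position a 1-element tensor.
import torch

prompt_ids = torch.tensor([[37, 423, 8, 1]])  # hypothetical encoder token ids

# First pass: encoder_outputs=None triggers the "encoder" method.
logits, enc_out = model.forward(prompt_ids, torch.tensor([[0]]), torch.tensor([0]), None)

# Later passes: reuse enc_out; only "text_decoder" runs.
next_id = int(logits[:, -1, :].argmax())
logits, enc_out = model.forward(prompt_ids, torch.tensor([[next_id]]), torch.tensor([1]), enc_out)
```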
@@ -530,9 +527,6 @@ def generate(
Returns:
List[int]: List of generated token IDs.

Note:
Temporarily implemented this method in Python due to limited access to ExecuTorch's c++ LLM runner via pybind.
Expect improvements to the pybind interface in ExecuTorch version 0.4.1.
Comment on lines -533 to -535

Collaborator: We should use LLM runner

Collaborator (Author): T5 is not an LLM though

"""
self.device = torch.device("cpu")
if max_seq_len is None:
@@ -550,7 +544,6 @@
encoder_input_ids = input_ids
encoder_outputs = None
generated_ids = [0]

first_token_generated = False

# Generate tokens one by one
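The loop body is collapsed in this view. For orientation, here is a hedged sketch of what such a token-by-token greedy loop typically looks like, following the initialization shown above (encode once, one decoder token per step, stop at EOS or `max_seq_len`). It is an illustration of the pattern, not this PR's exact implementation; the argument order of `forward` and the logits shape are assumptions.

```python
# Illustrative greedy decoding loop for the exported seq2seq runner (not the PR's exact code).
# Assumes `model.forward` matches the signature shown in the diff above and that
# `model.decoder_start_token_id` and `model.eos_token_id` were read from the program metadata.
import torch

def greedy_generate(model, input_ids: torch.Tensor, max_seq_len: int) -> list:
    generated_ids = [model.decoder_start_token_id]  # the diff seeds this with 0 for T5
    encoder_outputs = None  # the encoder runs once, on the first forward pass

    for step in range(max_seq_len):
        decoder_input_ids = torch.tensor([[generated_ids[-1]]], dtype=torch.long)
        cache_position = torch.tensor([step], dtype=torch.long)
        # forward is assumed to return (decoder_logits, encoder_outputs),
        # as the `result` tuple in the diff above suggests.
        logits, encoder_outputs = model.forward(
            input_ids, decoder_input_ids, cache_position, encoder_outputs
        )
        next_token = int(logits[:, -1, :].argmax())  # logits assumed (1, 1, vocab_size)
        generated_ids.append(next_token)
        if next_token == model.eos_token_id:
            break
    return generated_ids
```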
46 changes: 23 additions & 23 deletions optimum/exporters/executorch/tasks/seq2seq_lm.py
@@ -23,36 +23,36 @@
@register_task("text2text-generation")
def load_seq2seq_lm_model(model_name_or_path: str, **kwargs) -> Seq2SeqLMExportableModule:
"""
Loads a seq2seq language model for conditional text generation and registers it under the task
'text2text-generation' using Hugging Face's `AutoModelForSeq2SeqLM`.
Loads a seq2seq language model for conditional text generation and registers it under the task
'text2text-generation' using Hugging Face's `AutoModelForSeq2SeqLM`.

Args:
model_name_or_path (str):
Model ID on huggingface.co or path on disk to the model repository to export. For example:
`model_name_or_path="google-t5/t5-small"` or `mode_name_or_path="/path/to/model_folder`
**kwargs:
Additional configuration options for the model:
- dtype (str, optional):
Data type for model weights (default: "float32").
Options include "float16" and "bfloat16".
- max_hidden_seq_length (int, optional):
Maximum hidden sequence length (default: 4096).
- max_cache_length (int, optional):
Maximum sequence length for generation (default: 1024).
Args:
model_name_or_path (str):
Model ID on huggingface.co or path on disk to the model repository to export. For example:
`model_name_or_path="google-t5/t5-small"` or `model_name_or_path="/path/to/model_folder"`
**kwargs:
Additional configuration options for the model:
- dtype (str, optional):
Data type for model weights (default: "float32").
Options include "float16" and "bfloat16".
- max_hidden_seq_len (int, optional):
Maximum hidden sequence length (default: 4096).
- max_seq_len (int, optional):
Maximum sequence length for generation (default: 1024).

Returns:
Seq2SeqLMExportableModule:
An instance of `Seq2SeqLMExportableModule` for exporting and lowering to ExecuTorch.
"""
Returns:
Seq2SeqLMExportableModule:
An instance of `Seq2SeqLMExportableModule` for exporting and lowering to ExecuTorch.
n"""
device = "cpu"
batch_size = 1
max_hidden_seq_length = kwargs.get("max_hidden_seq_length", 4096)
max_cache_length = kwargs.get("max_cache_length", 1024)
max_hidden_seq_len = kwargs.get("max_hidden_seq_len", 4096)
max_seq_len = kwargs.get("max_seq_len", 1024)

full_model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path).to(device).eval()
return Seq2SeqLMExportableModule(
full_model,
batch_size=batch_size,
max_hidden_seq_length=max_hidden_seq_length,
max_cache_length=max_cache_length,
max_seq_len=max_seq_len,
max_hidden_seq_len=max_hidden_seq_len,
)
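The renamed kwargs (`max_seq_len`, `max_hidden_seq_len`) are read with defaults of 1024 and 4096, so a caller only overrides what it needs. A minimal usage sketch follows, assuming the task function is invoked directly; in normal use the optimum-executorch export flow resolves it from the task registry instead, and the import path below simply mirrors this file's location.

```python
# Hedged usage sketch for the registered "text2text-generation" task.
from optimum.exporters.executorch.tasks.seq2seq_lm import load_seq2seq_lm_model

exportable = load_seq2seq_lm_model(
    "google-t5/t5-small",
    max_seq_len=512,          # generation cache length (default 1024)
    max_hidden_seq_len=2048,  # encoder hidden sequence length (default 4096)
)
# `exportable` is a Seq2SeqLMExportableModule ready for export and lowering to ExecuTorch.
```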
16 changes: 1 addition & 15 deletions tests/models/test_modeling_t5.py
@@ -21,7 +21,6 @@

import pytest
from executorch import version
from executorch.extension.pybindings.portable_lib import ExecuTorchModule
from packaging.version import parse
from transformers import AutoTokenizer
from transformers.testing_utils import slow
@@ -45,20 +44,13 @@ def test_t5_export_to_executorch(self):
shell=True,
check=True,
)
self.assertTrue(os.path.exists(f"{tempdir}/executorch/encoder.pte"))
self.assertTrue(os.path.exists(f"{tempdir}/executorch/decoder.pte"))
self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte"))

def _helper_t5_translation(self, recipe: str):
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ExecuTorchModelForSeq2SeqLM.from_pretrained(model_id, recipe=recipe)

self.assertIsInstance(model, ExecuTorchModelForSeq2SeqLM)
self.assertTrue(hasattr(model, "text_encoder"))
self.assertIsInstance(model.encoder, ExecuTorchModule)
self.assertTrue(hasattr(model, "text_decoder"))
self.assertIsInstance(model.decoder, ExecuTorchModule)

input_text = "translate English to German: How old are you?"
generated_text = model.text_generation(
tokenizer=tokenizer,
@@ -88,12 +80,6 @@ def _helper_t5_summarization(self, recipe: str):
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ExecuTorchModelForSeq2SeqLM.from_pretrained(model_id, recipe=recipe)

self.assertIsInstance(model, ExecuTorchModelForSeq2SeqLM)
self.assertTrue(hasattr(model, "encoder"))
self.assertIsInstance(model.encoder, ExecuTorchModule)
self.assertTrue(hasattr(model, "text_decoder"))
self.assertIsInstance(model.decoder, ExecuTorchModule)

article = (
" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
" year later, she got married again in Westchester County, but to a different man and without divorcing"
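For reference, the runtime path these helpers exercise can be sketched end to end as follows. The `recipe="xnnpack"` value and the `text_generation` keyword arguments other than `tokenizer` are assumptions based on the truncated calls above, not confirmed by this diff, and the import path for `ExecuTorchModelForSeq2SeqLM` is assumed from the package layout.

```python
# Hedged end-to-end sketch mirroring the translation helper in this test file.
from transformers import AutoTokenizer
from optimum.executorch import ExecuTorchModelForSeq2SeqLM  # import path assumed

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ExecuTorchModelForSeq2SeqLM.from_pretrained(model_id, recipe="xnnpack")  # recipe value assumed

generated_text = model.text_generation(
    tokenizer=tokenizer,
    prompt="translate English to German: How old are you?",  # kwarg name assumed
    max_seq_len=64,                                           # kwarg name assumed
)
print(generated_text)
```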