diff --git a/optimum/executorch/modeling.py b/optimum/executorch/modeling.py
index a14b97c..f48d801 100644
--- a/optimum/executorch/modeling.py
+++ b/optimum/executorch/modeling.py
@@ -460,29 +460,27 @@ def __init__(
         config: "PretrainedConfig",
     ):
         super().__init__(models=models, config=config)
-        if not hasattr(self, "encoder"):
-            raise AttributeError("Expected attribute 'encoder' not found in the instance.")
-        if not hasattr(self, "text_decoder"):
-            raise AttributeError("Expected attribute 'text_decoder' not found in the instance.")
-        metadata = self.decoder.method_names()
+        if not hasattr(self, "model"):
+            raise AttributeError("Expected attribute 'model' not found in the instance.")
+        metadata = self.model.method_names()
         if "use_kv_cache" in metadata:
-            self.use_kv_cache = self.decoder.run_method("use_kv_cache")[0]
+            self.use_kv_cache = self.model.run_method("use_kv_cache")[0]
         if "get_max_seq_len" in metadata:
-            self.max_cache_size = self.decoder.run_method("get_max_seq_len")[0]
+            self.max_cache_size = self.model.run_method("get_max_seq_len")[0]
         if "get_max_batch_size" in metadata:
-            self.max_batch_size = self.decoder.run_method("get_max_batch_size")[0]
+            self.max_batch_size = self.model.run_method("get_max_batch_size")[0]
         if "get_dtype" in metadata:
-            self.dtype = self.decoder.run_method("get_dtype")[0]
+            self.dtype = self.model.run_method("get_dtype")[0]
         if "get_bos_id" in metadata:
-            self.bos_token_id = self.decoder.run_method("get_bos_id")[0]
+            self.bos_token_id = self.model.run_method("get_bos_id")[0]
         if "get_eos_id" in metadata:
-            self.eos_token_id = self.decoder.run_method("get_eos_id")[0]
+            self.eos_token_id = self.model.run_method("get_eos_id")[0]
         if "get_vocab_size" in metadata:
-            self.vocab_size = self.decoder.run_method("get_vocab_size")[0]
+            self.vocab_size = self.model.run_method("get_vocab_size")[0]
         if "max_hidden_seq_length" in metadata:
-            self.max_hidden_seq_length = self.decoder.run_method("max_hidden_seq_length")[0]
+            self.max_hidden_seq_length = self.model.run_method("max_hidden_seq_length")[0]
         if "decoder_start_token_id" in metadata:
-            self.decoder_start_token_id = self.decoder.run_method("decoder_start_token_id")[0]
+            self.decoder_start_token_id = self.model.run_method("decoder_start_token_id")[0]

     def forward(
         self,
@@ -491,15 +489,14 @@ def forward(
         cache_position: torch.Tensor,
         encoder_outputs: Optional[torch.Tensor] = None,
     ):
-        # Encode if needed (first prediction pass)
         is_first_prediction = encoder_outputs is None
         self.stats.on_model_execution_start()

         if is_first_prediction:
-            encoder_outputs = self.encoder.forward((input_ids,))[0]
+            encoder_outputs = self.model.run_method("encoder", (input_ids,))[0]
             self.stats.on_prompt_eval_end()

         result = (
-            self.decoder.forward((decoder_input_ids, encoder_outputs, cache_position))[0],
+            self.model.run_method("text_decoder", (decoder_input_ids, encoder_outputs, cache_position))[0],
             encoder_outputs,
         )
         self.stats.on_model_execution_end()
@@ -530,9 +527,6 @@ def generate(

         Returns:
             List[int]: List of generated token IDs.
-        Note:
-            Temporarily implemented this method in Python due to limited access to ExecuTorch's c++ LLM runner via pybind.
-            Expect improvements to the pybind interface in ExecuTorch version 0.4.1.
""" self.device = torch.device("cpu") if max_seq_len is None: @@ -550,7 +544,6 @@ def generate( encoder_input_ids = input_ids encoder_outputs = None generated_ids = [0] - first_token_generated = False # Generate tokens one by one diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py index 4f799f8..4ad1891 100644 --- a/optimum/exporters/executorch/integrations.py +++ b/optimum/exporters/executorch/integrations.py @@ -424,7 +424,10 @@ def __init__( self.use_custom_sdpa = use_custom_sdpa self.disable_dynamic_shapes = disable_dynamic_shapes self.metadata = save_config_to_constant_methods( - model.config, model.generation_config, get_max_seq_len=max_seq_len, enable_dynamic_shape=not self.disable_dynamic_shapes + model.config, + model.generation_config, + get_max_seq_len=max_seq_len, + enable_dynamic_shape=not self.disable_dynamic_shapes, ) logging.info(f"Metadata to be recorded in PTE: {self.metadata}") diff --git a/optimum/exporters/executorch/tasks/seq2seq_lm.py b/optimum/exporters/executorch/tasks/seq2seq_lm.py index 09ee2aa..b4cb026 100644 --- a/optimum/exporters/executorch/tasks/seq2seq_lm.py +++ b/optimum/exporters/executorch/tasks/seq2seq_lm.py @@ -23,36 +23,36 @@ @register_task("text2text-generation") def load_seq2seq_lm_model(model_name_or_path: str, **kwargs) -> Seq2SeqLMExportableModule: """ - Loads a seq2seq language model for conditional text generation and registers it under the task - 'text2text-generation' using Hugging Face's `AutoModelForSeq2SeqLM`. + Loads a seq2seq language model for conditional text generation and registers it under the task + 'text2text-generation' using Hugging Face's `AutoModelForSeq2SeqLM`. - Args: - model_name_or_path (str): - Model ID on huggingface.co or path on disk to the model repository to export. For example: - `model_name_or_path="google-t5/t5-small"` or `mode_name_or_path="/path/to/model_folder` - **kwargs: - Additional configuration options for the model: - - dtype (str, optional): - Data type for model weights (default: "float32"). - Options include "float16" and "bfloat16". - - max_hidden_seq_length (int, optional): - Maximum hidden sequence length (default: 4096). - - max_cache_length (int, optional): - Maximum sequence length for generation (default: 1024). + Args: + model_name_or_path (str): + Model ID on huggingface.co or path on disk to the model repository to export. For example: + `model_name_or_path="google-t5/t5-small"` or `mode_name_or_path="/path/to/model_folder` + **kwargs: + Additional configuration options for the model: + - dtype (str, optional): + Data type for model weights (default: "float32"). + Options include "float16" and "bfloat16". + - max_hidden_seq_length (int, optional): + Maximum hidden sequence length (default: 4096). + - max_cache_length (int, optional): + Maximum sequence length for generation (default: 1024). - Returns: - Seq2SeqLMExportableModule: - An instance of `Seq2SeqLMExportableModule` for exporting and lowering to ExecuTorch. - """ + Returns: + Seq2SeqLMExportableModule: + An instance of `Seq2SeqLMExportableModule` for exporting and lowering to ExecuTorch. 
+ n""" device = "cpu" batch_size = 1 - max_hidden_seq_length = kwargs.get("max_hidden_seq_length", 4096) - max_cache_length = kwargs.get("max_cache_length", 1024) + max_hidden_seq_len = kwargs.get("max_hidden_seq_len", 4096) + max_seq_len = kwargs.get("max_seq_len", 1024) full_model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path).to(device).eval() return Seq2SeqLMExportableModule( full_model, batch_size=batch_size, - max_hidden_seq_length=max_hidden_seq_length, - max_cache_length=max_cache_length, + max_seq_len=max_seq_len, + max_hidden_seq_len=max_hidden_seq_len, ) diff --git a/tests/models/test_modeling_t5.py b/tests/models/test_modeling_t5.py index c44e9ef..3ee5c10 100644 --- a/tests/models/test_modeling_t5.py +++ b/tests/models/test_modeling_t5.py @@ -21,7 +21,6 @@ import pytest from executorch import version -from executorch.extension.pybindings.portable_lib import ExecuTorchModule from packaging.version import parse from transformers import AutoTokenizer from transformers.testing_utils import slow @@ -45,20 +44,13 @@ def test_t5_export_to_executorch(self): shell=True, check=True, ) - self.assertTrue(os.path.exists(f"{tempdir}/executorch/encoder.pte")) - self.assertTrue(os.path.exists(f"{tempdir}/executorch/decoder.pte")) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) def _helper_t5_translation(self, recipe: str): model_id = "google/flan-t5-small" tokenizer = AutoTokenizer.from_pretrained(model_id) model = ExecuTorchModelForSeq2SeqLM.from_pretrained(model_id, recipe=recipe) - self.assertIsInstance(model, ExecuTorchModelForSeq2SeqLM) - self.assertTrue(hasattr(model, "text_encoder")) - self.assertIsInstance(model.encoder, ExecuTorchModule) - self.assertTrue(hasattr(model, "text_decoder")) - self.assertIsInstance(model.decoder, ExecuTorchModule) - input_text = "translate English to German: How old are you?" generated_text = model.text_generation( tokenizer=tokenizer, @@ -88,12 +80,6 @@ def _helper_t5_summarization(self, recipe: str): tokenizer = AutoTokenizer.from_pretrained(model_id) model = ExecuTorchModelForSeq2SeqLM.from_pretrained(model_id, recipe=recipe) - self.assertIsInstance(model, ExecuTorchModelForSeq2SeqLM) - self.assertTrue(hasattr(model, "encoder")) - self.assertIsInstance(model.encoder, ExecuTorchModule) - self.assertTrue(hasattr(model, "text_decoder")) - self.assertIsInstance(model.decoder, ExecuTorchModule) - article = ( " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A" " year later, she got married again in Westchester County, but to a different man and without divorcing"