diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/__init__.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/__init__.py index b4fd7e111..0683665ca 100644 --- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/__init__.py +++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/__init__.py @@ -1,12 +1,11 @@ +from .local_inference.image_2_text import HuggingFaceImage2TextTransformer from .local_inference.text_2_image import HuggingFaceText2ImageDiffusor +from .local_inference.text_2_speech import HuggingFaceText2SpeechTransformer from .local_inference.text_generation import HuggingFaceTextGenerationTransformer -from .remote_inference_client.text_generation import HuggingFaceTextGenerationParser from .local_inference.text_summarization import HuggingFaceTextSummarizationTransformer from .local_inference.text_translation import HuggingFaceTextTranslationTransformer -from .local_inference.text_2_speech import HuggingFaceText2SpeechTransformer - +from .remote_inference_client.text_generation import HuggingFaceTextGenerationParser -# from .remote_inference_client.text_generation import HuggingFaceTextGenerationClient LOCAL_INFERENCE_CLASSES = [ "HuggingFaceText2ImageDiffusor", @@ -14,6 +13,8 @@ "HuggingFaceTextSummarizationTransformer", "HuggingFaceTextTranslationTransformer", "HuggingFaceText2SpeechTransformer", + "HuggingFaceAutomaticSpeechRecognition", + "HuggingFaceImage2TextTransformer", ] REMOTE_INFERENCE_CLASSES = ["HuggingFaceTextGenerationParser"] __ALL__ = LOCAL_INFERENCE_CLASSES + REMOTE_INFERENCE_CLASSES diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/hf_local_example.aiconfig.json b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/hf_local_example.aiconfig.json new file mode 100644 index 000000000..877ab6da7 --- /dev/null +++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/hf_local_example.aiconfig.json @@ -0,0 +1,68 @@ +{ + "name": "The Tale of the Quick Brown Fox", + "schema_version": "latest", + "metadata": { + "parameters": {}, + "models": { + "stevhliu/my_awesome_billsum_model": { + "model": "stevhliu/my_awesome_billsum_model", + "min_length": 10, + "max_length": 30 + }, + "Salesforce/blip-image-captioning-base": { + "model": "Salesforce/blip-image-captioning-base" + } + }, + "model_parsers": { + "suno/bark": "HuggingFaceText2SpeechTransformer" + }, + "default_model": "stevhliu/my_awesome_billsum_model" + }, + "description": "The Tale of the Quick Brown Fox", + "prompts": [ + { + "name": "translate_instruction", + "input": "Tell the tale of {{topic}}", + "outputs": [], + "metadata": { + "model": "translation_en_to_fr", + "parameters": { + "topic": "the quick brown fox" + } + } + }, + { + "name": "summarize_story", + "input": "Once upon a time, in a lush and vibrant forest, there lived a magnificent creature known as the Quick Brown Fox. This fox was unlike any other, possessing incredible speed and agility that awed all the animals in the forest. With its fur as golden as the sun and its eyes as sharp as emeralds, the Quick Brown Fox was admired by everyone, from the tiniest hummingbird to the mightiest bear. The fox had a kind heart and would often lend a helping paw to those in need. The Quick Brown Fox had a particular fondness for games and challenges. It loved to test its skills against others, always seeking new adventures to satisfy its boundless curiosity. Its favorite game was called \"The Great Word Hunt,\" where it would embark on a quest to find hidden words scattered across the forest.", + "outputs": [], + "metadata": { + "model": "stevhliu/my_awesome_billsum_model" + } + }, + { + "name": "generate_audio_title", + "input": "The Quick Brown Fox was admired by all the animals in the forest.", + "metadata": { + "model": { + "name": "suno/bark", + "settings": {} + } + } + }, + { + "name": "generate_caption", + "input": { + "attachments": [ + { + "mime_type": "image/png", + "data": "/Users/jonathan/Desktop/pic.png" + } + ] + }, + "metadata": { + "model": "Salesforce/blip-image-captioning-base" + } + } + ], + "$schema": "https://json.schemastore.org/aiconfig-1.0" +} diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py new file mode 100644 index 000000000..3497ead6a --- /dev/null +++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py @@ -0,0 +1,136 @@ +from typing import Any, Dict, Optional, List, TYPE_CHECKING +from aiconfig import ParameterizedModelParser, InferenceOptions +from aiconfig.callback import CallbackEvent +import torch +from aiconfig.schema import Prompt, Output, ExecuteResult, Attachment + +from transformers import pipeline, Pipeline + +if TYPE_CHECKING: + from aiconfig import AIConfigRuntime + + +class HuggingFaceImage2TextTransformer(ParameterizedModelParser): + def __init__(self): + """ + Returns: + HuggingFaceImage2TextTransformer + + Usage: + 1. Create a new model parser object with the model ID of the model to use. + parser = HuggingFaceImage2TextTransformer() + 2. Add the model parser to the registry. + config.register_model_parser(parser) + """ + super().__init__() + self.pipelines: dict[str, Pipeline] = {} + + def id(self) -> str: + """ + Returns an identifier for the Model Parser + """ + return "HuggingFaceImage2TextTransformer" + + async def serialize( + self, + prompt_name: str, + data: Any, + ai_config: "AIConfigRuntime", + parameters: Optional[Dict[str, Any]] = None, + ) -> List[Prompt]: + """ + Defines how a prompt and model inference settings get serialized in the .aiconfig. + Assume input in the form of input(s) being passed into an already constructed pipeline. + + Args: + prompt (str): The prompt to be serialized. + data (Any): Model-specific inference settings to be serialized. + ai_config (AIConfigRuntime): The AIConfig Runtime. + parameters (Dict[str, Any], optional): Model-specific parameters. Defaults to None. + + Returns: + str: Serialized representation of the prompt and inference settings. + """ + raise NotImplementedError("serialize is not implemented for HuggingFaceImage2TextTransformer") + + async def deserialize( + self, + prompt: Prompt, + aiconfig: "AIConfigRuntime", + params: Optional[Dict[str, Any]] = {}, + ) -> Dict[str, Any]: + await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_deserialize_start", __name__, {"prompt": prompt, "params": params})) + + # Build Completion data + completion_params = self.get_model_settings(prompt, aiconfig) + + inputs = validate_and_retrieve_image_from_attachments(prompt) + + completion_params["inputs"] = inputs + + await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_deserialize_complete", __name__, {"output": completion_params})) + return completion_params + + async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", options: InferenceOptions, parameters: Dict[str, Any]) -> list[Output]: + await aiconfig.callback_manager.run_callbacks( + CallbackEvent( + "on_run_start", + __name__, + {"prompt": prompt, "options": options, "parameters": parameters}, + ) + ) + model_name = aiconfig.get_model_name(prompt) + + self.pipelines[model_name] = pipeline(task="image-to-text", model=model_name) + + captioner = self.pipelines[model_name] + completion_data = await self.deserialize(prompt, aiconfig, parameters) + print(f"{completion_data=}") + inputs = completion_data.pop("inputs") + model = completion_data.pop("model") + response = captioner(inputs, **completion_data) + + output = ExecuteResult(output_type="execute_result", data=response, metadata={}) + + prompt.outputs = [output] + await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_run_complete", __name__, {"result": prompt.outputs})) + return prompt.outputs + + def get_output_text(self, response: dict[str, Any]) -> str: + raise NotImplementedError("get_output_text is not implemented for HuggingFaceImage2TextTransformer") + + +def validate_attachment_type_is_image(attachment: Attachment): + if not hasattr(attachment, "mime_type"): + raise ValueError(f"Attachment has no mime type. Specify the image mimetype in the aiconfig") + + if not attachment.mime_type.startswith("image/"): + raise ValueError(f"Invalid attachment mimetype {attachment.mime_type}. Expected image mimetype.") + + +def validate_and_retrieve_image_from_attachments(prompt: Prompt) -> list[str]: + """ + Retrieves the image uri's from each attachment in the prompt input. + + Throws an exception if + - attachment is not image + - attachment data is not a uri + - no attachments are found + - operation fails for any reason + """ + + if not hasattr(prompt.input, "attachments") or len(prompt.input.attachments) == 0: + raise ValueError(f"No attachments found in input for prompt {prompt.name}. Please add an image attachment to the prompt input.") + + image_uris: list[str] = [] + + for i, attachment in enumerate(prompt.input.attachments): + validate_attachment_type_is_image(attachment) + + if not isinstance(attachment.data, str): + # See todo above, but for now only support uri's + raise ValueError(f"Attachment #{i} data is not a uri. Please specify a uri for the image attachment in prompt {prompt.name}.") + + image_uris.append(attachment.data) + + return image_uris diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/run_hf_example.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/run_hf_example.py new file mode 100644 index 000000000..6e5c344a7 --- /dev/null +++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/run_hf_example.py @@ -0,0 +1,90 @@ +# import ModelParserRegistry from aiconfig +import asyncio +import base64 +import sys +from aiconfig.registry import ModelParserRegistry +from aiconfig_extension_hugging_face.local_inference.text_2_speech import HuggingFaceText2SpeechTransformer +from aiconfig_extension_hugging_face.local_inference.text_generation import HuggingFaceTextGenerationTransformer +from aiconfig_extension_hugging_face.local_inference.text_summarization import HuggingFaceTextSummarizationTransformer +from aiconfig_extension_hugging_face.local_inference.text_translation import HuggingFaceTextTranslationTransformer +from aiconfig_extension_hugging_face.local_inference.image_2_text import HuggingFaceImage2TextTransformer +from aiconfig import AIConfigRuntime, InferenceOptions, CallbackManager + + +async def run(hf_aiconfig_path: str): + for model_parser in [ + HuggingFaceText2SpeechTransformer(), + # HuggingFaceTextGenerationTransformer(), + ]: + ModelParserRegistry.register_model_parser(model_parser) + + AIConfigRuntime.register_model_parser(HuggingFaceTextTranslationTransformer(), "translation_en_to_fr") + AIConfigRuntime.register_model_parser(HuggingFaceTextSummarizationTransformer(), "stevhliu/my_awesome_billsum_model") + AIConfigRuntime.register_model_parser(HuggingFaceImage2TextTransformer(), "Salesforce/blip-image-captioning-base") + ModelParserRegistry.register_model_parser(HuggingFaceText2SpeechTransformer()) + # AIConfigRuntime.register_model_parser(mp, "text_2_speech") + # AIConfigRuntime.register_model_parser(mp, "suno/bark") + + config = AIConfigRuntime.load(hf_aiconfig_path) + config.callback_manager = CallbackManager([]) + + options = InferenceOptions(stream=False) + + # out1 = await config.run( + # # + # "translate_instruction", + # options=options, + # ) + # print(f"{out1=}") + + # out2 = await config.run( + # # + # "generate_story", + # options=options, + # ) + # print(f"{out2=}") + + # out3 = await config.run( + # # + # "summarize_story", + # options=options, + # ) + + # print(f"{out3=}") + + # out4 = await config.run( + # # + # "generate_audio_title", + # options=options, + # ) + + # print(f"{out4=}") + # with open("story_title.wav", "wb") as f: + # encoded = out4[0].data.value + # decoded_binary = base64.b64decode(encoded.encode("utf-8")) + # f.write(decoded_binary) + + # print("Stream") + # options = InferenceOptions(stream=True, stream_callback=print_stream) + # out = await config.run("test_hf_trans", options=options) + # print("Output:\n", out) + + out5 = await config.run( + # + "generate_caption", + options=options, + ) + + print(f"{out5=}") + + +async def main(argv: list[str]): + print("Starting!") + path = argv[1] + print(f"Loading aiconfig from {path}") + await run(path) + print("Done!") + + +if __name__ == "__main__": + asyncio.run(main(sys.argv)) diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py index 85dee4add..0b97e3433 100644 --- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py +++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py @@ -198,6 +198,7 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio completion_data = await self.deserialize(prompt, aiconfig, options, parameters) inputs = completion_data.pop("prompt", None) + print("Running text to speech model. This might take a while, please be patient...") response = synthesizer(inputs, **completion_data) outputs: List[Output] = [] diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py index bba735b4f..04bea7c26 100644 --- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py +++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py @@ -255,6 +255,7 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio output = None def _summarize(): + print(f"{inputs=}, {completion_data=}") return summarizer(inputs, **completion_data) if not should_stream: