[HF][5/n] Image2Text: Allow base64 inputs for images

Rossdan Craig rossdan@lastmileai.dev · Rossdan Craig rossdan@lastmileai.dev · commit fa9d88a6e8f4 · 2024-01-10T05:08:09.000-05:00
Before, we didn't allow base64, only URIs (either local or http or https). This is good because our text2Image model parser outputs into a base64 format, so this will allow us to chain model prompts! ## Test Plan Rebase and test on 0d7ae2b. Follow the README from AIConfig Editor https://github.com/lastmile-ai/aiconfig/tree/main/python/src/aiconfig/editor#dev, then run these commands ```bash aiconfig_path=/Users/rossdancraig/Projects/aiconfig/cookbooks/Gradio/huggingface.aiconfig.json parsers_path=/Users/rossdancraig/Projects/aiconfig/cookbooks/Gradio/hf_model_parsers.py alias aiconfig="python3 -m 'aiconfig.scripts.aiconfig_cli'" aiconfig edit --aiconfig-path=$aiconfig_path --server-port=8080 --server-mode=debug_servers --parsers-module-path=$parsers_path ``` Then in AIConfig Editor run the prompt (streaming not supported so just took screenshots) These are the images I tested (with bear being in base64 format) ![fox_in_forest](https://github.com/lastmile-ai/aiconfig/assets/151060367/ca7d1723-9e12-4cc8-9d8d-41fa9f466919) ![bear-eating-honey](https://github.com/lastmile-ai/aiconfig/assets/151060367/a947d89e-c02a-4c64-8183-ff1c85802859) <img width="1281" alt="Screenshot 2024-01-10 at 04 57 44" src="https://github.com/lastmile-ai/aiconfig/assets/151060367/ea60cbc5-e6ab-4bf2-82e7-17f3182fdc5c">
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py
@@ -1,5 +1,8 @@
+import base64
 import json
-from typing import Any, Dict, Optional, List, TYPE_CHECKING
+from io import BytesIO
+from PIL import Image
+from typing import Any, Dict, Optional, List, TYPE_CHECKING, Union
 from transformers import (
     Pipeline,
     pipeline,
@@ -107,7 +110,7 @@ async def deserialize(
         completion_params = refine_completion_params(model_settings)
 
         #Add image inputs
-        inputs = validate_and_retrieve_image_from_attachments(prompt)
+        inputs = validate_and_retrieve_images_from_attachments(prompt)
         completion_params["inputs"] = inputs
 
         await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_deserialize_complete", __name__, {"output": completion_params}))
@@ -218,7 +221,7 @@ def validate_attachment_type_is_image(attachment: Attachment):
         raise ValueError(f"Invalid attachment mimetype {attachment.mime_type}. Expected image mimetype.")
 
 
-def validate_and_retrieve_image_from_attachments(prompt: Prompt) -> list[str]:
+def validate_and_retrieve_images_from_attachments(prompt: Prompt) -> list[Union[str, Image]]:
     """
     Retrieves the image uri's from each attachment in the prompt input.
 
@@ -232,15 +235,23 @@ def validate_and_retrieve_image_from_attachments(prompt: Prompt) -> list[str]:
     if not hasattr(prompt.input, "attachments") or len(prompt.input.attachments) == 0:
         raise ValueError(f"No attachments found in input for prompt {prompt.name}. Please add an image attachment to the prompt input.")
 
-    image_uris: list[str] = []
+    images: list[Union[str, Image]] = []
 
     for i, attachment in enumerate(prompt.input.attachments):
         validate_attachment_type_is_image(attachment)
 
-        if not isinstance(attachment.data, str):
+        input_data = attachment.data
+        if not isinstance(input_data, str):
             # See todo above, but for now only support uri's
             raise ValueError(f"Attachment #{i} data is not a uri. Please specify a uri for the image attachment in prompt {prompt.name}.")
 
-        image_uris.append(attachment.data)
+        # Really basic heuristic to check if the data is a base64 encoded str
+        # vs. uri. This will be fixed once we have standardized inputs
+        # See https://github.com/lastmile-ai/aiconfig/issues/829
+        if len(input_data) > 10000:
+            pil_image : Image = Image.open(BytesIO(base64.b64decode(input_data)))
+            images.append(pil_image)
+        else:
+            images.append(input_data)
 
-    return image_uris
+    return images