Skip to content

Commit a5a26aa

Browse files
author
Rossdan Craig rossdan@lastmileai.dev
committed
[HF][streaming][4/n] Image2Text (no streaming, but lots of fixing)
This model parser does not support streaming (surprising!): ``` TypeError: ImageToTextPipeline._sanitize_parameters() got an unexpected keyword argument 'streamer' ``` In general, I mainly just did a lot of fixing up to make sure that this worked as expected. Things I fixed: 1. Now works for multiple images (it did before, but didn't process the responses for each image properly — it just attached the entire response) 2. Constructed responses to be in pure text output 3. Specified the completion params that are supported (only 2: https://github.com/huggingface/transformers/blob/701298d2d3d5c7bde45e71cce12736098e3f05ef/src/transformers/pipelines/image_to_text.py#L97-L102C13) In the next diff I will add support for the b64-encoded image format --> we need to convert it to a PIL image, see https://github.com/huggingface/transformers/blob/701298d2d3d5c7bde45e71cce12736098e3f05ef/src/transformers/pipelines/image_to_text.py#L83 ## Test Plan Rebase onto 5f3b667 and test it. Follow the README from AIConfig Editor https://github.com/lastmile-ai/aiconfig/tree/main/python/src/aiconfig/editor#dev, then run these commands ```bash aiconfig_path=/Users/rossdancraig/Projects/aiconfig/cookbooks/Gradio/huggingface.aiconfig.json parsers_path=/Users/rossdancraig/Projects/aiconfig/cookbooks/Gradio/hf_model_parsers.py alias aiconfig="python3 -m 'aiconfig.scripts.aiconfig_cli'" aiconfig edit --aiconfig-path=$aiconfig_path --server-port=8080 --server-mode=debug_servers --parsers-module-path=$parsers_path ``` Then in AIConfig Editor run the prompt (streaming is not supported, so I just took screenshots)
1 parent 617682e commit a5a26aa

File tree

3 files changed

+107
-29
lines changed

3 files changed

+107
-29
lines changed

extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py

Lines changed: 98 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,21 @@
1+
import json
12
from typing import Any, Dict, Optional, List, TYPE_CHECKING
3+
from transformers import (
4+
Pipeline,
5+
pipeline,
6+
)
7+
28
from aiconfig import ParameterizedModelParser, InferenceOptions
39
from aiconfig.callback import CallbackEvent
4-
import torch
5-
from aiconfig.schema import Prompt, Output, ExecuteResult, Attachment
6-
7-
from transformers import pipeline, Pipeline
8-
10+
from aiconfig.schema import (
11+
Attachment,
12+
ExecuteResult,
13+
Output,
14+
OutputDataWithValue,
15+
Prompt,
16+
)
17+
18+
# Circular Dependency Type Hints
919
if TYPE_CHECKING:
1020
from aiconfig import AIConfigRuntime
1121

@@ -93,10 +103,11 @@ async def deserialize(
93103
await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_deserialize_start", __name__, {"prompt": prompt, "params": params}))
94104

95105
# Build Completion data
96-
completion_params = self.get_model_settings(prompt, aiconfig)
106+
model_settings = self.get_model_settings(prompt, aiconfig)
107+
completion_params = refine_completion_params(model_settings)
97108

109+
#Add image inputs
98110
inputs = validate_and_retrieve_image_from_attachments(prompt)
99-
100111
completion_params["inputs"] = inputs
101112

102113
await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_deserialize_complete", __name__, {"output": completion_params}))
@@ -110,24 +121,93 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio
110121
{"prompt": prompt, "options": options, "parameters": parameters},
111122
)
112123
)
113-
model_name = aiconfig.get_model_name(prompt)
114-
115-
self.pipelines[model_name] = pipeline(task="image-to-text", model=model_name)
116124

117-
captioner = self.pipelines[model_name]
118125
completion_data = await self.deserialize(prompt, aiconfig, parameters)
119126
inputs = completion_data.pop("inputs")
120-
model = completion_data.pop("model")
121-
response = captioner(inputs, **completion_data)
122127

123-
output = ExecuteResult(output_type="execute_result", data=response, metadata={})
128+
model_name: str | None = aiconfig.get_model_name(prompt)
129+
if isinstance(model_name, str) and model_name not in self.pipelines:
130+
self.pipelines[model_name] = pipeline(task="image-to-text", model=model_name)
131+
captioner = self.pipelines[model_name]
132+
133+
outputs: List[Output] = []
134+
response: List[Any] = captioner(inputs, **completion_data)
135+
for count, result in enumerate(response):
136+
output: Output = construct_regular_output(result, count)
137+
outputs.append(output)
124138

125-
prompt.outputs = [output]
126-
await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_run_complete", __name__, {"result": prompt.outputs}))
139+
prompt.outputs = outputs
140+
print(f"{prompt.outputs=}")
141+
await aiconfig.callback_manager.run_callbacks(
142+
CallbackEvent(
143+
"on_run_complete",
144+
__name__,
145+
{"result": prompt.outputs},
146+
)
147+
)
127148
return prompt.outputs
128149

129-
def get_output_text(self, response: dict[str, Any]) -> str:
130-
raise NotImplementedError("get_output_text is not implemented for HuggingFaceImage2TextTransformer")
150+
def get_output_text(
151+
self,
152+
prompt: Prompt,
153+
aiconfig: "AIConfigRuntime",
154+
output: Optional[Output] = None,
155+
) -> str:
156+
if output is None:
157+
output = aiconfig.get_latest_output(prompt)
158+
159+
if output is None:
160+
return ""
161+
162+
# TODO (rossdanlm): Handle multiple outputs in list
163+
# https://github.com/lastmile-ai/aiconfig/issues/467
164+
if output.output_type == "execute_result":
165+
output_data = output.data
166+
if isinstance(output_data, str):
167+
return output_data
168+
if isinstance(output_data, OutputDataWithValue):
169+
if isinstance(output_data.value, str):
170+
return output_data.value
171+
# HuggingFace Text summarization does not support function
172+
# calls so shouldn't get here, but just being safe
173+
return json.dumps(output_data.value, indent=2)
174+
return ""
175+
176+
177+
def refine_completion_params(model_settings: Dict[str, Any]) -> Dict[str, Any]:
178+
"""
179+
Refines the completion params for the HF image to text api. Removes any unsupported params.
180+
The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method
181+
"""
182+
supported_keys = {
183+
"max_new_tokens",
184+
"timeout",
185+
}
186+
187+
completion_data = {}
188+
for key in model_settings:
189+
if key.lower() in supported_keys:
190+
completion_data[key.lower()] = model_settings[key]
191+
192+
return completion_data
193+
194+
# Helper methods
195+
def construct_regular_output(result: Dict[str, str], execution_count: int) -> Output:
196+
"""
197+
Construct regular output per response result, without streaming enabled
198+
"""
199+
output = ExecuteResult(
200+
**{
201+
"output_type": "execute_result",
202+
# For some reason result is always in list format we haven't found
203+
# a way of being able to return multiple sequences from the image
204+
# to text pipeline
205+
"data": result[0]["generated_text"],
206+
"execution_count": execution_count,
207+
"metadata": {},
208+
}
209+
)
210+
return output
131211

132212

133213
def validate_attachment_type_is_image(attachment: Attachment):

extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_generation.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def __init__(self):
153153
config.register_model_parser(parser)
154154
"""
155155
super().__init__()
156-
self.generators : dict[str, Pipeline]= {}
156+
self.generators: dict[str, Pipeline]= {}
157157

158158
def id(self) -> str:
159159
"""
@@ -217,14 +217,14 @@ async def deserialize(
217217
# Build Completion data
218218
model_settings = self.get_model_settings(prompt, aiconfig)
219219
completion_data = refine_chat_completion_params(model_settings)
220-
220+
221221
#Add resolved prompt
222222
resolved_prompt = resolve_prompt(prompt, params, aiconfig)
223223
completion_data["prompt"] = resolved_prompt
224224
return completion_data
225225

226226
async def run_inference(
227-
self, prompt: Prompt, aiconfig : "AIConfigRuntime", options : InferenceOptions, parameters: Dict[str, Any]
227+
self, prompt: Prompt, aiconfig: "AIConfigRuntime", options: InferenceOptions, parameters: Dict[str, Any]
228228
) -> List[Output]:
229229
"""
230230
Invoked to run a prompt in the .aiconfig. This method should perform
@@ -239,8 +239,8 @@ async def run_inference(
239239
"""
240240
completion_data = await self.deserialize(prompt, aiconfig, options, parameters)
241241
completion_data["text_inputs"] = completion_data.pop("prompt", None)
242-
243-
model_name : str = aiconfig.get_model_name(prompt)
242+
243+
model_name: str | None = aiconfig.get_model_name(prompt)
244244
if isinstance(model_name, str) and model_name not in self.generators:
245245
self.generators[model_name] = pipeline('text-generation', model=model_name)
246246
generator = self.generators[model_name]
@@ -255,10 +255,10 @@ async def run_inference(
255255
streamer = TextIteratorStreamer(tokenizer)
256256
completion_data["streamer"] = streamer
257257

258-
outputs : List[Output] = []
258+
outputs: List[Output] = []
259259
output = None
260260
if not should_stream:
261-
response : List[Any] = generator(**completion_data)
261+
response: List[Any] = generator(**completion_data)
262262
for count, result in enumerate(response):
263263
output = construct_regular_output(result, count)
264264
outputs.append(output)
@@ -267,7 +267,7 @@ async def run_inference(
267267
raise ValueError("Sorry, TextIteratorStreamer does not support multiple return sequences, please set `num_return_sequences` to 1")
268268
if not streamer:
269269
raise ValueError("Stream option is selected but streamer is not initialized")
270-
270+
271271
# For streaming, cannot call `generator` directly otherwise response will be blocking
272272
thread = threading.Thread(target=generator, kwargs=completion_data)
273273
thread.start()

extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,12 +258,10 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio
258258
streamer = TextIteratorStreamer(tokenizer)
259259
completion_data["streamer"] = streamer
260260

261-
outputs: List[Output] = []
262-
output = None
263-
264261
def _summarize():
265262
return summarizer(inputs, **completion_data)
266263

264+
outputs: List[Output] = []
267265
if not should_stream:
268266
response: List[Any] = _summarize()
269267
for count, result in enumerate(response):

0 commit comments

Comments (0)