88from PIL import Image
99from pydantic import AnyUrl
1010
11- from docling .datamodel .base_models import OpenAiApiResponse
11+ from docling .datamodel .base_models import OpenAiApiResponse , OpenAiResponseUsage
1212from docling .models .utils .generation_utils import GenerationStopper
1313
1414_log = logging .getLogger (__name__ )
1515
1616
17- def api_image_request (
17+ def api_image_request_with_usage (
1818 image : Image .Image ,
1919 prompt : str ,
2020 url : AnyUrl ,
2121 timeout : float = 20 ,
2222 headers : Optional [Dict [str , str ]] = None ,
23- token_extract_key : Optional [str ] = None ,
2423 ** params ,
25- ) -> Tuple [str , Optional [dict ]]:
24+ ) -> Tuple [str , Optional [OpenAiResponseUsage ]]:
2625 """Send an image+prompt to an OpenAI-compatible API and return (text, usage).
2726
2827 If no usage data is available, the second tuple element will be None.
@@ -38,138 +37,34 @@ def api_image_request(
3837 "type" : "image_url" ,
3938 "image_url" : {"url" : f"data:image/png;base64,{ image_base64 } " },
4039 },
41- {"type" : "text" , "text" : prompt },
40+ {
41+ "type" : "text" ,
42+ "text" : prompt ,
43+ },
4244 ],
4345 }
4446 ]
4547
46- payload = {"messages" : messages , ** params }
48+ payload = {
49+ "messages" : messages ,
50+ ** params ,
51+ }
52+
4753 headers = headers or {}
4854
49- r = requests .post (str (url ), headers = headers , json = payload , timeout = timeout )
55+ r = requests .post (
56+ str (url ),
57+ headers = headers ,
58+ json = payload ,
59+ timeout = timeout ,
60+ )
5061 if not r .ok :
5162 _log .error (f"Error calling the API. Response was { r .text } " )
5263 r .raise_for_status ()
5364
54- # Try to parse JSON body
55- try :
56- resp_json = r .json ()
57- except Exception :
58- api_resp = OpenAiApiResponse .model_validate_json (r .text )
59- generated_text = api_resp .choices [0 ].message .content .strip ()
60- return generated_text , None
61-
62- usage = None
63- if isinstance (resp_json , dict ):
64- usage = resp_json .get ("usage" )
65-
66- # Extract generated text using common OpenAI shapes
67- generated_text = ""
68- try :
69- generated_text = resp_json ["choices" ][0 ]["message" ]["content" ].strip ()
70- except Exception :
71- try :
72- generated_text = resp_json ["choices" ][0 ].get ("text" , "" )
73- if isinstance (generated_text , str ):
74- generated_text = generated_text .strip ()
75- except Exception :
76- try :
77- api_resp = OpenAiApiResponse .model_validate_json (r .text )
78- generated_text = api_resp .choices [0 ].message .content .strip ()
79- except Exception :
80- generated_text = ""
65+ api_resp = OpenAiApiResponse .model_validate_json (r .text )
66+ generated_text = api_resp .choices [0 ].message .content .strip ()
8167
82- # If an explicit token_extract_key is provided and found in usage, use it
83- if token_extract_key and isinstance (usage , dict ) and token_extract_key in usage :
84- extracted = usage .get (token_extract_key )
85- generated_text = (
86- str (extracted ).strip () if extracted is not None else generated_text
87- )
68+ usage = api_resp .usage if hasattr (api_resp , "usage" ) else None
8869
8970 return generated_text , usage
90-
91-
def api_image_request_streaming(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    *,
    timeout: float = 20,
    headers: Optional[Dict[str, str]] = None,
    generation_stoppers: Optional[List[GenerationStopper]] = None,
    **params,
) -> str:
    """Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).

    Parses Server-Sent Events lines of the form ``data: {json}`` separated by
    blank lines and terminated by ``data: [DONE]``. Accumulates generated text
    and polls each stopper's ``should_stop(window)`` as chunks arrive. If a
    stopper triggers, the function returns early and the HTTP connection is
    closed by the ``with`` block, aborting server-side generation.

    Args:
        image: Image sent alongside the prompt (encoded as base64 PNG).
        prompt: Text prompt for the model.
        url: Chat-completions endpoint of the OpenAI-compatible server.
        timeout: Request timeout in seconds.
        headers: Extra HTTP headers merged into the request.
        generation_stoppers: Stoppers consulted after every received chunk.
            Defaults to no stoppers.
        **params: Additional keys merged into the JSON payload.

    Returns:
        The accumulated generated text (possibly truncated by a stopper).

    Raises:
        requests.HTTPError: If the server responds with a non-2xx status.
    """
    # Fix: the default was previously a mutable `[]`, which is shared across
    # calls (flake8-bugbear B006). Use None and normalize here instead.
    stoppers: List[GenerationStopper] = (
        generation_stoppers if generation_stoppers is not None else []
    )

    img_io = BytesIO()
    image.save(img_io, "PNG")
    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    payload = {"messages": messages, "stream": True, **params}
    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")

    hdrs = {"Accept": "text/event-stream", **(headers or {})}
    if "temperature" in params:
        hdrs["X-Temperature"] = str(params["temperature"])

    # Stream the HTTP response; the context manager guarantees the connection
    # is released (and server-side generation aborted) on early return.
    with requests.post(
        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
    ) as r:
        if not r.ok:
            _log.error(
                f"Error calling the API {url} in streaming mode. Response was {r.text}"
            )
            r.raise_for_status()

        full_text: List[str] = []
        for raw_line in r.iter_lines(decode_unicode=True):
            if not raw_line:  # keep-alives / blank lines
                continue
            if not raw_line.startswith("data:"):
                # Some proxies inject comments; ignore anything not starting with 'data:'
                continue

            data = raw_line[len("data:") :].strip()
            if data == "[DONE]":
                break

            try:
                obj = json.loads(data)
            except json.JSONDecodeError:
                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
                continue

            # Tolerate unexpected chunk shapes rather than aborting the stream.
            try:
                delta = obj["choices"][0].get("delta") or {}
                piece = delta.get("content") or ""
            except (KeyError, IndexError) as e:
                _log.debug("Unexpected SSE chunk shape: %s", e)
                piece = ""

            if piece:
                full_text.append(piece)
                for stopper in stoppers:
                    # lookback_tokens() is a rough character budget here;
                    # clamp to at least 1 so the window is never empty.
                    lookback = max(1, stopper.lookback_tokens())
                    window = "".join(full_text)[-lookback:]
                    if stopper.should_stop(window):
                        return "".join(full_text)

        return "".join(full_text)