3
3
import logging
4
4
from collections .abc import AsyncGenerator
5
5
from enum import Enum
6
- from typing import IO , Union
6
+ from typing import IO , Union , Optional
7
7
8
8
import pymupdf
9
9
from azure .ai .documentintelligence .aio import DocumentIntelligenceClient
20
20
from pypdf import PdfReader
21
21
from openai import AsyncOpenAI
22
22
23
- from .mediadescriber import ContentUnderstandingDescriber
23
+ from .mediadescriber import MediaDescriber , ContentUnderstandingDescriber , MultimodalModelDescriber
24
24
from .page import Page
25
25
from .parser import Parser
26
26
@@ -45,6 +45,11 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
45
45
offset += len (page_text )
46
46
47
47
48
class MediaDescriptionStrategy(Enum):
    """Which backend, if any, is used to describe figures found in a document.

    The parser compares its configured strategy against these members to
    decide which media describer to construct before analyzing a document.
    """

    # Figures are not collected or described.
    NONE = "none"
    # Figures are described with a multimodal model through an OpenAI client.
    OPENAI = "openai"
    # Figures are described with Azure Content Understanding.
    CONTENTUNDERSTANDING = "content_understanding"
52
+
48
53
class DocumentAnalysisParser (Parser ):
49
54
"""
50
55
Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages
@@ -57,13 +62,27 @@ def __init__(
57
62
credential : Union [AsyncTokenCredential , AzureKeyCredential ],
58
63
model_id = "prebuilt-layout" ,
59
64
include_media_description : bool = False ,
65
+ media_description_strategy : Enum = MediaDescriptionStrategy .NONE ,
66
+ # If using OpenAI, this is the client to use
67
+ openai_client : Union [AsyncOpenAI , None ] = None ,
68
+ openai_model : Optional [str ] = None ,
69
+ openai_deployment : Optional [str ] = None ,
70
+ # If using Content Understanding, this is the endpoint for the service
60
71
content_understanding_endpoint : Union [str , None ] = None ,
61
72
):
62
73
self .model_id = model_id
63
74
self .endpoint = endpoint
64
75
self .credential = credential
65
- self .use_content_understanding = use_content_understanding
66
- self .content_understanding_endpoint = content_understanding_endpoint
76
+ self .media_description_strategy = media_description_strategy
77
+ if media_description_strategy == MediaDescriptionStrategy .OPENAI :
78
+ logger .info ("Including media description with OpenAI" )
79
+ self .use_content_understanding = False
80
+ self .openai_client = openai_client
81
+ self .openai_model = openai_model
82
+ self .openai_deployment = openai_deployment
83
+ if media_description_strategy == MediaDescriptionStrategy .CONTENTUNDERSTANDING :
84
+ logger .info ("Including media description with Azure Content Understanding" )
85
+ self .content_understanding_endpoint = content_understanding_endpoint
67
86
68
87
async def parse (self , content : IO ) -> AsyncGenerator [Page , None ]:
69
88
logger .info ("Extracting text from '%s' using Azure Document Intelligence" , content .name )
@@ -72,14 +91,23 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
72
91
endpoint = self .endpoint , credential = self .credential
73
92
) as document_intelligence_client :
74
93
file_analyzed = False
75
- if self .use_content_understanding :
94
+
95
+ media_describer : Union [ContentUnderstandingDescriber , MultimodalModelDescriber , None ] = None
96
+ if self .media_description_strategy == MediaDescriptionStrategy .CONTENTUNDERSTANDING :
76
97
if self .content_understanding_endpoint is None :
77
- raise ValueError ("Content Understanding is enabled but no endpoint was provided " )
98
+ raise ValueError ("Content Understanding endpoint must be provided when using Content Understanding strategy " )
78
99
if isinstance (self .credential , AzureKeyCredential ):
79
100
raise ValueError (
80
101
"AzureKeyCredential is not supported for Content Understanding, use keyless auth instead"
81
102
)
82
- cu_describer = ContentUnderstandingDescriber (self .content_understanding_endpoint , self .credential )
103
+ media_describer = ContentUnderstandingDescriber (self .content_understanding_endpoint , self .credential )
104
+
105
+ if self .media_description_strategy == MediaDescriptionStrategy .OPENAI :
106
+ if self .openai_client is None or self .openai_model is None :
107
+ raise ValueError ("OpenAI client must be provided when using OpenAI media description strategy" )
108
+ media_describer = MultimodalModelDescriber (self .openai_client , self .openai_model , self .openai_deployment )
109
+
110
+ if media_describer is not None :
83
111
content_bytes = content .read ()
84
112
try :
85
113
poller = await document_intelligence_client .begin_analyze_document (
@@ -117,7 +145,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
117
145
if table .bounding_regions and table .bounding_regions [0 ].page_number == page .page_number
118
146
]
119
147
figures_on_page = []
120
- if self .use_content_understanding :
148
+ if self .media_description_strategy != MediaDescriptionStrategy . NONE :
121
149
figures_on_page = [
122
150
figure
123
151
for figure in (analyze_result .figures or [])
@@ -163,13 +191,13 @@ class ObjectType(Enum):
163
191
page_text += DocumentAnalysisParser .table_to_html (tables_on_page [object_idx ])
164
192
added_objects .add (mask_char )
165
193
elif object_type == ObjectType .FIGURE :
166
- if cu_describer is None :
167
- raise ValueError ("cu_describer should not be None, unable to describe figure" )
194
+ if media_describer is None :
195
+ raise ValueError ("media_describer should not be None, unable to describe figure" )
168
196
if object_idx is None :
169
197
raise ValueError ("Expected object_idx to be set" )
170
198
if mask_char not in added_objects :
171
199
figure_html = await DocumentAnalysisParser .figure_to_html (
172
- doc_for_pymupdf , figures_on_page [object_idx ], cu_describer
200
+ doc_for_pymupdf , figures_on_page [object_idx ], media_describer
173
201
)
174
202
page_text += figure_html
175
203
added_objects .add (mask_char )
@@ -182,7 +210,7 @@ class ObjectType(Enum):
182
210
183
211
@staticmethod
184
212
async def figure_to_html (
185
- doc : pymupdf .Document , figure : DocumentFigure , cu_describer : ContentUnderstandingDescriber
213
+ doc : pymupdf .Document , figure : DocumentFigure , media_describer : MediaDescriber
186
214
) -> str :
187
215
figure_title = (figure .caption and figure .caption .content ) or ""
188
216
logger .info ("Describing figure %s with title '%s'" , figure .id , figure_title )
@@ -200,7 +228,7 @@ async def figure_to_html(
200
228
)
201
229
page_number = first_region ["pageNumber" ] # 1-indexed
202
230
cropped_img = DocumentAnalysisParser .crop_image_from_pdf_page (doc , page_number - 1 , bounding_box )
203
- figure_description = await cu_describer .describe_image (cropped_img )
231
+ figure_description = await media_describer .describe_image (cropped_img )
204
232
return f"<figure><figcaption>{ figure_title } <br>{ figure_description } </figcaption></figure>"
205
233
206
234
@staticmethod
0 commit comments