@@ -232,10 +232,51 @@ def wrap_pdfchat_query(query, document):
232232 return reformatted_query_context
233233
234234
235- LLAMA_PARSE_MAX_RETRY = 2
236- LLAMAPARSE_SUPPORTED_LANGS = {
235+ # LLAMA_PARSE_MAX_RETRY = 2
236+ # LLAMAPARSE_SUPPORTED_LANGS = {
237+ # "English": "en",
238+ # "Chinese": "ch_sim",
239+ # "Russian": "ru",
240+ # "Spanish": "es",
241+ # "Japanese": "ja",
242+ # "Korean": "ko",
243+ # "French": "fr",
244+ # "German": "de",
245+ # "Vietnamese": "vi",
246+ # }
247+
248+
249+ # def parse_pdf(file_path):
250+ # from llama_parse import LlamaParse
251+
252+ # assert (
253+ # "LLAMA_CLOUD_API_KEY" in os.environ
254+ # ), "Make sure to specify LlamaParse API key."
255+
256+ # for _ in range(LLAMA_PARSE_MAX_RETRY):
257+ # try:
258+ # documents = LlamaParse(
259+ # result_type="markdown",
260+ # verbose=True,
261+ # languages=list(LLAMAPARSE_SUPPORTED_LANGS.values()),
262+ # accurate_mode=True,
263+ # ).load_data(file_path)
264+ # assert len(documents) > 0
265+ # break
266+ # except AssertionError as e:
267+ # continue
268+
269+ # output = "\n".join(
270+ # [f"Page {i+1}:\n{doc.text}\n" for i, doc in enumerate(documents)]
271+ # )
272+
273+ # return output
274+
275+
# Maximum number of conversion attempts parse_pdf makes for one document.
PDFPARSE_MAX_RETRY = 2

# Human-readable language name -> language code handed to the PDF parser.
PDFPARSE_SUPPORTED_LANGS = {
    "English": "en",
    "Chinese": "zh",
    "Russian": "ru",
    "Spanish": "es",
    "Japanese": "ja",
    "Korean": "ko",
    "French": "fr",
    "German": "de",
    "Vietnamese": "vi",
}

# Options passed to marker's ConfigParser in parse_pdf. The language codes
# are joined into a single comma-separated string.
MARKER_PDFPARSE_CONFIG = {
    "output_format": "markdown",
    "languages": ",".join(PDFPARSE_SUPPORTED_LANGS.values()),
}
247292
248293
def parse_pdf(file_path):
    """Convert a PDF into markdown text plus its extracted images via marker.

    The conversion is attempted up to ``PDFPARSE_MAX_RETRY`` times. Only an
    ``AssertionError`` triggers another attempt; any other exception
    propagates to the caller.

    Args:
        file_path: Path of the PDF file handed to the marker converter.

    Returns:
        A ``(markdown, images)`` tuple: ``markdown`` is ``rendered.markdown``
        and ``images`` is a list of the values of ``rendered.images``.
        Both are ``None`` when every attempt failed.
    """
    # Imported inside the function, so marker is only required when a PDF
    # is actually parsed.
    from marker.config.parser import ConfigParser
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict

    config_parser = ConfigParser(MARKER_PDFPARSE_CONFIG)
    converter = None

    output_md, output_images = None, None
    for attempt in range(1, PDFPARSE_MAX_RETRY + 1):
        try:
            # Build the converter (and its model artifacts) at most once per
            # call; a retry after a failed conversion reuses it instead of
            # constructing everything again.
            if converter is None:
                converter = PdfConverter(
                    config=config_parser.generate_config_dict(),
                    artifact_dict=create_model_dict(),
                    processor_list=config_parser.get_processors(),
                    renderer=config_parser.get_renderer(),
                )
            rendered = converter(file_path)
            output_md = rendered.markdown
            output_images = list(rendered.images.values())
            break
        except AssertionError as err:
            # Report the failure instead of swallowing it silently, then retry.
            print(
                f"PDF parsing attempt {attempt}/{PDFPARSE_MAX_RETRY} "
                f"failed: {err!r}"
            )

    return output_md, output_images
274318
275319
276320def _prepare_text_with_image (state , text , images , csam_flag ):
@@ -284,12 +328,26 @@ def _prepare_text_with_image(state, text, images, csam_flag):
284328 return text
285329
286330
331+ # def _prepare_text_with_pdf(text, pdfs):
332+ # if len(pdfs) > 0:
333+ # document_content = parse_pdf(pdfs[0])
334+ # print("Document processed")
335+ # text = wrap_pdfchat_query(text, document_content)
336+
337+ # return text
338+
339+
287340def _prepare_text_with_pdf (text , pdfs ):
288341 if len (pdfs ) > 0 :
289- document_content = parse_pdf (pdfs [0 ])
342+ parsed_text , imgs = parse_pdf (pdfs [0 ])
290343 print ("Document processed" )
291- text = wrap_pdfchat_query (text , document_content )
344+ wrapped_text = wrap_pdfchat_query (text , parsed_text )
292345
346+ imgs = convert_pdf_images_to_conversation_format (imgs )
347+
348+ if len (imgs ) > 0 :
349+ return wrapped_text , imgs
350+ return wrapped_text
293351 return text
294352
295353
@@ -307,6 +365,20 @@ def convert_images_to_conversation_format(images):
307365 return conv_images
308366
309367
def convert_pdf_images_to_conversation_format(images):
    """Convert images extracted from a PDF into conversation-format images.

    Args:
        images: Iterable of image objects produced by the PDF parser, or
            ``None``/empty when the document yielded no images.

    Returns:
        A list with one converted entry per input image, in input order;
        an empty list when there are no images.
    """
    MAX_NSFW_ENDPOINT_IMAGE_SIZE_IN_MB = 5 / 1.5

    # A failed parse hands over ``None`` instead of a list; treat it the same
    # as "no images" rather than raising on len(None).
    if not images:
        return []

    # The PDF parser returns PIL image objects instead of paths, so each one
    # is passed through ``pil_img`` with an empty URL.
    return [
        Image(url="").to_conversation_format(
            MAX_NSFW_ENDPOINT_IMAGE_SIZE_IN_MB, pil_img=img
        )
        for img in images
    ]
380+
381+
310382def moderate_input (state , text , all_conv_text , model_list , images , ip ):
311383 text_flagged = moderation_filter (all_conv_text , model_list )
312384 # flagged = moderation_filter(text, [state.model_name])
0 commit comments