1616from gradio .data_classes import FileData
1717import numpy as np
1818
19+ from io import BytesIO
20+ import base64
21+
1922from fastchat .constants import (
2023 TEXT_MODERATION_MSG ,
2124 IMAGE_MODERATION_MSG ,
@@ -217,29 +220,38 @@ def wrap_pdfchat_query(query, document):
217220
218221# def parse_pdf(file_path):
219222# from llama_parse import LlamaParse
223+ # from llama_index.core.schema import ImageDocument, TextNode
224+
225+ # from PIL import Image
220226
221- # assert (
222- # "LLAMA_CLOUD_API_KEY" in os.environ
223- # ), "Make sure to specify LlamaParse API key."
224-
225- # for _ in range(LLAMA_PARSE_MAX_RETRY):
226- # try:
227- # documents = LlamaParse(
228- # result_type="markdown",
229- # verbose=True,
230- # languages=list(LLAMAPARSE_SUPPORTED_LANGS.values()),
231- # accurate_mode=True,
232- # ).load_data(file_path)
233- # assert len(documents) > 0
234- # break
235- # except AssertionError as e:
236- # continue
237-
238- # output = "\n".join(
239- # [f"Page {i+1}:\n{doc.text}\n" for i, doc in enumerate(documents)]
227+ # parser = LlamaParse(
228+ # api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
229+ # result_type="markdown",
240230# )
241231
242- # return output
232+ # def get_image_nodes(json_objs: List[dict], download_path: str):
233+ # image_dicts = parser.get_images(json_objs, download_path=download_path)
234+ # return [ImageDocument(image_path=image_dict["path"]) for image_dict in image_dicts]
235+
236+ # json_objs = parser.get_json_result(file_path)
237+ # json_list = json_objs[0]["pages"]
238+
239+ # text = ""
240+ # for page in json_list:
241+ # text += f"Page {page['page']}:\n{page['md']}\n"
242+ # if (page['images']):
243+ # for i, image in enumerate(page['images']):
244+ # text += f"page{page['page']}_figure{i + 1}\n"
245+
246+ # image_documents = get_image_nodes(json_objs, ".")
247+ # images = []
248+
249+ # for image_doc in image_documents:
250+ # image_path = image_doc.image_path
251+ # image = Image.open(image_path)
252+ # images.append(image)
253+
254+ # return text, images
243255
244256
# Retry count for PDF parsing (presumably the number of parse attempts).
# NOTE(review): the parse_pdf visible in this file does not read this
# constant -- confirm it is still referenced elsewhere.
PDFPARSE_MAX_RETRY = 2
@@ -259,29 +271,48 @@ def wrap_pdfchat_query(query, document):
259271 "languages" : "," .join (PDFPARSE_SUPPORTED_LANGS .values ()),
260272}
261273
def convert_base64_to_pil_image(b64_string):
    """Decode a base64-encoded image payload into a PIL image.

    Args:
        b64_string: Base64 data holding the bytes of an encoded image file.

    Returns:
        The image object returned by ``PIL.Image.open`` for the decoded bytes,
        backed by an in-memory ``BytesIO`` buffer.
    """
    # Function-scope import: Pillow is loaded only when this helper runs.
    from PIL import Image

    return Image.open(BytesIO(base64.b64decode(b64_string)))
262282
def parse_pdf(file_path):
    """Convert a PDF to markdown and images through the Datalab Marker API.

    The file is uploaded to the hosted Marker endpoint; the
    ``request_check_url`` from the upload response is then polled every
    2 seconds, for at most 300 polls, until the job reports
    ``status == "complete"``.

    Args:
        file_path: Path of the PDF file to upload.

    Returns:
        A ``(output_md, output_images)`` tuple: the ``markdown`` field of the
        final response, and a list of PIL images decoded from the base64
        values of its ``images`` mapping (empty when that mapping is absent
        or null).

    Raises:
        requests.HTTPError: If the upload or a poll returns an HTTP error
            status.
        TimeoutError: If the job is still not complete after the last poll.
    """
    import requests

    url = "https://www.datalab.to/api/v1/marker"
    headers = {"X-Api-Key": os.getenv("X-Api-Key")}

    # Per-request timeout in seconds so a stalled connection cannot hang the
    # caller indefinitely.
    request_timeout = 120
    poll_interval = 2
    max_polls = 300

    # Keep the upload inside ``with`` so the file handle is always closed,
    # including when the request raises.
    with open(file_path, "rb") as pdf_file:
        form_data = {
            "file": ("test.pdf", pdf_file, "application/pdf"),
            "langs": (None, "English"),
            "force_ocr": (None, False),
            "paginate": (None, False),
            "output_format": (None, "markdown"),
            "use_llm": (None, True),
            "strip_existing_ocr": (None, False),
            "disable_image_extraction": (None, False),
        }
        response = requests.post(
            url, files=form_data, headers=headers, timeout=request_timeout
        )

    # Surface auth/server failures here instead of as a KeyError on the body.
    response.raise_for_status()
    data = response.json()
    check_url = data["request_check_url"]

    for _ in range(max_polls):
        time.sleep(poll_interval)
        response = requests.get(
            check_url, headers=headers, timeout=request_timeout
        )
        response.raise_for_status()
        data = response.json()

        if data["status"] == "complete":
            break
    else:
        # Never fall through and read results from an unfinished job.
        raise TimeoutError(
            f"PDF conversion did not complete after {max_polls} polls "
            f"({max_polls * poll_interval} seconds)."
        )

    output_md = data["markdown"]
    # A missing or null ``images`` entry yields no images rather than a crash.
    output_images = [
        convert_base64_to_pil_image(b64_image)
        for b64_image in (data.get("images") or {}).values()
    ]

    return output_md, output_images
287318
0 commit comments