Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions magic_pdf/pipe/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def dump_md(
pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
)
writer.write_string(file_path, md_content)
return md_content

def dump_content_list(
self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
Expand All @@ -66,6 +67,7 @@ def dump_content_list(
writer.write_string(
file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
)
return content_list

def dump_middle_json(self, writer: DataWriter, file_path: str):
"""Dump the result of pipeline.
Expand Down
48 changes: 48 additions & 0 deletions projects/web_api/app_v0_10_6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Literal

from fastapi import FastAPI, UploadFile, HTTPException

from pdf_parse_main import pdf_parse_main

# FastAPI application object; the /pdf-parse route below is registered on it.
app = FastAPI()

# Closed set of values accepted for the `parse_method` request parameter.
parse_allowed_methods = Literal["auto", "txt", "ocr"]

# Default value of the `save_path` request parameter, i.e. the directory
# under which parsing result files are written when the client does not
# specify one.
PDF_OUTPUT_PATH = "/tmp/output"


@app.post("/pdf-parse")
async def pdf_parse(
    file: UploadFile,
    parse_method: parse_allowed_methods = "auto",
    is_output: bool = False,
    save_path: str = PDF_OUTPUT_PATH,
):
    """Parse an uploaded PDF and return its extracted content.

    Args:
        file: The uploaded file. Its name must end with ``.pdf``
            (case-insensitive).
        parse_method: Parsing strategy, one of ``auto``, ``txt`` or ``ocr``.
        is_output: Whether to keep the parsing result files on disk.
        save_path: Directory under which the parsing result files are saved.

    Returns:
        A dict with the keys ``md_data``, ``content_list_data`` and
        ``txt_data`` holding the three values returned by ``pdf_parse_main``.

    Raises:
        HTTPException: 415 when the upload is not named ``*.pdf``, 400 when
            ``pdf_parse_main`` raises ``ValueError``, 500 for any other
            failure.
    """
    # The filename is optional in a multipart upload; a missing name is
    # treated the same as a non-PDF name instead of crashing on None.
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=415, detail="File type error")

    pdf_bytes = await file.read()

    # The filename is client-controlled and is used by pdf_parse_main to build
    # the output directory, so drop any directory components first. Only the
    # final extension is removed, so "report.v2.pdf" yields "report.v2".
    base_name = upload_name.replace("\\", "/").rsplit("/", 1)[-1]
    pdf_file_name = base_name.rsplit(".", 1)[0] or "noname"

    # NOTE(review): `save_path` is also client-controlled and decides where
    # files are written on the server; consider restricting it to a
    # configured root directory.
    try:
        md_content, list_content, txt_content = await pdf_parse_main(
            pdf_bytes, pdf_file_name, parse_method, is_output, save_path
        )
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve)) from ve
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {
        "md_data": md_content,
        "content_list_data": list_content,
        "txt_data": txt_content,
    }


if __name__ == "__main__":
    # Development entry point: serve the app on every interface, port 8999.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8999)
106 changes: 106 additions & 0 deletions projects/web_api/pdf_parse_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import os
from shutil import rmtree
from datetime import datetime

from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset


async def pdf_parse_main(
    pdf_file,
    pdf_file_name: str = "noname",
    parse_method: str = "auto",
    is_save_output: bool = False,
    save_path: str = None,
):
    """Parse a PDF and return its markdown, content list and plain text.

    Args:
        pdf_file: Path of an existing PDF file, or the PDF content as bytes.
        pdf_file_name: Base name used for the output folder and result files.
            Replaced by the file's own stem when ``pdf_file`` is a path.
        parse_method: ``auto``, ``txt`` or ``ocr``. With ``auto`` the method
            is chosen from the result of ``ds.classify()``.
        is_save_output: Whether to keep the output directory. When false the
            directory is removed before the function returns or raises.
        save_path: Directory under which the output folder is created. When
            empty or ``None`` the current working directory is used.

    Returns:
        A tuple ``(md_content, list_content, txt_content)``:
        the value returned by ``pipe_result.dump_md``, the value returned by
        ``pipe_result.dump_content_list``, and the ``"text"`` fields of the
        content list joined with newlines.

    Raises:
        ValueError: If ``parse_method`` is not supported, or ``pdf_file`` is
            neither an existing path nor bytes. Raised before anything is
            written to disk.
        Exception: If reading or parsing fails; the original error is chained
            as ``__cause__``.
    """
    # Validate arguments before touching the filesystem so that callers can
    # tell bad input (ValueError) apart from processing failures.
    if parse_method not in ("auto", "txt", "ocr"):
        raise ValueError(f"Unsupported parsing method: {parse_method}, please choose [auto, txt, ocr].")

    is_path = isinstance(pdf_file, str) and os.path.exists(pdf_file)
    if not is_path and not isinstance(pdf_file, bytes):
        raise ValueError(
            "pdf_file must be a file path or byte data. "
            "Please ensure the path is correct or provide the correct byte data."
        )

    local_md_dir = None

    try:
        if is_path:
            # A path was given: load its bytes and name the output after it.
            pdf_bytes = FileBasedDataReader().read(pdf_file)
            pdf_file_name = os.path.splitext(os.path.basename(pdf_file))[0]
        else:
            pdf_bytes = pdf_file

        # Create the output directory. The timestamp includes seconds plus
        # hundredths of a second so that runs within the same minute do not
        # end up sharing (and later deleting) one directory.
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")[:-4]
        base_dir = save_path if save_path else os.getcwd()
        local_md_dir = os.path.join(base_dir, f"{pdf_file_name}_{timestamp}")
        local_image_dir = os.path.join(local_md_dir, "images")
        os.makedirs(local_image_dir, exist_ok=True)

        md_writer = FileBasedDataWriter(local_md_dir)
        image_writer = FileBasedDataWriter(local_image_dir)

        ds = PymuDocDataset(pdf_bytes)

        if parse_method == "auto":
            parse_method = "ocr" if ds.classify() == SupportedPdfParseMethod.OCR else "txt"

        if parse_method == "txt":
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True)
        else:
            # Only "ocr" can remain here: the method was validated above.
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True)

        # Write the result files and keep their contents for the caller.
        md_content = pipe_result.dump_md(md_writer, f"{pdf_file_name}.md", local_image_dir)
        list_content = pipe_result.dump_content_list(
            md_writer, f"{pdf_file_name}_content_list.json", local_image_dir
        )

        # Plain text: the "text" field of every content-list entry.
        txt_content = "\n".join(i.get("text", "") for i in list_content)

        return md_content, list_content, txt_content

    except Exception as e:
        raise Exception(f"An error occurred when processing the file: {e}") from e

    finally:
        if not is_save_output and local_md_dir and os.path.exists(local_md_dir):
            # The caller did not ask to keep the files: delete the output directory.
            rmtree(local_md_dir)


# Manual smoke test: python pdf_parse_main.py [path/to/file.pdf]
if __name__ == "__main__":
    import asyncio
    import sys

    # The PDF to parse can be given on the command line; without an argument
    # the original sample path is used.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "/home/yzz/pdf_file_test/Quality.pdf"

    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # Bytes input exercises the same code path as the web API; passing
    # `pdf_path` itself instead would exercise the file-path branch.
    asyncio.run(pdf_parse_main(pdf_bytes, parse_method="auto", is_save_output=True))
Loading