|
549 | 549 | "\n", |
550 | 550 | "from concurrent.futures import ThreadPoolExecutor\n", |
551 | 551 | "from functools import partial\n", |
| 552 | + "import pypdf\n", |
552 | 553 | "from pypdf import PdfReader, PdfWriter\n", |
553 | 554 | "from unstructured.partition.auto import partition\n", |
554 | 555 | "from unstructured.staging.base import convert_to_isd, convert_to_dataframe, elements_from_json\n", |
555 | | - "import tempfile\n", |
556 | | - "import pdfminer\n", |
557 | 556 | "import requests\n", |
558 | 557 | "import time" |
559 | 558 | ] |
|
598 | 597 | "source": [ |
599 | 598 | "# pipeline-api\n", |
600 | 599 | "\n", |
601 | | - "def get_pdf_splits(pdf, split_size=1):\n", |
| 600 | + "def get_pdf_splits(pdf_pages, split_size=1):\n", |
602 | 601 | " '''\n", |
603 | 602 | " Given the pages of a pdf (PdfReader.pages), n pages in total, split them into pdfs each with split_size # of pages\n", |
604 | 603 | " Return the files with their page offset in the form [( BytesIO, int)]\n", |
|
607 | 606 | "\n", |
608 | 607 | " offset = 0\n", |
609 | 608 | "\n", |
610 | | - " while offset < len(pdf.pages):\n", |
| 609 | + " while offset < len(pdf_pages):\n", |
611 | 610 | " new_pdf = PdfWriter()\n", |
612 | 611 | " pdf_buffer = io.BytesIO()\n", |
613 | 612 | "\n", |
614 | 613 | " end = offset+split_size\n", |
615 | | - " for page in pdf.pages[offset : end]:\n", |
| 614 | + " for page in pdf_pages[offset : end]:\n", |
616 | 615 | " new_pdf.add_page(page)\n", |
617 | 616 | "\n", |
618 | 617 | " new_pdf.write(pdf_buffer)\n", |
|
678 | 677 | "\n", |
679 | 678 | " return elements\n", |
680 | 679 | "\n", |
681 | | - "def partition_pdf_splits(request, file, file_filename, content_type, coordinates, **partition_kwargs):\n", |
| 680 | + "def partition_pdf_splits(request, pdf_pages, file, file_filename, content_type, coordinates, **partition_kwargs):\n", |
682 | 681 | " '''\n", |
683 | 682 | " Split a pdf into chunks and process in parallel with more api calls, or partition\n", |
684 | 683 | " locally if the chunk is small enough. As soon as any remote call fails, bubble up\n", |
|
691 | 690 | " partition_kwargs holds any other parameters that will be forwarded, or passed to partition\n", |
692 | 691 | " ''' \n", |
693 | 692 | " pages_per_pdf = int(os.environ.get(\"UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE\", 1))\n", |
694 | | - " pdf = PdfReader(file)\n", |
695 | 693 | "\n", |
696 | 694 | " # If it's small enough, just process locally\n", |
697 | | - " if len(pdf.pages) <= pages_per_pdf:\n", |
| 695 | + " if len(pdf_pages) <= pages_per_pdf:\n", |
698 | 696 | " return partition(\n", |
699 | 697 | " file=file,\n", |
700 | 698 | " file_filename=file_filename,\n", |
|
703 | 701 | " )\n", |
704 | 702 | "\n", |
705 | 703 | " results = []\n", |
706 | | - " page_tuples = get_pdf_splits(pdf, split_size=pages_per_pdf)\n", |
| 704 | + " page_tuples = get_pdf_splits(pdf_pages, split_size=pages_per_pdf)\n", |
707 | 705 | " \n", |
708 | 706 | " partition_func = partial(\n", |
709 | 707 | " partition_file_via_api,\n", |
|
771 | 769 | " # Note(yuming): convert file type for msg files\n", |
772 | 770 | " # since fast api might send the wrong one.\n", |
773 | 771 | " file_content_type = \"application/x-ole-storage\"\n", |
| 772 | + " \n", |
| 773 | + " if filename.endswith(\".pdf\"):\n", |
| 774 | + " try: \n", |
| 775 | + " pdf = PdfReader(file)\n", |
| 776 | + " except pypdf.errors.EmptyFileError:\n", |
| 777 | + " raise HTTPException(\n", |
| 778 | + " status_code=400,\n", |
| 779 | + " detail=f\"{filename} does not appear to be a valid PDF\"\n", |
| 780 | + " )\n", |
| 781 | + " if pdf.is_encrypted:\n", |
| 782 | + " raise HTTPException(\n", |
| 783 | + " status_code=400,\n", |
| 784 | + " detail=f\"File: {filename} is encrypted. Please decrypt it with password.\"\n", |
| 785 | + " )\n", |
774 | 786 | " \n", |
775 | 787 | " strategy = (m_strategy[0] if len(m_strategy) else 'auto').lower()\n", |
776 | 788 | " strategies = ['fast', 'hi_res', 'auto', 'ocr_only']\n", |
|
828 | 840 | " if file_content_type == \"application/pdf\" and pdf_parallel_mode_enabled:\n", |
829 | 841 | " elements = partition_pdf_splits(\n", |
830 | 842 | " request,\n", |
| 843 | + " pdf_pages = pdf.pages,\n", |
831 | 844 | " file=file,\n", |
832 | 845 | " file_filename=filename,\n", |
833 | 846 | " content_type=file_content_type,\n", |
|
856 | 869 | " if 'Invalid file' in e.args[0]:\n", |
857 | 870 | " raise HTTPException(status_code=400, detail=f\"{file_content_type} not currently supported\")\n", |
858 | 871 | " raise e\n", |
859 | | - " except pdfminer.pdfparser.PDFSyntaxError:\n", |
860 | | - " raise HTTPException(status_code=400, detail=f\"{filename} does not appear to be a valid PDF\")\n", |
861 | 872 | "\n", |
862 | 873 | " if response_type == \"text/csv\":\n", |
863 | 874 | " df = convert_to_dataframe(elements)\n", |
|
0 commit comments