|
1 | 1 | { |
2 | 2 | "cells": [ |
3 | 3 | { |
4 | | - "attachments": {}, |
5 | 4 | "cell_type": "markdown", |
6 | 5 | "id": "e908195c", |
7 | 6 | "metadata": {}, |
|
10 | 9 | ] |
11 | 10 | }, |
12 | 11 | { |
13 | | - "attachments": {}, |
14 | 12 | "cell_type": "markdown", |
15 | 13 | "id": "727614ba", |
16 | 14 | "metadata": {}, |
|
37 | 35 | "source": [] |
38 | 36 | }, |
39 | 37 | { |
40 | | - "attachments": {}, |
41 | 38 | "cell_type": "markdown", |
42 | 39 | "id": "3848e558", |
43 | 40 | "metadata": {}, |
|
46 | 43 | ] |
47 | 44 | }, |
48 | 45 | { |
49 | | - "attachments": {}, |
50 | 46 | "cell_type": "markdown", |
51 | 47 | "id": "01a62fe4", |
52 | 48 | "metadata": {}, |
|
102 | 98 | ] |
103 | 99 | }, |
104 | 100 | { |
105 | | - "attachments": {}, |
106 | 101 | "cell_type": "markdown", |
107 | 102 | "id": "63e3b32b", |
108 | 103 | "metadata": {}, |
|
222 | 217 | ] |
223 | 218 | }, |
224 | 219 | { |
225 | | - "attachments": {}, |
226 | 220 | "cell_type": "markdown", |
227 | 221 | "id": "15d69b6b", |
228 | 222 | "metadata": {}, |
|
231 | 225 | ] |
232 | 226 | }, |
233 | 227 | { |
234 | | - "attachments": {}, |
235 | 228 | "cell_type": "markdown", |
236 | 229 | "id": "5c9e618c", |
237 | 230 | "metadata": {}, |
|
324 | 317 | ] |
325 | 318 | }, |
326 | 319 | { |
327 | | - "attachments": {}, |
328 | 320 | "cell_type": "markdown", |
329 | 321 | "id": "258531fe", |
330 | 322 | "metadata": {}, |
|
369 | 361 | ] |
370 | 362 | }, |
371 | 363 | { |
372 | | - "attachments": {}, |
373 | 364 | "cell_type": "markdown", |
374 | 365 | "id": "10e1d3df", |
375 | 366 | "metadata": {}, |
|
378 | 369 | ] |
379 | 370 | }, |
380 | 371 | { |
381 | | - "attachments": {}, |
382 | 372 | "cell_type": "markdown", |
383 | 373 | "id": "52943c00", |
384 | 374 | "metadata": {}, |
|
461 | 451 | ] |
462 | 452 | }, |
463 | 453 | { |
464 | | - "attachments": {}, |
465 | 454 | "cell_type": "markdown", |
466 | 455 | "id": "0f7fea99", |
467 | 456 | "metadata": {}, |
|
528 | 517 | ] |
529 | 518 | }, |
530 | 519 | { |
531 | | - "attachments": {}, |
532 | 520 | "cell_type": "markdown", |
533 | 521 | "id": "cde38923", |
534 | 522 | "metadata": {}, |
|
730 | 718 | "outputs": [], |
731 | 719 | "source": [ |
732 | 720 | "# pipeline-api\n", |
| 721 | + "from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES\n", |
| 722 | + "import logging\n", |
| 723 | + "\n", |
| 724 | + "logger = logging.getLogger(\"unstructured_api\")\n", |
733 | 725 | "\n", |
734 | 726 | "def pipeline_api(\n", |
735 | 727 | " file, \n", |
|
741 | 733 | " m_encoding=[],\n", |
742 | 734 | " m_xml_keep_tags=[],\n", |
743 | 735 | " m_pdf_infer_table_structure = [],\n", |
| 736 | + " m_hi_res_model_name=[],\n", |
744 | 737 | " file_content_type=None,\n", |
745 | 738 | " response_type=\"application/json\"\n", |
746 | 739 | "):\n", |
| 740 | + " logger.debug(\n", |
| 741 | + " f\"\\npipeline_api input params:\\n\"\n", |
| 742 | + " f\"filename: {filename}\\n\"\n", |
| 743 | + " f\"m_strategy: {m_strategy}\\n\"\n", |
| 744 | + " f\"m_coordinates: {m_coordinates}\\n\"\n", |
| 745 | + " f\"m_ocr_languages: {m_ocr_languages}\\n\"\n", |
| 746 | + " f\"m_encoding: {m_encoding}\\n\"\n", |
| 747 | + " f\"m_xml_keep_tags: {m_xml_keep_tags}\\n\"\n", |
| 748 | + " f\"m_pdf_infer_table_structure: {m_pdf_infer_table_structure}\\n\"\n", |
| 749 | + " f\"m_hi_res_model_name: {m_hi_res_model_name}\\n\"\n", |
| 750 | + " f\"file_content_type: {file_content_type}\\n\"\n", |
| 751 | + " f\"response_type: {response_type}\"\n", |
| 752 | + " )\n", |
747 | 753 | " if filename.endswith(\".msg\"):\n", |
748 | 754 | " # Note(yuming): convert file type for msg files\n", |
749 | 755 | " # since fast api might send the wrong one.\n", |
|
760 | 766 | " show_coordinates_str = (m_coordinates[0] if len(m_coordinates) else \"false\").lower()\n", |
761 | 767 | " show_coordinates = show_coordinates_str == \"true\"\n", |
762 | 768 | " \n", |
| 769 | + " hi_res_model_name = m_hi_res_model_name[0] if len(m_hi_res_model_name) else None\n", |
| 770 | + " \n", |
| 771 | + " if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates:\n", |
| 772 | + " raise HTTPException(status_code=400, detail=f\"coordinates aren't available when using the {hi_res_model_name} model type\")\n", |
| 773 | + " \n", |
763 | 774 | " # Parallel mode is set by env variable\n", |
764 | 775 | " enable_parallel_mode = os.environ.get(\"UNSTRUCTURED_PARALLEL_MODE_ENABLED\", \"false\")\n", |
765 | 776 | " pdf_parallel_mode_enabled = enable_parallel_mode == \"true\"\n", |
|
780 | 791 | " pdf_infer_table_structure = False\n", |
781 | 792 | " \n", |
782 | 793 | " try:\n", |
| 794 | + " logger.debug(\n", |
| 795 | + " f\"\\npartition input data:\\n\"\n", |
| 796 | + " f\"content_type: {file_content_type}\\n\"\n", |
| 797 | + " f\"strategy: {strategy}\\n\"\n", |
| 798 | + " f\"ocr_languages: {ocr_languages}\\n\"\n", |
| 799 | + " f\"coordinates: {show_coordinates}\\n\"\n", |
| 800 | + " f\"pdf_infer_table_structure: {pdf_infer_table_structure}\\n\"\n", |
| 801 | + " f\"encoding: {encoding}\\n\"\n", |
| 802 | + " f\"model_name: {hi_res_model_name}\\n\"\n", |
| 803 | + " f\"xml_keep_tags: {xml_keep_tags}\\n\"\n", |
| 804 | + " )\n", |
| 805 | + " \n", |
783 | 806 | " if file_content_type == \"application/pdf\" and pdf_parallel_mode_enabled:\n", |
784 | 807 | " elements = partition_pdf_splits(\n", |
785 | 808 | " request,\n", |
|
791 | 814 | " coordinates=show_coordinates,\n", |
792 | 815 | " pdf_infer_table_structure=pdf_infer_table_structure,\n", |
793 | 816 | " encoding=encoding,\n", |
| 817 | + " model_name=hi_res_model_name\n", |
794 | 818 | " )\n", |
795 | 819 | " else:\n", |
796 | 820 | " elements = partition(\n", |
|
802 | 826 | " pdf_infer_table_structure=pdf_infer_table_structure,\n", |
803 | 827 | " encoding=encoding,\n", |
804 | 828 | " xml_keep_tags=xml_keep_tags,\n", |
| 829 | + " model_name=hi_res_model_name\n", |
805 | 830 | " )\n", |
806 | 831 | " except ValueError as e:\n", |
807 | 832 | " if 'Invalid file' in e.args[0]:\n", |
|
968 | 993 | ] |
969 | 994 | }, |
970 | 995 | { |
971 | | - "attachments": {}, |
972 | 996 | "cell_type": "markdown", |
973 | 997 | "id": "e997bff5", |
974 | 998 | "metadata": {}, |
|
0 commit comments