Skip to content

Commit 043a1e6

Browse files
authored
Expose model name in api (#148)
* Add in latest unstructured dependency as a requirement * Add update to api code to expose the model name as an optional parameter * Update docker to use multistage builds and initialize the chipper model * Remove unneeded logger configs in notebook * Fix smoketest * regenerate api * tidy notebooks * Bump api tools version * regenerate api * fix unit test * update response code to 400 from 403
1 parent 74680fc commit 043a1e6

File tree

12 files changed

+151
-44
lines changed

12 files changed

+151
-44
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
## 0.0.31
22

3+
* Support model name as api parameter
34
* Add retry parameters on fanout requests
45
* Bump unstructured library to 0.8.1
56
* Fix how to remove an element's coordinate information

Dockerfile

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# syntax=docker/dockerfile:experimental
2-
FROM quay.io/unstructured-io/base-images:rocky8.7-3
2+
FROM quay.io/unstructured-io/base-images:rocky8.7-3 as base
33

44
# NOTE(crag): NB_USER ARG for mybinder.org compat:
55
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html
@@ -21,7 +21,7 @@ RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
2121
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
2222
ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"
2323

24-
24+
FROM base as python-deps
2525
# COPY requirements/dev.txt requirements-dev.txt
2626
COPY requirements/base.txt requirements-base.txt
2727
RUN python3.8 -m pip install pip==${PIP_VERSION} \
@@ -33,10 +33,13 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} \
3333

3434
USER ${NB_USER}
3535

36+
FROM python-deps as model-deps
3637
RUN python3.8 -c "import nltk; nltk.download('punkt')" && \
3738
python3.8 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
38-
python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
39+
UNSTRUCTURED_HI_RES_SUPPORTED_MODEL=chipper python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
40+
3941

42+
FROM model-deps as code
4043
COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md
4144
COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml
4245
COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/
@@ -46,6 +49,6 @@ COPY --chown=${NB_USER}:${NB_USER} pipeline-notebooks pipeline-notebooks
4649
ENTRYPOINT ["uvicorn", "prepline_general.api.app:app", \
4750
"--log-config", "logger_config.yaml", \
4851
"--host", "0.0.0.0"]
49-
# Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port,
50-
# but some tooling will inspect containers and perform work contingent on networking support declared.
51-
EXPOSE 8000
52+
# Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port,
53+
# but some tooling will inspect containers and perform work contingent on networking support declared.
54+
EXPOSE 8000

logger_config.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,9 @@ loggers:
4444
handlers:
4545
- standard_handler
4646
propagate: no
47+
unstructured_api:
48+
level: DEBUG
49+
handlers:
50+
- standard_handler
51+
propagate: no
4752

pipeline-notebooks/pipeline-general.ipynb

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
{
22
"cells": [
33
{
4-
"attachments": {},
54
"cell_type": "markdown",
65
"id": "e908195c",
76
"metadata": {},
@@ -10,7 +9,6 @@
109
]
1110
},
1211
{
13-
"attachments": {},
1412
"cell_type": "markdown",
1513
"id": "727614ba",
1614
"metadata": {},
@@ -37,7 +35,6 @@
3735
"source": []
3836
},
3937
{
40-
"attachments": {},
4138
"cell_type": "markdown",
4239
"id": "3848e558",
4340
"metadata": {},
@@ -46,7 +43,6 @@
4643
]
4744
},
4845
{
49-
"attachments": {},
5046
"cell_type": "markdown",
5147
"id": "01a62fe4",
5248
"metadata": {},
@@ -102,7 +98,6 @@
10298
]
10399
},
104100
{
105-
"attachments": {},
106101
"cell_type": "markdown",
107102
"id": "63e3b32b",
108103
"metadata": {},
@@ -222,7 +217,6 @@
222217
]
223218
},
224219
{
225-
"attachments": {},
226220
"cell_type": "markdown",
227221
"id": "15d69b6b",
228222
"metadata": {},
@@ -231,7 +225,6 @@
231225
]
232226
},
233227
{
234-
"attachments": {},
235228
"cell_type": "markdown",
236229
"id": "5c9e618c",
237230
"metadata": {},
@@ -324,7 +317,6 @@
324317
]
325318
},
326319
{
327-
"attachments": {},
328320
"cell_type": "markdown",
329321
"id": "258531fe",
330322
"metadata": {},
@@ -369,7 +361,6 @@
369361
]
370362
},
371363
{
372-
"attachments": {},
373364
"cell_type": "markdown",
374365
"id": "10e1d3df",
375366
"metadata": {},
@@ -378,7 +369,6 @@
378369
]
379370
},
380371
{
381-
"attachments": {},
382372
"cell_type": "markdown",
383373
"id": "52943c00",
384374
"metadata": {},
@@ -461,7 +451,6 @@
461451
]
462452
},
463453
{
464-
"attachments": {},
465454
"cell_type": "markdown",
466455
"id": "0f7fea99",
467456
"metadata": {},
@@ -528,7 +517,6 @@
528517
]
529518
},
530519
{
531-
"attachments": {},
532520
"cell_type": "markdown",
533521
"id": "cde38923",
534522
"metadata": {},
@@ -730,6 +718,10 @@
730718
"outputs": [],
731719
"source": [
732720
"# pipeline-api\n",
721+
"from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES\n",
722+
"import logging\n",
723+
"\n",
724+
"logger = logging.getLogger(\"unstructured_api\")\n",
733725
"\n",
734726
"def pipeline_api(\n",
735727
" file, \n",
@@ -741,9 +733,23 @@
741733
" m_encoding=[],\n",
742734
" m_xml_keep_tags=[],\n",
743735
" m_pdf_infer_table_structure = [],\n",
736+
" m_hi_res_model_name=[],\n",
744737
" file_content_type=None,\n",
745738
" response_type=\"application/json\"\n",
746739
"):\n",
740+
" logger.debug(\n",
741+
" f\"\\npipeline_api input params:\\n\"\n",
742+
" f\"filename: {filename}\\n\"\n",
743+
" f\"m_strategy: {m_strategy}\\n\"\n",
744+
" f\"m_coordinates: {m_coordinates}\\n\"\n",
745+
" f\"m_ocr_languages: {m_ocr_languages}\\n\"\n",
746+
" f\"m_encoding: {m_encoding}\\n\"\n",
747+
" f\"m_xml_keep_tags: {m_xml_keep_tags}\\n\"\n",
748+
" f\"m_pdf_infer_table_structure: {m_pdf_infer_table_structure}\\n\"\n",
749+
" f\"m_hi_res_model_name: {m_hi_res_model_name}\\n\"\n",
750+
" f\"file_content_type: {file_content_type}\\n\"\n",
751+
" f\"response_type: {response_type}\"\n",
752+
" )\n",
747753
" if filename.endswith(\".msg\"):\n",
748754
" # Note(yuming): convert file type for msg files\n",
749755
" # since fast api might send the wrong one.\n",
@@ -760,6 +766,11 @@
760766
" show_coordinates_str = (m_coordinates[0] if len(m_coordinates) else \"false\").lower()\n",
761767
" show_coordinates = show_coordinates_str == \"true\"\n",
762768
" \n",
769+
" hi_res_model_name = m_hi_res_model_name[0] if len(m_hi_res_model_name) else None\n",
770+
" \n",
771+
" if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates:\n",
772+
" raise HTTPException(status_code=400, detail=f\"coordinates aren't available when using the {hi_res_model_name} model type\")\n",
773+
" \n",
763774
" # Parallel mode is set by env variable\n",
764775
" enable_parallel_mode = os.environ.get(\"UNSTRUCTURED_PARALLEL_MODE_ENABLED\", \"false\")\n",
765776
" pdf_parallel_mode_enabled = enable_parallel_mode == \"true\"\n",
@@ -780,6 +791,18 @@
780791
" pdf_infer_table_structure = False\n",
781792
" \n",
782793
" try:\n",
794+
" logger.debug(\n",
795+
" f\"\\npartition input data:\\n\"\n",
796+
" f\"content_type: {file_content_type}\\n\"\n",
797+
" f\"strategy: {strategy}\\n\"\n",
798+
" f\"ocr_languages: {ocr_languages}\\n\"\n",
799+
" f\"coordinates: {show_coordinates}\\n\"\n",
800+
" f\"pdf_infer_table_structure: {pdf_infer_table_structure}\\n\"\n",
801+
" f\"encoding: {encoding}\\n\"\n",
802+
" f\"model_name: {hi_res_model_name}\\n\"\n",
803+
" f\"xml_keep_tags: {xml_keep_tags}\\n\"\n",
804+
" )\n",
805+
" \n",
783806
" if file_content_type == \"application/pdf\" and pdf_parallel_mode_enabled:\n",
784807
" elements = partition_pdf_splits(\n",
785808
" request,\n",
@@ -791,6 +814,7 @@
791814
" coordinates=show_coordinates,\n",
792815
" pdf_infer_table_structure=pdf_infer_table_structure,\n",
793816
" encoding=encoding,\n",
817+
" model_name=hi_res_model_name\n",
794818
" )\n",
795819
" else:\n",
796820
" elements = partition(\n",
@@ -802,6 +826,7 @@
802826
" pdf_infer_table_structure=pdf_infer_table_structure,\n",
803827
" encoding=encoding,\n",
804828
" xml_keep_tags=xml_keep_tags,\n",
829+
" model_name=hi_res_model_name\n",
805830
" )\n",
806831
" except ValueError as e:\n",
807832
" if 'Invalid file' in e.args[0]:\n",
@@ -968,7 +993,6 @@
968993
]
969994
},
970995
{
971-
"attachments": {},
972996
"cell_type": "markdown",
973997
"id": "e997bff5",
974998
"metadata": {},

prepline_general/api/app.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ def filter(self, record: logging.LogRecord) -> bool:
3939
return record.getMessage().find("/healthcheck") == -1
4040

4141

42+
# Filter out /metrics noise
43+
class MetricsCheckFilter(logging.Filter):
    """Logging filter that rejects records whose message mentions ``/metrics``."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Decide whether *record* should be logged.

        Returns ``False`` when the fully formatted message contains the
        substring ``/metrics`` and ``True`` otherwise.
        """
        # getMessage() merges record.args into record.msg, so the check runs
        # against the final text rather than the raw format string.
        return "/metrics" not in record.getMessage()
46+
47+
4248
logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter())
4349

4450

0 commit comments

Comments
 (0)