Skip to content

Commit d6cff33

Browse files
authored
Chore: add support for include_page_breaks param (#153)
* add page break param to api * test inclusion of page breaks * add test file * fix: include page breaks test * rebase * update changelog and readme * added include page breaks param to smoke test
1 parent 2cd5ff9 commit d6cff33

File tree

9 files changed

+124
-24
lines changed

9 files changed

+124
-24
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
## 0.0.32-dev0
22

33
* Improve logging of params to single line json
4+
* Add support for `include_page_breaks` parameter
45

56
## 0.0.31
67

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,20 @@ curl -X 'POST'
136136
| jq -C . | less -R
137137
```
138138

139+
#### Page Breaks
140+
141+
For supported filetypes, set the `include_page_breaks` parameter to `true` to include `PageBreak` elements in the output.
142+
143+
```
144+
curl -X 'POST' \
145+
'https://api.unstructured.io/general/v0/general' \
146+
-H 'accept: application/json' \
147+
-H 'Content-Type: multipart/form-data' \
148+
-F 'files=@sample-docs/layout-parser-paper-fast.pdf' \
149+
-F 'include_page_breaks=true' \
150+
| jq -C . | less -R
151+
```
152+
139153
## Developer Quick Start
140154

141155
* Using `pyenv` to manage virtualenv's is recommended

pipeline-notebooks/pipeline-general.ipynb

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
"cells": [
33
{
4+
"attachments": {},
45
"cell_type": "markdown",
56
"id": "e908195c",
67
"metadata": {},
@@ -9,6 +10,7 @@
910
]
1011
},
1112
{
13+
"attachments": {},
1214
"cell_type": "markdown",
1315
"id": "727614ba",
1416
"metadata": {},
@@ -35,6 +37,7 @@
3537
"source": []
3638
},
3739
{
40+
"attachments": {},
3841
"cell_type": "markdown",
3942
"id": "3848e558",
4043
"metadata": {},
@@ -43,6 +46,7 @@
4346
]
4447
},
4548
{
49+
"attachments": {},
4650
"cell_type": "markdown",
4751
"id": "01a62fe4",
4852
"metadata": {},
@@ -98,6 +102,7 @@
98102
]
99103
},
100104
{
105+
"attachments": {},
101106
"cell_type": "markdown",
102107
"id": "63e3b32b",
103108
"metadata": {},
@@ -217,6 +222,7 @@
217222
]
218223
},
219224
{
225+
"attachments": {},
220226
"cell_type": "markdown",
221227
"id": "15d69b6b",
222228
"metadata": {},
@@ -225,6 +231,7 @@
225231
]
226232
},
227233
{
234+
"attachments": {},
228235
"cell_type": "markdown",
229236
"id": "5c9e618c",
230237
"metadata": {},
@@ -317,6 +324,7 @@
317324
]
318325
},
319326
{
327+
"attachments": {},
320328
"cell_type": "markdown",
321329
"id": "258531fe",
322330
"metadata": {},
@@ -361,6 +369,7 @@
361369
]
362370
},
363371
{
372+
"attachments": {},
364373
"cell_type": "markdown",
365374
"id": "10e1d3df",
366375
"metadata": {},
@@ -369,6 +378,7 @@
369378
]
370379
},
371380
{
381+
"attachments": {},
372382
"cell_type": "markdown",
373383
"id": "52943c00",
374384
"metadata": {},
@@ -451,6 +461,7 @@
451461
]
452462
},
453463
{
464+
"attachments": {},
454465
"cell_type": "markdown",
455466
"id": "0f7fea99",
456467
"metadata": {},
@@ -517,6 +528,7 @@
517528
]
518529
},
519530
{
531+
"attachments": {},
520532
"cell_type": "markdown",
521533
"id": "cde38923",
522534
"metadata": {},
@@ -730,6 +742,7 @@
730742
" m_strategy=[],\n",
731743
" m_coordinates=[],\n",
732744
" m_ocr_languages=[],\n",
745+
" m_include_page_breaks=[],\n",
733746
" m_encoding=[],\n",
734747
" m_xml_keep_tags=[],\n",
735748
" m_pdf_infer_table_structure = [],\n",
@@ -744,6 +757,7 @@
744757
" \"m_strategy\": m_strategy,\n",
745758
" \"m_coordinates\": m_coordinates,\n",
746759
" \"m_ocr_languages\": m_ocr_languages,\n",
760+
" \"m_include_page_breaks\": m_include_page_breaks,\n",
747761
" \"m_encoding\": m_encoding,\n",
748762
" \"m_xml_keep_tags\": m_xml_keep_tags,\n",
749763
" \"m_pdf_infer_table_structure\": m_pdf_infer_table_structure,\n",
@@ -780,6 +794,9 @@
780794
" \n",
781795
" ocr_languages= ('+'.join(m_ocr_languages) if len(m_ocr_languages) else 'eng').lower()\n",
782796
"\n",
797+
" include_page_breaks_str = (m_include_page_breaks[0] if len(m_include_page_breaks) else \"false\").lower()\n",
798+
" include_page_breaks = include_page_breaks_str == \"true\"\n",
799+
" \n",
783800
" encoding = m_encoding[0] if len(m_encoding) else None\n",
784801
" \n",
785802
" xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else \"false\").lower()\n",
@@ -801,6 +818,7 @@
801818
" \"ocr_languages\": ocr_languages,\n",
802819
" \"coordinates\": show_coordinates,\n",
803820
" \"pdf_infer_table_structure\": pdf_infer_table_structure,\n",
821+
" \"include_page_breaks\": include_page_breaks,\n",
804822
" \"encoding\": encoding,\n",
805823
" \"model_name\": hi_res_model_name,\n",
806824
" \"xml_keep_tags\": xml_keep_tags\n",
@@ -817,6 +835,7 @@
817835
" ocr_languages=ocr_languages,\n",
818836
" coordinates=show_coordinates,\n",
819837
" pdf_infer_table_structure=pdf_infer_table_structure,\n",
838+
" include_page_breaks=include_page_breaks,\n",
820839
" encoding=encoding,\n",
821840
" model_name=hi_res_model_name\n",
822841
" )\n",
@@ -828,6 +847,7 @@
828847
" strategy=strategy,\n",
829848
" ocr_languages=ocr_languages,\n",
830849
" pdf_infer_table_structure=pdf_infer_table_structure,\n",
850+
" include_page_breaks=include_page_breaks,\n",
831851
" encoding=encoding,\n",
832852
" xml_keep_tags=xml_keep_tags,\n",
833853
" model_name=hi_res_model_name\n",
@@ -997,6 +1017,7 @@
9971017
]
9981018
},
9991019
{
1020+
"attachments": {},
10001021
"cell_type": "markdown",
10011022
"id": "e997bff5",
10021023
"metadata": {},

prepline_general/api/general.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ def pipeline_api(
202202
m_strategy=[],
203203
m_coordinates=[],
204204
m_ocr_languages=[],
205+
m_include_page_breaks=[],
205206
m_encoding=[],
206207
m_xml_keep_tags=[],
207208
m_pdf_infer_table_structure=[],
@@ -218,6 +219,7 @@ def pipeline_api(
218219
"m_strategy": m_strategy,
219220
"m_coordinates": m_coordinates,
220221
"m_ocr_languages": m_ocr_languages,
222+
"m_include_page_breaks": m_include_page_breaks,
221223
"m_encoding": m_encoding,
222224
"m_xml_keep_tags": m_xml_keep_tags,
223225
"m_pdf_infer_table_structure": m_pdf_infer_table_structure,
@@ -258,6 +260,11 @@ def pipeline_api(
258260

259261
ocr_languages = ("+".join(m_ocr_languages) if len(m_ocr_languages) else "eng").lower()
260262

263+
include_page_breaks_str = (
264+
m_include_page_breaks[0] if len(m_include_page_breaks) else "false"
265+
).lower()
266+
include_page_breaks = include_page_breaks_str == "true"
267+
261268
encoding = m_encoding[0] if len(m_encoding) else None
262269

263270
xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else "false").lower()
@@ -281,6 +288,7 @@ def pipeline_api(
281288
"ocr_languages": ocr_languages,
282289
"coordinates": show_coordinates,
283290
"pdf_infer_table_structure": pdf_infer_table_structure,
291+
"include_page_breaks": include_page_breaks,
284292
"encoding": encoding,
285293
"model_name": hi_res_model_name,
286294
"xml_keep_tags": xml_keep_tags,
@@ -300,6 +308,7 @@ def pipeline_api(
300308
ocr_languages=ocr_languages,
301309
coordinates=show_coordinates,
302310
pdf_infer_table_structure=pdf_infer_table_structure,
311+
include_page_breaks=include_page_breaks,
303312
encoding=encoding,
304313
model_name=hi_res_model_name,
305314
)
@@ -311,6 +320,7 @@ def pipeline_api(
311320
strategy=strategy,
312321
ocr_languages=ocr_languages,
313322
pdf_infer_table_structure=pdf_infer_table_structure,
323+
include_page_breaks=include_page_breaks,
314324
encoding=encoding,
315325
xml_keep_tags=xml_keep_tags,
316326
model_name=hi_res_model_name,
@@ -467,6 +477,7 @@ def pipeline_1(
467477
strategy: List[str] = Form(default=[]),
468478
coordinates: List[str] = Form(default=[]),
469479
ocr_languages: List[str] = Form(default=[]),
480+
include_page_breaks: List[str] = Form(default=[]),
470481
encoding: List[str] = Form(default=[]),
471482
xml_keep_tags: List[str] = Form(default=[]),
472483
pdf_infer_table_structure: List[str] = Form(default=[]),
@@ -513,6 +524,7 @@ def response_generator(is_multipart):
513524
m_strategy=strategy,
514525
m_coordinates=coordinates,
515526
m_ocr_languages=ocr_languages,
527+
m_include_page_breaks=include_page_breaks,
516528
m_encoding=encoding,
517529
m_xml_keep_tags=xml_keep_tags,
518530
m_pdf_infer_table_structure=pdf_infer_table_structure,

requirements/base.txt

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ filetype==1.2.0
6565
# via unstructured
6666
flatbuffers==23.5.26
6767
# via onnxruntime
68-
fonttools==4.40.0
68+
fonttools==4.41.0
6969
# via matplotlib
7070
fsspec==2023.6.0
7171
# via huggingface-hub
@@ -103,7 +103,7 @@ jinja2==3.1.2
103103
# unstructured-api-tools
104104
joblib==1.3.1
105105
# via nltk
106-
jsonschema==4.18.1
106+
jsonschema==4.18.3
107107
# via nbformat
108108
jsonschema-specifications==2023.6.1
109109
# via jsonschema
@@ -146,7 +146,7 @@ mypy-extensions==1.0.0
146146
# via mypy
147147
nbclient==0.8.0
148148
# via nbconvert
149-
nbconvert==7.6.0
149+
nbconvert==7.7.1
150150
# via unstructured-api-tools
151151
nbformat==5.9.1
152152
# via
@@ -203,7 +203,7 @@ pdfminer-six==20221105
203203
# via
204204
# pdfplumber
205205
# unstructured
206-
pdfplumber==0.9.0
206+
pdfplumber==0.10.0
207207
# via layoutparser
208208
pillow==10.0.0
209209
# via
@@ -217,7 +217,7 @@ pillow==10.0.0
217217
# unstructured
218218
pkgutil-resolve-name==1.3.10
219219
# via jsonschema
220-
platformdirs==3.8.1
220+
platformdirs==3.9.1
221221
# via jupyter-core
222222
portalocker==2.7.0
223223
# via iopath
@@ -239,8 +239,10 @@ pypandoc==1.11
239239
# via unstructured
240240
pyparsing==3.0.9
241241
# via matplotlib
242-
pypdf==3.12.1
242+
pypdf==3.12.2
243243
# via -r requirements/base.in
244+
pypdfium2==4.18.0
245+
# via pdfplumber
244246
pytesseract==0.3.10
245247
# via layoutparser
246248
python-dateutil==2.8.2
@@ -289,7 +291,7 @@ requests==2.31.0
289291
# torchvision
290292
# transformers
291293
# unstructured
292-
rpds-py==0.8.10
294+
rpds-py==0.8.11
293295
# via
294296
# jsonschema
295297
# referencing
@@ -379,12 +381,10 @@ unstructured-inference==0.5.5
379381
# via unstructured
380382
urllib3==2.0.3
381383
# via requests
382-
uvicorn[standard]==0.22.0
384+
uvicorn[standard]==0.23.0
383385
# via unstructured-api-tools
384386
uvloop==0.17.0
385387
# via uvicorn
386-
wand==0.6.11
387-
# via pdfplumber
388388
watchfiles==0.19.0
389389
# via uvicorn
390390
webencodings==0.5.1
@@ -397,7 +397,7 @@ xlrd==2.0.1
397397
# via unstructured
398398
xlsxwriter==3.1.2
399399
# via python-pptx
400-
zipp==3.16.0
400+
zipp==3.16.2
401401
# via
402402
# importlib-metadata
403403
# importlib-resources

0 commit comments

Comments
 (0)