Skip to content

Commit 0906f9e

Browse files
authored
Jj/297 max chars (#298)
Closes #297 Adds `max_characters` param for chunking. Refactors associated tests for chunking parameters.
1 parent c91d1b9 commit 0906f9e

File tree

7 files changed

+69
-55
lines changed

7 files changed

+69
-55
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.56-dev0
2+
* **Add `max_characters` param for chunking.** This param gives users additional control to "chunk" elements into larger or smaller `CompositeElement`s.
3+
4+
15
## 0.0.55
26

37
* Bump unstructured to 0.10.26

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,10 @@ Set the `chunking_strategy` to chunk text into larger or smaller elements. Defau
219219
reaches a length of n characters. Defaults to 500.
220220

221221
`new_after_n_chars`
222-
Cuts off new sections once they reach a length of "n" characters. Defaults to 1500.
222+
Cuts off new sections once they reach a length of "n" characters (soft max). Defaults to 1500.
223+
224+
`max_characters`
225+
Cuts off new sections once they reach a length of "n" characters (hard max). Defaults to 1500.
223226

224227
```
225228
curl -X 'POST'

openapi.json

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,13 @@
205205
"new_after_n_chars": {
206206
"type": "integer",
207207
"title": "New after n chars",
208-
"description": "If chunking strategy is set, cut off new sections after reaching a length of n chars. Default: 1500",
208+
"description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500",
209+
"example": 1500
210+
},
211+
"max_characters": {
212+
"type": "integer",
213+
"title": "Max Characters",
214+
"description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 1500",
209215
"example": 1500
210216
}
211217
},

prepline_general/api/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
app = FastAPI(
1212
title="Unstructured Pipeline API",
1313
description="""""",
14-
version="0.0.55",
14+
version="0.0.56",
1515
docs_url="/general/docs",
1616
openapi_url="/general/openapi.json",
1717
)

prepline_general/api/general.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ def pipeline_api(
269269
m_multipage_sections=[],
270270
m_combine_under_n_chars=[],
271271
m_new_after_n_chars=[],
272+
m_max_characters=[],
272273
):
273274
if filename.endswith(".msg"):
274275
# Note(yuming): convert file type for msg files
@@ -300,6 +301,7 @@ def pipeline_api(
300301
"m_multipage_sections": m_multipage_sections,
301302
"m_combine_under_n_chars": m_combine_under_n_chars,
302303
"new_after_n_chars": m_new_after_n_chars,
304+
"m_max_characters": m_max_characters,
303305
},
304306
default=str,
305307
)
@@ -403,6 +405,10 @@ def pipeline_api(
403405
else 1500
404406
)
405407

408+
max_characters = (
409+
int(m_max_characters[0]) if m_max_characters and m_max_characters[0].isdigit() else 1500
410+
)
411+
406412
try:
407413
logger.debug(
408414
"partition input data: {}".format(
@@ -423,6 +429,7 @@ def pipeline_api(
423429
"multipage_sections": multipage_sections,
424430
"combine_under_n_chars": combine_under_n_chars,
425431
"new_after_n_chars": new_after_n_chars,
432+
"max_characters": max_characters,
426433
},
427434
default=str,
428435
)
@@ -446,6 +453,7 @@ def pipeline_api(
446453
"multipage_sections": multipage_sections,
447454
"combine_under_n_chars": combine_under_n_chars,
448455
"new_after_n_chars": new_after_n_chars,
456+
"max_characters": max_characters,
449457
}
450458

451459
if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -621,7 +629,7 @@ def return_content_type(filename):
621629

622630

623631
@router.post("/general/v0/general")
624-
@router.post("/general/v0.0.55/general")
632+
@router.post("/general/v0.0.56/general")
625633
def pipeline_1(
626634
request: Request,
627635
gz_uncompressed_content_type: Optional[str] = Form(default=None),
@@ -641,6 +649,7 @@ def pipeline_1(
641649
multipage_sections: List[str] = Form(default=[]),
642650
combine_under_n_chars: List[str] = Form(default=[]),
643651
new_after_n_chars: List[str] = Form(default=[]),
652+
max_characters: List[str] = Form(default=[]),
644653
):
645654
if files:
646655
for file_index in range(len(files)):
@@ -697,6 +706,7 @@ def response_generator(is_multipart):
697706
m_multipage_sections=multipage_sections,
698707
m_combine_under_n_chars=combine_under_n_chars,
699708
m_new_after_n_chars=new_after_n_chars,
709+
m_max_characters=max_characters,
700710
)
701711

702712
if is_expected_response_type(media_type, type(response)):

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.55
2+
version: 0.0.56

test_general/api/test_app.py

Lines changed: 41 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,7 @@ def test_parallel_mode_passes_params(monkeypatch):
489489
"multipage_sections": False,
490490
"combine_under_n_chars": 501,
491491
"new_after_n_chars": 1501,
492+
"max_characters": 1502,
492493
},
493494
)
494495

@@ -511,6 +512,7 @@ def test_parallel_mode_passes_params(monkeypatch):
511512
multipage_sections=False,
512513
combine_under_n_chars=501,
513514
new_after_n_chars=1501,
515+
max_characters=1502,
514516
)
515517

516518

@@ -649,56 +651,45 @@ def test_chunking_strategy_param():
649651
assert "CompositeElement" in [element.get("type") for element in response_with_chunking]
650652

651653

652-
# def test_chunking_strategy_additional_params():
653-
# client = TestClient(app)
654-
# test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf"
655-
# res = client.post(
656-
# MAIN_API_ROUTE,
657-
# files=[("files", (str(test_file), open(test_file, "rb")))],
658-
# data={
659-
# "chunking_strategy": "by_title",
660-
# "multipage_sections": "False",
661-
# "combine_under_n_chars": "0",
662-
# },
663-
# )
664-
# response_from_multipage_false_combine_chars_0 = res.json()
665-
666-
# res = client.post(
667-
# MAIN_API_ROUTE,
668-
# files=[("files", (str(test_file), open(test_file, "rb")))],
669-
# data={
670-
# "chunking_strategy": "by_title",
671-
# "multipage_sections": "True",
672-
# "combine_under_n_chars": "0",
673-
# },
674-
# )
675-
# response_from_multipage_true_combine_chars_0 = res.json()
676-
677-
# res = client.post(
678-
# MAIN_API_ROUTE,
679-
# files=[("files", (str(test_file), open(test_file, "rb")))],
680-
# data={
681-
# "chunking_strategy": "by_title",
682-
# "multipage_sections": "True",
683-
# "combine_under_n_chars": "5000",
684-
# # Defining new_after_n_chars since it has to be greater than combine_under_n_chars
685-
# "new_after_n_chars": "50000",
686-
# },
687-
# )
688-
# response_multipage_true_combine_chars_5000 = res.json()
689-
690-
# assert (
691-
# response_multipage_true_combine_chars_5000
692-
# != response_from_multipage_true_combine_chars_0
693-
# )
694-
# assert (
695-
# response_from_multipage_true_combine_chars_0
696-
# != response_from_multipage_false_combine_chars_0
697-
# )
698-
# assert (
699-
# response_multipage_true_combine_chars_5000
700-
# != response_from_multipage_false_combine_chars_0
701-
# )
654+
# Defaults:
655+
# multipage_sections = True, combine_text_under_n_chars = None, new_after_n_chars = None,
656+
# max_characters = 500
657+
@pytest.mark.parametrize(
658+
("multipage_sections", "combine_under_n_chars", "new_after_n_chars", "max_characters"),
659+
[
660+
(False, None, None, 500), # test multipage_sections
661+
(True, 1000, None, 5000), # test combine_under_n_chars
662+
(True, None, 10, 500), # test new_after_n_chars
663+
(True, None, None, 100), # test max_characters
664+
],
665+
)
666+
def test_chunking_strategy_additional_params(
667+
multipage_sections, combine_under_n_chars, new_after_n_chars, max_characters
668+
):
669+
client = TestClient(app)
670+
test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf"
671+
672+
arg_resp = client.post(
673+
MAIN_API_ROUTE,
674+
files=[("files", (str(test_file), open(test_file, "rb")))],
675+
data={
676+
"chunking_strategy": "by_title",
677+
"multipage_sections": multipage_sections,
678+
"combine_under_n_chars": combine_under_n_chars,
679+
"new_after_n_chars": new_after_n_chars,
680+
"max_characters": max_characters,
681+
},
682+
)
683+
arg_resp_json = arg_resp.json()
684+
685+
default_resp = client.post(
686+
MAIN_API_ROUTE,
687+
files=[("files", (str(test_file), open(test_file, "rb")))],
688+
data={"chunking_strategy": "by_title"},
689+
)
690+
default_resp_json = default_resp.json()
691+
692+
assert arg_resp_json != default_resp_json
702693

703694

704695
def test_encrypted_pdf():

0 commit comments

Comments
 (0)