Skip to content

Commit 4c7e037

Browse files
committed
add test for chunking strategies
1 parent 2082d4f commit 4c7e037

File tree

2 files changed

+43
-1
lines changed

2 files changed

+43
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ __pycache__/
1212
.idea/
1313
openapi.json
1414
openapi_client.json
15+
.env

_test_unstructured_client/integration/test_decorators.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import httpx
77
import json
8+
import os
89
import pytest
910
import requests
1011
from deepdiff import DeepDiff
@@ -19,7 +20,7 @@
1920
from unstructured_client._hooks.custom import form_utils
2021
from unstructured_client._hooks.custom import split_pdf_hook
2122

22-
FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
23+
# API key used by the tests in this module: the value of the
# UNSTRUCTURED_API_KEY environment variable when it is set and non-empty,
# otherwise a placeholder string.
FAKE_KEY = os.getenv("UNSTRUCTURED_API_KEY") or "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
2324

2425

2526
@pytest.mark.parametrize("concurrency_level", [1, 2, 5])
@@ -472,3 +473,43 @@ async def mock_send(_, request: httpx.Request, **kwargs):
472473
assert mock_endpoint_called
473474

474475
assert res.status_code == 200
476+
477+
478+
@pytest.mark.parametrize(
    ("filename", "chunking_strategy", "expected_elements_num"),
    [
        # -- Paid strategy --
        # The sample paper has 16 pages (133 elements without chunking).
        ("_sample_docs/layout-parser-paper.pdf", "by_page", 16),
        ("_sample_docs/layout-parser-paper.pdf", shared.ChunkingStrategy.BY_PAGE, 16),
        # -- Open source strategy --
        # The correct element count for "by_title" is not known yet; None
        # disables the exact-count check for these cases.
        ("_sample_docs/layout-parser-paper.pdf", "by_title", None),
        ("_sample_docs/layout-parser-paper.pdf", shared.ChunkingStrategy.BY_TITLE, None),
    ],
)
def test_chunking(
    filename: str,
    chunking_strategy: str | shared.ChunkingStrategy,
    expected_elements_num: int | None,
):
    """Partition a sample document with a chunking strategy and check the result.

    Each strategy is passed both as a plain string and as the matching
    ``shared.ChunkingStrategy`` member.

    Args:
        filename: Path of the document to upload.
        chunking_strategy: Strategy given to ``PartitionParameters``.
        expected_elements_num: Exact number of elements expected in the
            response, or ``None`` when the count is not known; in that case
            the test only requires a non-empty result.
    """
    client = UnstructuredClient(api_key_auth=FAKE_KEY)

    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    parameters = shared.PartitionParameters(
        files=files,
        chunking_strategy=chunking_strategy,  # type: ignore
    )
    req = operations.PartitionRequest(partition_parameters=parameters)

    resp = client.general.partition(request=req)

    if expected_elements_num is None:
        # Unknown count: still require some output, otherwise the all(...)
        # check below would pass vacuously on an empty list.
        assert resp.elements
    else:
        assert len(resp.elements) == expected_elements_num

    # NOTE(review): assumes each element exposes a ``.type`` attribute; if the
    # client returns plain dicts this needs ``element["type"]`` - confirm.
    assert all(element.type == "CompositeElement" for element in resp.elements)
515+

0 commit comments

Comments
 (0)