Skip to content

Commit 29f0ded

Browse files
test: revive ingest unit tests (#688)
1 parent 508ce48 commit 29f0ded

File tree

3 files changed

+144
-135
lines changed

3 files changed

+144
-135
lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,11 @@ jobs:
167167
source .venv/bin/activate
168168
mkdir "$NLTK_DATA"
169169
make install-ci
170-
- name: Test
170+
- name: Test Ingest (unit)
171+
run: |
172+
source .venv/bin/activate
173+
PYTHONPATH=. pytest test_unstructured_ingest/unit
174+
- name: Test (end-to-end)
171175
env:
172176
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
173177
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

test_unstructured_ingest/test_interfaces.py

Lines changed: 0 additions & 134 deletions
This file was deleted.
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import os
2+
import pathlib
3+
from dataclasses import dataclass
4+
5+
import pytest
6+
7+
from unstructured.ingest.connector.local import LocalIngestDoc, SimpleLocalConfig
8+
from unstructured.ingest.interfaces import (
9+
BaseConnector,
10+
BaseConnectorConfig,
11+
BaseIngestDoc,
12+
StandardConnectorConfig,
13+
)
14+
from unstructured.partition.auto import partition
15+
from unstructured.staging.base import convert_to_dict
16+
17+
# Absolute directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# The example documents live two directory levels above this module.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "../..", "example-docs")

# Values handed to the connector configs built by the tests below.
TEST_DOWNLOAD_DIR = "/tmp"
TEST_OUTPUT_DIR = "/tmp"
TEST_ID = "test"
TEST_FILE_PATH = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
23+
24+
@dataclass
class TestConfig(BaseConnectorConfig):
    """Minimal connector config with two string fields, used by the tests in this module."""

    # The "Test" prefix matches pytest's default class-collection pattern, and the
    # dataclass-generated __init__ makes pytest emit a PytestCollectionWarning.
    # Opt out of collection explicitly. The attribute is left unannotated on purpose
    # so that @dataclass does not turn it into a field.
    __test__ = False

    id: str
    path: str
28+
29+
# Shared config instance reused by every test in this module.
TEST_CONFIG = TestConfig(id=TEST_ID, path=TEST_FILE_PATH)
30+
31+
@dataclass
class TestIngestDoc(BaseIngestDoc):
    """BaseIngestDoc subclass whose file-handling hooks are stubbed out for unit tests.

    The download, cleanup and write hooks do nothing, `has_output` always reports
    True and `filename` is a fixed string, so the tests can call `process_file()`
    with `partition` mocked.
    """

    # The "Test" prefix matches pytest's default class-collection pattern, and the
    # dataclass-generated __init__ makes pytest emit a PytestCollectionWarning.
    # Opt out of collection explicitly. The attribute is left unannotated on purpose
    # so that @dataclass does not turn it into a field.
    __test__ = False

    config: TestConfig

    @property
    def filename(self):
        """Return a fixed placeholder filename."""
        return "test"

    def cleanup_file(self):
        """No-op stub."""
        pass

    def get_file(self):
        """No-op stub."""
        pass

    def has_output(self):
        """Always report that output exists."""
        return True

    def write_result(self, result):
        """No-op stub: the result is discarded."""
        pass
50+
51+
@pytest.fixture(scope="module")
def partition_test_results():
    """Partition the example text file.

    Module-scoped so the result really is calculated only once and reused by every
    test; the default function scope would re-run `partition` for each test.
    """
    # NOTE: the returned elements are shared between tests, so tests must treat
    # them as read-only.
    return partition(filename=str(TEST_FILE_PATH))


@pytest.fixture(scope="module")
def partition_file_test_results(partition_test_results):
    """Dict form of the shared partition results, also calculated only once per module."""
    return convert_to_dict(partition_test_results)
60+
61+
def test_process_file_fields_include_default(mocker, partition_test_results):
    """Validate when metadata_include and metadata_exclude are not set, all fields:
    ("element_id", "text", "type", "metadata") are included"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    test_ingest_doc = TestIngestDoc(
        config=TEST_CONFIG,
        # Deliberately pass neither metadata_include nor metadata_exclude: this test
        # covers the default behavior, so both options must be left at their defaults.
        standard_config=StandardConnectorConfig(
            download_dir=TEST_DOWNLOAD_DIR,
            output_dir=TEST_OUTPUT_DIR,
        ),
    )
    isd_elems = test_ingest_doc.process_file()
    # Guard against a vacuous pass: the loop below checks nothing on an empty result.
    assert len(isd_elems)
    for elem in isd_elems:
        assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
80+
81+
82+
def test_process_file_metadata_includes_filename_and_page_number(mocker, partition_test_results):
    """Validate when metadata_include is set to "filename,page_number",
    only filename and page_number are included in metadata"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    standard_config = StandardConnectorConfig(
        download_dir=TEST_DOWNLOAD_DIR,
        output_dir=TEST_OUTPUT_DIR,
        metadata_include="filename,page_number",
    )
    ingest_doc = TestIngestDoc(config=TEST_CONFIG, standard_config=standard_config)

    processed = ingest_doc.process_file()

    # Guard against a vacuous pass on an empty result.
    assert len(processed)
    for element in processed:
        assert set(element["metadata"].keys()) == {"filename", "page_number"}
101+
102+
def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
    """Validate when metadata_exclude is set to "filename,page_number",
    neither filename nor page_number are included in metadata"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    standard_config = StandardConnectorConfig(
        download_dir=TEST_DOWNLOAD_DIR,
        output_dir=TEST_OUTPUT_DIR,
        metadata_exclude="filename,page_number",
    )
    ingest_doc = TestIngestDoc(config=TEST_CONFIG, standard_config=standard_config)

    processed = ingest_doc.process_file()

    # Guard against a vacuous pass on an empty result.
    assert len(processed)
    for element in processed:
        metadata = element["metadata"]
        assert "filename" not in metadata
        assert "page_number" not in metadata
122+
123+
def test_process_file_flatten_metadata(mocker, partition_test_results):
    """Validate when flatten_metadata is set together with metadata_include
    "filename,page_number", each element's keys are exactly
    ("element_id", "text", "type", "filename", "page_number")"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    test_ingest_doc = TestIngestDoc(
        config=TEST_CONFIG,
        standard_config=StandardConnectorConfig(
            download_dir=TEST_DOWNLOAD_DIR,
            output_dir=TEST_OUTPUT_DIR,
            metadata_include="filename,page_number",
            flatten_metadata=True,
        ),
    )
    isd_elems = test_ingest_doc.process_file()
    # Same guard as the sibling tests: without it an empty result would make the
    # loop below check nothing and the test would pass vacuously.
    assert len(isd_elems)
    for elem in isd_elems:
        assert {"element_id", "text", "type", "filename", "page_number"} == set(elem.keys())

0 commit comments

Comments
 (0)