Skip to content

Commit dc6d7d7

Browse files
authored
feat: add metadata_filename parameter across all partition functions (#811)
* fix conflicts
* add tests and clean metadata_filename in partitions
* fix test_email and remove comments
* make tidy/check
* update changelog and version
* fix tests
* make tidy again
1 parent 8d2e7c0 commit dc6d7d7

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+628
-92
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
### Features
66

7+
* Add metadata_filename parameter across all partition functions
8+
79
### Fixes
810
* Fix KeyError when `isd_to_elements` doesn't find a type
911

test_unstructured/partition/test_csv.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,16 @@ def test_partition_csv_from_filename(filename="example-docs/stanley-cups.csv"):
1212
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
1313
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
1414
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
15+
assert elements[0].metadata.filename == "stanley-cups.csv"
16+
17+
18+
def test_partition_csv_from_filename_with_metadata_filename(
19+
filename="example-docs/stanley-cups.csv",
20+
):
21+
elements = partition_csv(filename=filename, metadata_filename="test")
22+
23+
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
24+
assert elements[0].metadata.filename == "test"
1525

1626

1727
def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
@@ -22,6 +32,15 @@ def test_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
2232
assert isinstance(elements[0], Table)
2333
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
2434
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
35+
assert elements[0].metadata.filename is None
36+
37+
38+
def test_partition_csv_from_file_with_metadata_filename(filename="example-docs/stanley-cups.csv"):
39+
with open(filename, "rb") as f:
40+
elements = partition_csv(file=f, metadata_filename="test")
41+
42+
assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
43+
assert elements[0].metadata.filename == "test"
2544

2645

2746
def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.csv"):
@@ -31,3 +50,4 @@ def test_partition_csv_can_exclude_metadata(filename="example-docs/stanley-cups.
3150
assert isinstance(elements[0], Table)
3251
assert elements[0].metadata.text_as_html is None
3352
assert elements[0].metadata.filetype is None
53+
assert elements[0].metadata.filename is None

test_unstructured/partition/test_doc.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,28 +56,40 @@ def expected_elements():
5656
]
5757

5858

59-
def test_partition_doc_with_filename(mock_document, expected_elements, tmpdir, capsys):
59+
def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, capsys):
6060
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
6161
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
6262
mock_document.save(docx_filename)
6363
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
64-
6564
elements = partition_doc(filename=doc_filename)
6665
assert elements == expected_elements
6766
assert elements[0].metadata.filename == "mock_document.doc"
6867
assert elements[0].metadata.file_directory == tmpdir.dirname
69-
7068
assert capsys.readouterr().out == ""
7169
assert capsys.readouterr().err == ""
7270

7371

74-
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
72+
def test_partition_doc_from_filename_with_metadata_filename(
73+
mock_document,
74+
expected_elements,
75+
tmpdir,
76+
):
7577
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
7678
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
7779
mock_document.save(docx_filename)
7880
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
7981

80-
partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
82+
elements = partition_doc(filename=doc_filename, metadata_filename="test")
83+
assert elements == expected_elements
84+
assert all(element.metadata.filename == "test" for element in elements)
85+
86+
87+
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
88+
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
89+
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
90+
mock_document.save(docx_filename)
91+
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
92+
assert partition_doc(filename=doc_filename) == partition_docx(filename=docx_filename)
8193

8294

8395
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
@@ -87,7 +99,7 @@ def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmp
8799
partition_doc(filename=doc_filename)
88100

89101

90-
def test_partition_doc_with_file(mock_document, expected_elements, tmpdir, capsys):
102+
def test_partition_doc_from_file(mock_document, expected_elements, tmpdir, capsys):
91103
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
92104
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
93105
mock_document.save(docx_filename)
@@ -96,9 +108,22 @@ def test_partition_doc_with_file(mock_document, expected_elements, tmpdir, capsy
96108
with open(doc_filename, "rb") as f:
97109
elements = partition_doc(file=f)
98110
assert elements == expected_elements
99-
100111
assert capsys.readouterr().out == ""
101112
assert capsys.readouterr().err == ""
113+
for element in elements:
114+
assert element.metadata.filename is None
115+
116+
117+
def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir):
118+
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
119+
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
120+
mock_document.save(docx_filename)
121+
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
122+
123+
with open(doc_filename, "rb") as f:
124+
elements = partition_doc(file=f, metadata_filename="test")
125+
for element in elements:
126+
assert element.metadata.filename == "test"
102127

103128

104129
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
@@ -116,7 +141,7 @@ def test_partition_doc_raises_with_neither():
116141
partition_doc()
117142

118143

119-
def test_partition_doc_with_file_exclude_metadata(mock_document, tmpdir):
144+
def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir):
120145
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
121146
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
122147
mock_document.save(docx_filename)
@@ -130,7 +155,7 @@ def test_partition_doc_with_file_exclude_metadata(mock_document, tmpdir):
130155
assert elements[0].metadata.filename is None
131156

132157

133-
def test_partition_doc_with_filename_exclude_metadata(mock_document, tmpdir):
158+
def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir):
134159
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
135160
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
136161
mock_document.save(docx_filename)

test_unstructured/partition/test_docx.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,22 @@ def expected_elements():
5757
]
5858

5959

60-
def test_partition_docx_with_filename(mock_document, expected_elements, tmpdir):
60+
def test_partition_docx_from_filename(mock_document, expected_elements, tmpdir):
6161
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
6262
mock_document.save(filename)
6363

6464
elements = partition_docx(filename=filename)
6565
assert elements == expected_elements
6666
assert elements[0].metadata.page_number is None
67+
for element in elements:
68+
assert element.metadata.filename == "mock_document.docx"
69+
70+
71+
def test_partition_docx_from_filename_with_metadata_filename(mock_document, tmpdir):
72+
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
73+
mock_document.save(filename)
74+
elements = partition_docx(filename=filename, metadata_filename="test")
75+
assert all(element.metadata.filename == "test" for element in elements)
6776

6877

6978
def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
@@ -79,15 +88,30 @@ def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpd
7988
spooled_temp_file.seek(0)
8089
elements = partition_docx(file=spooled_temp_file)
8190
assert elements == expected_elements
91+
for element in elements:
92+
assert element.metadata.filename is None
8293

8394

84-
def test_partition_docx_with_file(mock_document, expected_elements, tmpdir):
95+
def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
8596
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
8697
mock_document.save(filename)
8798

8899
with open(filename, "rb") as f:
89100
elements = partition_docx(file=f)
90101
assert elements == expected_elements
102+
for element in elements:
103+
assert element.metadata.filename is None
104+
105+
106+
def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
107+
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
108+
mock_document.save(filename)
109+
110+
with open(filename, "rb") as f:
111+
elements = partition_docx(file=f, metadata_filename="test")
112+
assert elements == expected_elements
113+
for element in elements:
114+
assert element.metadata.filename == "test"
91115

92116

93117
def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
@@ -125,30 +149,36 @@ def test_partition_docx_grabs_header_and_footer(filename="example-docs/handbook-
125149
elements = partition_docx(filename=filename)
126150
assert elements[0] == Header("US Trustee Handbook")
127151
assert elements[-1] == Footer("Copyright")
152+
for element in elements:
153+
assert element.metadata.filename == "handbook-1p.docx"
128154

129155

130156
def test_partition_docx_includes_pages_if_present(filename="example-docs/handbook-1p.docx"):
131157
elements = partition_docx(filename=filename, include_page_breaks=False)
132158
assert "PageBreak" not in [elem.category for elem in elements]
133159
assert elements[1].metadata.page_number == 1
134160
assert elements[-2].metadata.page_number == 2
161+
for element in elements:
162+
assert element.metadata.filename == "handbook-1p.docx"
135163

136164

137165
def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p.docx"):
138166
elements = partition_docx(filename=filename, include_page_breaks=True)
139167
assert "PageBreak" in [elem.category for elem in elements]
140168
assert elements[1].metadata.page_number == 1
141169
assert elements[-2].metadata.page_number == 2
170+
for element in elements:
171+
assert element.metadata.filename == "handbook-1p.docx"
142172

143173

144-
def test_partition_docx_with_filename_exclude_metadata(filename="example-docs/handbook-1p.docx"):
174+
def test_partition_docx_from_filename_exclude_metadata(filename="example-docs/handbook-1p.docx"):
145175
elements = partition_docx(filename=filename, include_metadata=False)
146176
assert elements[0].metadata.filetype is None
147177
assert elements[0].metadata.page_name is None
148178
assert elements[0].metadata.filename is None
149179

150180

151-
def test_partition_docx_with_file_exclude_metadata(mock_document, tmpdir):
181+
def test_partition_docx_from_file_exclude_metadata(mock_document, tmpdir):
152182
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
153183
mock_document.save(filename)
154184

0 commit comments

Comments
 (0)