Skip to content

Commit b6bfbf9

Browse files
authored
fix: track filename in metadata for docx tables (#597)
* fix: track filename in metadata for docx tables
* bump version
* remove accidental commit
1 parent 301cef2 commit b6bfbf9

File tree

4 files changed

+27
-10
lines changed

4 files changed

+27
-10
lines changed

CHANGELOG.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
1-
## 0.6.7-dev5
2-
3-
### Enhancements
4-
5-
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
6-
7-
## 0.6.7-dev4
1+
## 0.6.7-dev6
82

93
### Enhancements
104

115
* Add `file_directory` to metadata
6+
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
127
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify
138
partition strategy in CLI. For example, `--partition-strategy fast`.
149
* Added metadata for filetype.
@@ -26,6 +21,7 @@
2621
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
2722
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
2823
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.
24+
* Fix to ensure `filename` is tracked in metadata for `docx` tables.
2925

3026
## 0.6.6
3127

test_unstructured/partition/test_docx.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Address,
88
ListItem,
99
NarrativeText,
10+
Table,
1011
Text,
1112
Title,
1213
)
@@ -97,3 +98,21 @@ def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
9798
def test_partition_docx_raises_with_neither():
9899
with pytest.raises(ValueError):
99100
partition_docx()
101+
102+
103+
def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"):
104+
elements = partition_docx(filename=filename)
105+
106+
assert isinstance(elements[0], Table)
107+
assert (
108+
elements[0].metadata.text_as_html
109+
== """<table>
110+
<thead>
111+
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
112+
</thead>
113+
<tbody>
114+
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
115+
</tbody>
116+
</table>"""
117+
)
118+
assert elements[0].metadata.filename == "fake_table.docx"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.7-dev5" # pragma: no cover
1+
__version__ = "0.6.7-dev6" # pragma: no cover

unstructured/partition/docx.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,10 @@ def partition_docx(
137137
text_table = _convert_table_to_text(table, as_html=False)
138138
element = Table(text_table)
139139
if element is not None:
140-
element.metadata = ElementMetadata(filename=metadata_filename)
141-
element.metadata = ElementMetadata(text_as_html=html_table)
140+
element.metadata = ElementMetadata(
141+
text_as_html=html_table,
142+
filename=metadata_filename,
143+
)
142144
elements.append(element)
143145
table_index += 1
144146
elif element_item.tag.endswith("p"):

0 commit comments

Comments
 (0)