Skip to content

Commit 601594d

Browse files
authored
fix(docx): fix short-row DOCX table (#2943)
**Summary** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. This capability has legitimate uses, but it is rarely used deliberately in practice. It can, however, arise unintentionally when cell borders are adjusted with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
1 parent eff84af commit 601594d

32 files changed

+157
-340
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
## 0.13.7-dev1
1+
## 0.13.7-dev2
22

33
### Enhancements
44

@@ -8,6 +8,8 @@
88

99
### Fixes
1010

11+
* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. This capability has legitimate uses, but it is rarely used deliberately in practice. It can, however, arise unintentionally when cell borders are adjusted with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
12+
1113
## 0.13.6
1214

1315
### Enhancements
16.6 KB
Binary file not shown.

test_unstructured/partition/docx/test_docx.py

Lines changed: 129 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@
1616
Address,
1717
CompositeElement,
1818
Element,
19-
ElementType,
2019
Footer,
2120
Header,
2221
ListItem,
@@ -132,6 +131,133 @@ def but_the_text_of_a_merged_cell_appears_only_once(self):
132131
table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
133132
assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"
134133

134+
def it_can_partition_tables_with_incomplete_rows(self):
135+
"""DOCX permits table rows to start late and end early.
136+
137+
It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that
138+
they allow rows to start late, like in column 3, and end early, like the last cell is in
139+
column 5 of a 7-column table.
140+
141+
A practical example might look like this:
142+
143+
+------+------+
144+
| East | West |
145+
+----------+------+------+
146+
| Started | 25 | 32 |
147+
+----------+------+------+
148+
| Finished | 17 | 21 |
149+
+----------+------+------+
150+
"""
151+
elements = iter(partition_docx(example_doc_path("tables-with-incomplete-rows.docx")))
152+
153+
e = next(elements)
154+
assert e.text.startswith("Example of DOCX table ")
155+
# --
156+
# ┌───┬───┐
157+
# │ a │ b │
158+
# ├───┼───┤
159+
# │ c │ d │
160+
# └───┴───┘
161+
e = next(elements)
162+
assert type(e).__name__ == "Table"
163+
assert e.text == "a b c d"
164+
assert e.metadata.text_as_html == (
165+
"<table>\n"
166+
"<thead>\n<tr><th>a </th><th>b </th></tr>\n</thead>\n"
167+
"<tbody>\n<tr><td>c </td><td>d </td></tr>\n</tbody>\n"
168+
"</table>"
169+
)
170+
# --
171+
# ┌───┐
172+
# │ a │
173+
# ├───┼───┐
174+
# │ b │ c │
175+
# └───┴───┘
176+
e = next(elements)
177+
assert type(e).__name__ == "Table"
178+
assert e.text == "a b c", f"actual {e.text=}"
179+
assert e.metadata.text_as_html == (
180+
"<table>\n"
181+
"<thead>\n<tr><th>a </th><th> </th></tr>\n</thead>\n"
182+
"<tbody>\n<tr><td>b </td><td>c </td></tr>\n</tbody>\n"
183+
"</table>"
184+
), f"actual {e.metadata.text_as_html=}"
185+
# --
186+
# ┌───────┐
187+
# │ a │
188+
# ├───┬───┼───┐
189+
# │ b │ c │ d │
190+
# └───┴───┴───┘
191+
e = next(elements)
192+
assert type(e).__name__ == "Table"
193+
assert e.text == "a b c d", f"actual {e.text=}"
194+
assert e.metadata.text_as_html == (
195+
"<table>\n"
196+
"<thead>\n<tr><th>a </th><th>a </th><th> </th></tr>\n</thead>\n"
197+
"<tbody>\n<tr><td>b </td><td>c </td><td>d </td></tr>\n</tbody>\n"
198+
"</table>"
199+
), f"actual {e.metadata.text_as_html=}"
200+
# --
201+
# ┌───┬───┐
202+
# │ │ b │
203+
# │ a ├───┼───┐
204+
# │ │ c │ d │
205+
# └───┴───┴───┘
206+
e = next(elements)
207+
assert type(e).__name__ == "Table"
208+
assert e.text == "a b c d", f"actual {e.text=}"
209+
assert e.metadata.text_as_html == (
210+
"<table>\n"
211+
"<thead>\n<tr><th>a </th><th>b </th><th> </th></tr>\n</thead>\n"
212+
"<tbody>\n<tr><td>a </td><td>c </td><td>d </td></tr>\n</tbody>\n"
213+
"</table>"
214+
), f"actual {e.metadata.text_as_html=}"
215+
# -- late-start, early-end, and >2 rows vertical span --
216+
# ┌───────┬───┬───┐
217+
# │ a │ b │ c │
218+
# └───┬───┴───┼───┘
219+
# │ d │
220+
# ┌───┤ ├───┐
221+
# │ e │ │ f │
222+
# └───┤ ├───┘
223+
# │ │
224+
# └───────┘
225+
e = next(elements)
226+
assert type(e).__name__ == "Table"
227+
assert e.text == "a b c d e f", f"actual {e.text=}"
228+
assert e.metadata.text_as_html == (
229+
"<table>\n"
230+
"<thead>\n"
231+
"<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n"
232+
"</thead>\n<tbody>\n"
233+
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
234+
"<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n"
235+
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
236+
"</tbody>\n"
237+
"</table>"
238+
), f"actual {e.metadata.text_as_html=}"
239+
# --
240+
# -- The table from the specimen file we received with the bug report. --
241+
e = next(elements)
242+
assert type(e).__name__ == "Table"
243+
assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
244+
assert e.metadata.text_as_html == (
245+
"<table>\n"
246+
"<thead>\n"
247+
"<tr><th>Data </th><th>Data </th><th> </th></tr>\n"
248+
"</thead>\n"
249+
"<tbody>\n"
250+
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
251+
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
252+
"<tr><td> </td><td>More </td><td> </td></tr>\n"
253+
"<tr><td>Dato </td><td> </td><td> </td></tr>\n"
254+
"<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n"
255+
"<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n"
256+
"<tr><td> </td><td>Format </td><td>Format</td></tr>\n"
257+
"</tbody>\n"
258+
"</table>"
259+
), f"actual {e.metadata.text_as_html=}"
260+
135261
# -- page-break behaviors --------------------------------------------------------------------
136262

137263
def it_places_page_breaks_precisely_where_they_occur(self):
@@ -299,11 +425,7 @@ def test_parition_docx_from_team_chat():
299425
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
300426
"saved-by Dennis Forsythe",
301427
]
302-
assert [e.category for e in elements] == [
303-
ElementType.UNCATEGORIZED_TEXT,
304-
ElementType.UNCATEGORIZED_TEXT,
305-
ElementType.TABLE,
306-
]
428+
assert [type(e) for e in elements] == [Text, Text, Table]
307429

308430

309431
@pytest.mark.parametrize("infer_table_structure", [True, False])
@@ -687,7 +809,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
687809
filename = "example-docs/handbook-1p.docx"
688810
partition_docx(
689811
filename=filename,
690-
languages="eng", # pyright: ignore[reportGeneralTypeIssues]
812+
languages="eng", # pyright: ignore[reportArgumentType]
691813
)
692814

693815

typings/docx/__init__.pyi

Lines changed: 0 additions & 3 deletions
This file was deleted.

typings/docx/api.pyi

Lines changed: 0 additions & 5 deletions
This file was deleted.

typings/docx/blkcntnr.pyi

Lines changed: 0 additions & 13 deletions
This file was deleted.

typings/docx/document.pyi

Lines changed: 0 additions & 28 deletions
This file was deleted.

typings/docx/drawing.pyi

Lines changed: 0 additions & 1 deletion
This file was deleted.

typings/docx/enum/section.pyi

Lines changed: 0 additions & 11 deletions
This file was deleted.

typings/docx/oxml/__init__.pyi

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)