|
16 | 16 | Address, |
17 | 17 | CompositeElement, |
18 | 18 | Element, |
19 | | - ElementType, |
20 | 19 | Footer, |
21 | 20 | Header, |
22 | 21 | ListItem, |
@@ -132,6 +131,133 @@ def but_the_text_of_a_merged_cell_appears_only_once(self): |
132 | 131 | table = docx.Document(example_doc_path("docx-tables.docx")).tables[2] |
133 | 132 | assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e" |
134 | 133 |
|
| 134 | + def it_can_partition_tables_with_incomplete_rows(self): |
| 135 | + """DOCX permits table rows to start late and end early. |
| 136 | +
|
| 137 | + It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that |
| 138 | + they allow a row to start late, say in column 3, and end early, say with its last cell in |
| 139 | + column 5 of a 7-column table. |
| 140 | +
|
| 141 | + A practical example might look like this: |
| 142 | +
|
| 143 | +            +------+------+ |
| 144 | +            | East | West | |
| 145 | + +----------+------+------+ |
| 146 | + | Started  |  25  |  32  | |
| 147 | + +----------+------+------+ |
| 148 | + | Finished |  17  |  21  | |
| 149 | + +----------+------+------+ |
| 150 | + """ |
| 151 | + elements = iter(partition_docx(example_doc_path("tables-with-incomplete-rows.docx"))) |
| 152 | + |
| 153 | + e = next(elements) |
| 154 | + assert e.text.startswith("Example of DOCX table ") |
| 155 | + # -- |
| 156 | + # ┌───┬───┐ |
| 157 | + # │ a │ b │ |
| 158 | + # ├───┼───┤ |
| 159 | + # │ c │ d │ |
| 160 | + # └───┴───┘ |
| 161 | + e = next(elements) |
| 162 | + assert type(e).__name__ == "Table" |
| 163 | + assert e.text == "a b c d" |
| 164 | + assert e.metadata.text_as_html == ( |
| 165 | + "<table>\n" |
| 166 | + "<thead>\n<tr><th>a </th><th>b </th></tr>\n</thead>\n" |
| 167 | + "<tbody>\n<tr><td>c </td><td>d </td></tr>\n</tbody>\n" |
| 168 | + "</table>" |
| 169 | + ) |
| 170 | + # -- |
| 171 | + # ┌───┐ |
| 172 | + # │ a │ |
| 173 | + # ├───┼───┐ |
| 174 | + # │ b │ c │ |
| 175 | + # └───┴───┘ |
| 176 | + e = next(elements) |
| 177 | + assert type(e).__name__ == "Table" |
| 178 | + assert e.text == "a b c", f"actual {e.text=}" |
| 179 | + assert e.metadata.text_as_html == ( |
| 180 | + "<table>\n" |
| 181 | + "<thead>\n<tr><th>a </th><th> </th></tr>\n</thead>\n" |
| 182 | + "<tbody>\n<tr><td>b </td><td>c </td></tr>\n</tbody>\n" |
| 183 | + "</table>" |
| 184 | + ), f"actual {e.metadata.text_as_html=}" |
| 185 | + # -- |
| 186 | + # ┌───────┐ |
| 187 | + # │   a   │ |
| 188 | + # ├───┬───┼───┐ |
| 189 | + # │ b │ c │ d │ |
| 190 | + # └───┴───┴───┘ |
| 191 | + e = next(elements) |
| 192 | + assert type(e).__name__ == "Table" |
| 193 | + assert e.text == "a b c d", f"actual {e.text=}" |
| 194 | + assert e.metadata.text_as_html == ( |
| 195 | + "<table>\n" |
| 196 | + "<thead>\n<tr><th>a </th><th>a </th><th> </th></tr>\n</thead>\n" |
| 197 | + "<tbody>\n<tr><td>b </td><td>c </td><td>d </td></tr>\n</tbody>\n" |
| 198 | + "</table>" |
| 199 | + ), f"actual {e.metadata.text_as_html=}" |
| 200 | + # -- |
| 201 | + # ┌───┬───┐ |
| 202 | + # │   │ b │ |
| 203 | + # │ a ├───┼───┐ |
| 204 | + # │   │ c │ d │ |
| 205 | + # └───┴───┴───┘ |
| 206 | + e = next(elements) |
| 207 | + assert type(e).__name__ == "Table" |
| 208 | + assert e.text == "a b c d", f"actual {e.text=}" |
| 209 | + assert e.metadata.text_as_html == ( |
| 210 | + "<table>\n" |
| 211 | + "<thead>\n<tr><th>a </th><th>b </th><th> </th></tr>\n</thead>\n" |
| 212 | + "<tbody>\n<tr><td>a </td><td>c </td><td>d </td></tr>\n</tbody>\n" |
| 213 | + "</table>" |
| 214 | + ), f"actual {e.metadata.text_as_html=}" |
| 215 | + # -- late-start, early-end, and >2 rows vertical span -- |
| 216 | + # ┌───────┬───┬───┐ |
| 217 | + # │   a   │ b │ c │ |
| 218 | + # └───┬───┴───┼───┘ |
| 219 | + #     │   d   │ |
| 220 | + # ┌───┤       ├───┐ |
| 221 | + # │ e │       │ f │ |
| 222 | + # └───┤       ├───┘ |
| 223 | + #     │       │ |
| 224 | + #     └───────┘ |
| 225 | + e = next(elements) |
| 226 | + assert type(e).__name__ == "Table" |
| 227 | + assert e.text == "a b c d e f", f"actual {e.text=}" |
| 228 | + assert e.metadata.text_as_html == ( |
| 229 | + "<table>\n" |
| 230 | + "<thead>\n" |
| 231 | + "<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n" |
| 232 | + "</thead>\n<tbody>\n" |
| 233 | + "<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n" |
| 234 | + "<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n" |
| 235 | + "<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n" |
| 236 | + "</tbody>\n" |
| 237 | + "</table>" |
| 238 | + ), f"actual {e.metadata.text_as_html=}" |
| 239 | + # -- |
| 240 | + # -- The table from the specimen file we received with the bug report. -- |
| 241 | + e = next(elements) |
| 242 | + assert type(e).__name__ == "Table" |
| 243 | + assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}" |
| 244 | + assert e.metadata.text_as_html == ( |
| 245 | + "<table>\n" |
| 246 | + "<thead>\n" |
| 247 | + "<tr><th>Data </th><th>Data </th><th> </th></tr>\n" |
| 248 | + "</thead>\n" |
| 249 | + "<tbody>\n" |
| 250 | + "<tr><td>Data </td><td>Data </td><td> </td></tr>\n" |
| 251 | + "<tr><td>Data </td><td>Data </td><td> </td></tr>\n" |
| 252 | + "<tr><td> </td><td>More </td><td> </td></tr>\n" |
| 253 | + "<tr><td>Dato </td><td> </td><td> </td></tr>\n" |
| 254 | + "<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n" |
| 255 | + "<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n" |
| 256 | + "<tr><td> </td><td>Format </td><td>Format</td></tr>\n" |
| 257 | + "</tbody>\n" |
| 258 | + "</table>" |
| 259 | + ), f"actual {e.metadata.text_as_html=}" |
| 260 | + |
135 | 261 | # -- page-break behaviors -------------------------------------------------------------------- |
136 | 262 |
|
137 | 263 | def it_places_page_breaks_precisely_where_they_occur(self): |
@@ -299,11 +425,7 @@ def test_parition_docx_from_team_chat(): |
299 | 425 | "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.", |
300 | 426 | "saved-by Dennis Forsythe", |
301 | 427 | ] |
302 | | - assert [e.category for e in elements] == [ |
303 | | - ElementType.UNCATEGORIZED_TEXT, |
304 | | - ElementType.UNCATEGORIZED_TEXT, |
305 | | - ElementType.TABLE, |
306 | | - ] |
| 428 | + assert [type(e) for e in elements] == [Text, Text, Table] |
307 | 429 |
|
308 | 430 |
|
309 | 431 | @pytest.mark.parametrize("infer_table_structure", [True, False]) |
@@ -687,7 +809,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages(): |
687 | 809 | filename = "example-docs/handbook-1p.docx" |
688 | 810 | partition_docx( |
689 | 811 | filename=filename, |
690 | | - languages="eng", # pyright: ignore[reportGeneralTypeIssues] |
| 812 | + languages="eng", # pyright: ignore[reportArgumentType] |
691 | 813 | ) |
692 | 814 |
|
693 | 815 |
|
|
0 commit comments