|
22 | 22 | from pptx.text.text import _Paragraph # pyright: ignore [reportPrivateUsage] |
23 | 23 |
|
24 | 24 | from unstructured.chunking import add_chunking_strategy |
| 25 | +from unstructured.common.html_table import HtmlTable, htmlify_matrix_of_cell_texts |
25 | 26 | from unstructured.documents.elements import ( |
26 | 27 | Element, |
27 | 28 | ElementMetadata, |
|
34 | 35 | Title, |
35 | 36 | ) |
36 | 37 | from unstructured.file_utils.model import FileType |
37 | | -from unstructured.partition.common.common import convert_ms_office_table_to_text |
38 | 38 | from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date |
39 | 39 | from unstructured.partition.text_type import ( |
40 | 40 | is_email_address, |
@@ -213,38 +213,6 @@ def _iter_picture_elements(self, picture: Picture) -> Iterator[Element]: |
213 | 213 | PicturePartitionerCls = self._opts.picture_partitioner |
214 | 214 | yield from PicturePartitionerCls.iter_elements(picture, self._opts) |
215 | 215 |
|
216 | | - def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]: |
217 | | - """Generate Title element for each paragraph in title `shape`. |
218 | | -
|
219 | | - Text is most likely a title, but in the rare case that the title shape was used |
220 | | - for the slide body text, also check for bulleted paragraphs.""" |
221 | | - if self._shape_is_off_slide(shape): |
222 | | - return |
223 | | - |
224 | | - depth = 0 |
225 | | - for paragraph in shape.text_frame.paragraphs: |
226 | | - text = paragraph.text |
227 | | - if text.strip() == "": |
228 | | - continue |
229 | | - |
230 | | - if self._is_bulleted_paragraph(paragraph): |
231 | | - bullet_depth = paragraph.level or 0 |
232 | | - yield ListItem( |
233 | | - text=text, |
234 | | - metadata=self._opts.text_metadata(category_depth=bullet_depth), |
235 | | - detection_origin=DETECTION_ORIGIN, |
236 | | - ) |
237 | | - elif is_email_address(text): |
238 | | - yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN) |
239 | | - else: |
240 | | - # increment the category depth by the paragraph increment in the shape |
241 | | - yield Title( |
242 | | - text=text, |
243 | | - metadata=self._opts.text_metadata(category_depth=depth), |
244 | | - detection_origin=DETECTION_ORIGIN, |
245 | | - ) |
246 | | - depth += 1 # Cannot enumerate because we want to skip empty paragraphs |
247 | | - |
248 | 216 | def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]: |
249 | 217 | """Generate Text or subtype element for each paragraph in `shape`.""" |
250 | 218 | if self._shape_is_off_slide(shape): |
@@ -280,18 +248,55 @@ def _iter_table_element(self, graphfrm: GraphicFrame) -> Iterator[Table]: |
280 | 248 |
|
281 | 249 | An empty table does not produce an element. |
282 | 250 | """ |
283 | | - text_table = convert_ms_office_table_to_text(graphfrm.table, as_html=False).strip() |
284 | | - if not text_table: |
| 251 | + if not (rows := list(graphfrm.table.rows)): |
| 252 | + return |
| 253 | + |
| 254 | + html_text = htmlify_matrix_of_cell_texts( |
| 255 | + [[cell.text for cell in row.cells] for row in rows] |
| 256 | + ) |
| 257 | + html_table = HtmlTable.from_html_text(html_text) |
| 258 | + |
| 259 | + if not html_table.text: |
285 | 260 | return |
286 | | - html_table = None |
287 | | - if self._opts.infer_table_structure: |
288 | | - html_table = convert_ms_office_table_to_text(graphfrm.table, as_html=True) |
289 | | - yield Table( |
290 | | - text=text_table, |
291 | | - metadata=self._opts.table_metadata(html_table), |
292 | | - detection_origin=DETECTION_ORIGIN, |
| 261 | + |
| 262 | + metadata = self._opts.table_metadata( |
| 263 | + html_table.html if self._opts.infer_table_structure else None |
293 | 264 | ) |
294 | 265 |
|
| 266 | + yield Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN) |
| 267 | + |
| 268 | + def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]: |
| 269 | + """Generate Title element for each paragraph in title `shape`. |
| 270 | +
|
| 271 | + Text is most likely a title, but in the rare case that the title shape was used |
| 272 | + for the slide body text, also check for bulleted paragraphs.""" |
| 273 | + if self._shape_is_off_slide(shape): |
| 274 | + return |
| 275 | + |
| 276 | + depth = 0 |
| 277 | + for paragraph in shape.text_frame.paragraphs: |
| 278 | + text = paragraph.text |
| 279 | + if text.strip() == "": |
| 280 | + continue |
| 281 | + |
| 282 | + if self._is_bulleted_paragraph(paragraph): |
| 283 | + bullet_depth = paragraph.level or 0 |
| 284 | + yield ListItem( |
| 285 | + text=text, |
| 286 | + metadata=self._opts.text_metadata(category_depth=bullet_depth), |
| 287 | + detection_origin=DETECTION_ORIGIN, |
| 288 | + ) |
| 289 | + elif is_email_address(text): |
| 290 | + yield EmailAddress(text=text, detection_origin=DETECTION_ORIGIN) |
| 291 | + else: |
| 292 | + # increment the category depth by the paragraph increment in the shape |
| 293 | + yield Title( |
| 294 | + text=text, |
| 295 | + metadata=self._opts.text_metadata(category_depth=depth), |
| 296 | + detection_origin=DETECTION_ORIGIN, |
| 297 | + ) |
| 298 | + depth += 1 # Cannot enumerate because we want to skip empty paragraphs |
| 299 | + |
295 | 300 | def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]: |
296 | 301 | """Orders the shapes on `slide` from top to bottom and left to right. |
297 | 302 |
|
|
0 commit comments