|
16 | 16 | LayoutElements, |
17 | 17 | ) |
18 | 18 |
|
| 19 | +from test_unstructured.unit_utils import example_doc_path |
19 | 20 | from unstructured.documents.elements import ElementType |
20 | 21 | from unstructured.partition.pdf_image import ocr |
21 | | -from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes |
| 22 | +from unstructured.partition.pdf_image.pdf_image_utils import ( |
| 23 | + convert_pdf_to_images, |
| 24 | + pad_element_bboxes, |
| 25 | +) |
22 | 26 | from unstructured.partition.utils.config import env_config |
23 | 27 | from unstructured.partition.utils.constants import ( |
24 | 28 | Source, |
@@ -436,6 +440,28 @@ def mock_ocr_layout(): |
436 | 440 | ) |
437 | 441 |
|
438 | 442 |
|
def test_supplement_element_with_table_extraction():
    """Exercise table extraction end to end on a sample PDF page.

    A one-element layout whose only element is classed as "Table" is passed,
    together with the first image yielded for ``pdf/single_table.pdf``, to
    ``ocr.supplement_element_with_table_extraction``. The test passes when the
    first ``text_as_html`` entry of the returned layout starts with ``<table>``.
    """
    from unstructured_inference.models import tables

    # Load the tables agent up front; `tables.tables_agent` is handed to the call below.
    tables.load_agent()

    pdf_path = example_doc_path("pdf/single_table.pdf")
    page_images = convert_pdf_to_images(pdf_path)
    first_page = next(page_images)  # only the first yielded image is used

    # Single bounding box labelled "Table"; presumably it frames the table on
    # that page -- TODO confirm against the fixture.
    table_bbox = [215.00109863, 731.89996338, 1470.07739258, 972.83129883]
    layout = LayoutElements(
        element_coords=np.array([table_bbox]),
        texts=np.array(["foo"]),
        sources=np.array(["yolox_sg"]),
        element_class_ids=np.array([0]),
        element_class_id_map={0: "Table"},
    )

    agent = ocr.OCRAgent.get_agent(language="eng")
    result = ocr.supplement_element_with_table_extraction(
        elements=layout,
        image=first_page,
        tables_agent=tables.tables_agent,
        ocr_agent=agent,
    )

    first_html = result.text_as_html[0]
    assert first_html.startswith("<table>")
| 463 | + |
| 464 | + |
439 | 465 | def test_get_table_tokens(mock_ocr_layout): |
440 | 466 | with patch.object(OCRAgentTesseract, "get_layout_from_image", return_value=mock_ocr_layout): |
441 | 467 | ocr_agent = OCRAgent.get_agent(language="eng") |
|
0 commit comments