|
| 1 | +from functools import partial |
| 2 | + |
| 3 | +import pytest |
| 4 | + |
| 5 | +from unstructured.chunking.basic import chunk_elements |
| 6 | +from unstructured.chunking.title import chunk_by_title |
| 7 | +from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title |
| 8 | + |
| 9 | + |
@pytest.fixture(
    params=[chunk_elements, partial(chunk_by_title, combine_text_under_n_chars=0)],
    # A `functools.partial` has no `__name__`, so without explicit ids pytest
    # labels the second parameter with an opaque auto-generated id.
    ids=["chunk_elements", "chunk_by_title"],
)
def chunking_fn(request):
    """Provide each chunking strategy under test, one per parametrized run.

    The parameters are `chunk_elements` and `chunk_by_title` pre-bound with
    `combine_text_under_n_chars=0`, so every test using this fixture runs once
    against each chunker.

    NOTE(review): `combine_text_under_n_chars=0` presumably disables merging of
    small sections so both chunkers produce comparable output -- confirm
    against the `chunk_by_title` documentation.
    """
    return request.param
| 13 | + |
| 14 | + |
def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn):
    """Elements merged into one chunk have their `text_as_html` joined by single spaces."""
    title_html = '<h1 class="Title" id="1">Header </h1>'
    date_html = '<time class="CalendarDate" id="2">Date: October 30, 2023 </time>'
    form_html = (
        '<form class="Form" id="3"> '
        '<label class="FormField" for="company-name" id="4">Form field name </label>'
        '<input class="FormFieldValue" id="5" value="Example value" />'
        "</form>"
    )
    html_fragments = [title_html, date_html, form_html]

    # Each spec is (element class, plain text); it is paired positionally with
    # the HTML fragment that describes the same element.
    element_specs = [
        (Title, "Header"),
        (Text, "Date: October 30, 2023"),
        (Text, "Form field name Example value"),
    ]
    elements = [
        element_cls(text=plain_text, metadata=ElementMetadata(text_as_html=html))
        for (element_cls, plain_text), html in zip(element_specs, html_fragments)
    ]

    chunks = chunking_fn(elements)

    assert len(chunks) == 1
    assert chunks[0].metadata.text_as_html == " ".join(html_fragments)
| 36 | + |
| 37 | + |
def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn):
    """A parent container's HTML is emitted with the first chunk only.

    Input elements built by this test:

        <div class="Section" id="1" />          (empty text)
        <p class="Paragraph" id="2">First </p>   (parent_id="1")
        <p class="Paragraph" id="3">Second </p>  (parent_id="1")

    With `max_characters=6` the two paragraphs cannot share a chunk, so:

        Chunk 1: Section, Paragraph "First"
        Chunk 2: Paragraph "Second"
    """
    section_html = '<div class="Section" id="1" />'
    first_html = '<p class="Paragraph" id="2">First </p>'
    second_html = '<p class="Paragraph" id="3">Second </p>'

    section = Text(text="", metadata=ElementMetadata(text_as_html=section_html))
    paragraphs = [
        NarrativeText(
            text=plain_text,
            metadata=ElementMetadata(text_as_html=html, parent_id="1"),
        )
        for plain_text, html in (("First", first_html), ("Second", second_html))
    ]

    chunks = chunking_fn([section, *paragraphs], max_characters=6)

    assert len(chunks) == 2
    first_chunk, second_chunk = chunks
    assert first_chunk.text == "First"
    assert second_chunk.text == "Second"

    # The text-less section contributes only its HTML, and only to the first chunk.
    assert first_chunk.metadata.text_as_html == f"{section_html} {first_html}"
    assert second_chunk.metadata.text_as_html == second_html
| 76 | + |
| 77 | + |
def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn):
    """Every chunk split from one oversized element keeps that element's full HTML.

    This mirrors how non-HTML metadata is treated when an element's text is split.
    """
    header_html = '<h1 class="Title" id="1">Header </h1>'
    title = Title(text="Header", metadata=ElementMetadata(text_as_html=header_html))

    # "Header" is six characters, so a three-character window yields two chunks.
    chunks = chunking_fn([title], max_characters=3)

    assert len(chunks) == 2
    assert [chunk.text for chunk in chunks] == ["Hea", "der"]
    for chunk in chunks:
        assert chunk.metadata.text_as_html == header_html
0 commit comments