Skip to content

Commit b3a2dd4

Browse files
fix: html incorrectly categorizing text (#3841)
Fixes #3666

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: scanny <[email protected]>
1 parent 9ece0b5 commit b3a2dd4

39 files changed

+187
-13557
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.12-dev4
1+
## 0.16.12-dev5
22

33
### Enhancements
44

@@ -11,6 +11,7 @@
1111
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
1212
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
1313
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
14+
- **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`.
1415

1516
## 0.16.11
1617

test_unstructured/metrics/test_element_type.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"fake-email.txt",
2020
{
2121
("NarrativeText", None): 1,
22-
("Title", 0): 1,
22+
("UncategorizedText", None): 1,
2323
("ListItem", 1): 2,
2424
},
2525
),
@@ -50,7 +50,7 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
5050
(
5151
"fake-email.txt",
5252
{
53-
("Title", 0): 1,
53+
("UncategorizedText", None): 1,
5454
("ListItem", 1): 2,
5555
("NarrativeText", None): 2,
5656
},

test_unstructured/partition/html/test_parser.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -384,29 +384,26 @@ def it_generates_the_document_elements_from_the_Flow_element(self):
384384
elements = div.iter_elements()
385385

386386
e = next(elements)
387-
assert e == Title("Text of div with hierarchical phrasing content before first block item")
387+
assert e == Text("Text of div with hierarchical phrasing content before first block item")
388388
assert e.metadata.to_dict() == {
389-
"category_depth": 0,
390389
"emphasized_text_contents": ["with", "hierarchical", "phrasing"],
391390
"emphasized_text_tags": ["b", "bi", "b"],
392391
}
393392
e = next(elements)
394393
assert e == NarrativeText("Click here to see the blurb for this block item.")
395394
assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
396395
e = next(elements)
397-
assert e == Title("tail of block item with hierarchical phrasing content")
396+
assert e == Text("tail of block item with hierarchical phrasing content")
398397
assert e.metadata.to_dict() == {
399-
"category_depth": 0,
400398
"emphasized_text_contents": ["with", "hierarchical", "phrasing"],
401399
"emphasized_text_tags": ["b", "bi", "b"],
402400
}
403401
e = next(elements)
404-
assert e == Title("second block item")
405-
assert e.metadata.to_dict() == {"category_depth": 0}
402+
assert e == Text("second block item")
403+
assert e.metadata.to_dict() == {}
406404
e = next(elements)
407-
assert e == Title("tail of block item with hierarchical phrasing content")
405+
assert e == Text("tail of block item with hierarchical phrasing content")
408406
assert e.metadata.to_dict() == {
409-
"category_depth": 0,
410407
"emphasized_text_contents": ["with", "hierarchical"],
411408
"emphasized_text_tags": ["b", "bi"],
412409
}
@@ -664,22 +661,22 @@ def it_generates_text_segments_for_its_text_and_children_and_tail(
664661
("html_text", "expected_value"),
665662
[
666663
# -- Phrasing with nested block but no text or tail produces only element for block --
667-
("<strong><p>aaa</p></strong>", [Title("aaa")]),
664+
("<strong><p>aaa</p></strong>", [Text("aaa")]),
668665
# -- Phrasing with text produces annotated text-segment for the text --
669666
(
670667
"<strong>aaa<p>bbb</p></strong>",
671668
[
672669
TextSegment(
673670
"aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
674671
),
675-
Title("bbb"),
672+
Text("bbb"),
676673
],
677674
),
678675
# -- Phrasing with tail produces annotated text-segment for the tail --
679676
(
680677
"<strong><p>aaa</p>bbb</strong>",
681678
[
682-
Title("aaa"),
679+
Text("aaa"),
683680
TextSegment(
684681
"bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
685682
),
@@ -692,7 +689,7 @@ def it_generates_text_segments_for_its_text_and_children_and_tail(
692689
TextSegment(
693690
"aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
694691
),
695-
Title("bbb"),
692+
Text("bbb"),
696693
TextSegment(
697694
"ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
698695
),
@@ -776,15 +773,15 @@ def it_generates_text_segments_for_its_children_and_their_tails(
776773
# -- a phrasing element with no block children produces no elements --
777774
("<dfn></dfn>", "", []),
778775
# -- a child block element produces an element --
779-
("<kbd><p>aaa</p></kbd>", "", [Title("aaa")]),
776+
("<kbd><p>aaa</p></kbd>", "", [Text("aaa")]),
780777
# -- a child block element with a tail also produces a text-segment for the tail --
781-
("<kbd><p>aaa</p>bbb</kbd>", "", [Title("aaa"), TextSegment("bbb", {})]),
778+
("<kbd><p>aaa</p>bbb</kbd>", "", [Text("aaa"), TextSegment("bbb", {})]),
782779
# -- and also text-segments for phrasing following the tail --
783780
(
784781
"<kbd><p>aaa</p>bbb<mark>ccc</mark>ddd</kbd>",
785782
"",
786783
[
787-
Title("aaa"),
784+
Text("aaa"),
788785
TextSegment("bbb", {}),
789786
TextSegment("ccc", {}),
790787
TextSegment("ddd", {}),
@@ -798,7 +795,7 @@ def it_generates_text_segments_for_its_children_and_their_tails(
798795
TextSegment(
799796
"aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
800797
),
801-
Title("bbb"),
798+
Text("bbb"),
802799
TextSegment(
803800
"ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
804801
),
@@ -872,7 +869,7 @@ def and_it_generates_elements_for_its_block_children(
872869
[
873870
TextSegment("aaa", {}),
874871
TextSegment("bbb", {}),
875-
Title("ccc"),
872+
Text("ccc"),
876873
TextSegment("ddd", {}),
877874
TextSegment("eee", {}),
878875
],
@@ -996,7 +993,7 @@ def it_generates_enclosed_block_items_as_separate_elements(self):
996993
"link_urls": ["http://eie.io"],
997994
},
998995
),
999-
Title("one with"),
996+
Text("one with"),
1000997
TextSegment(
1001998
" the Force.",
1002999
{

test_unstructured/partition/html/test_partition.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def test_partition_html_accepts_a_file_path(tmp_path: pathlib.Path):
7272
assert elements == [
7373
Title("A Great and Glorious Section"),
7474
NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
75-
Title("Another Magnificent paragraph"),
75+
Text("Another Magnificent paragraph"),
7676
NarrativeText("The prior element is a title based on its capitalization patterns!"),
7777
Table("I'm in a table"),
7878
Title("A New Beginning"),
@@ -201,7 +201,7 @@ def test_partition_html_processes_chinese_chracters():
201201

202202
def test_emoji_appears_with_emoji_utf8_code():
203203
assert partition_html(text='<html charset="utf-8"><p>Hello &#128512;</p></html>') == [
204-
Title("Hello 😀")
204+
Text("Hello 😀")
205205
]
206206

207207

@@ -575,10 +575,10 @@ def test_pre_tag_parsing_respects_order():
575575
"<div>The Big Blue Bear</div>\n"
576576
)
577577
) == [
578-
Title("The Big Brown Bear"),
578+
Text("The Big Brown Bear"),
579579
NarrativeText("The big brown bear is growling."),
580580
NarrativeText("The big brown bear is sleeping."),
581-
Title("The Big Blue Bear"),
581+
Text("The Big Blue Bear"),
582582
]
583583

584584

@@ -604,7 +604,7 @@ def test_partition_html_br_tag_parsing():
604604

605605
assert elements == [
606606
Title("Header 1"),
607-
Title("Text"),
607+
Text("Text"),
608608
Title("Header 2"),
609609
Text(
610610
" Param1 = Y\nParam2 = 1\nParam3 = 2\nParam4 = A\n \nParam5 = A,B,C,D,E\n"
@@ -640,7 +640,7 @@ def test_partition_html_tag_tail_parsing():
640640

641641
elements = partition_html(text=html_text)
642642

643-
assert elements == [Title("Head"), Title("Nested"), Title("Tail")]
643+
assert elements == [Text("Head"), Text("Nested"), Text("Tail")]
644644

645645

646646
# -- parsing edge cases --------------------------------------------------------------------------
@@ -731,11 +731,11 @@ def test_containers_with_text_are_processed():
731731
assert elements == [
732732
Text("Hi All,"),
733733
NarrativeText("Get excited for our first annual family day!"),
734-
Title("Best."),
734+
Text("Best."),
735735
Text("--"),
736-
Title("Dino the Datasaur"),
737-
Title("Unstructured Technologies"),
738-
Title("Data Scientist"),
736+
Text("Dino the Datasaur"),
737+
Text("Unstructured Technologies"),
738+
Text("Data Scientist"),
739739
Address("Doylestown, PA 18901"),
740740
NarrativeText("See you there!"),
741741
]
@@ -786,7 +786,7 @@ def test_html_grabs_bulleted_text_in_paras():
786786

787787
def test_joins_tag_text_correctly():
788788
elements = partition_html(text="<p>Hello again peet mag<i>ic</i>al</p>")
789-
assert elements == [Title("Hello again peet magical")]
789+
assert elements == [Text("Hello again peet magical")]
790790

791791

792792
def test_sample_doc_with_emoji():
@@ -796,17 +796,17 @@ def test_sample_doc_with_emoji():
796796

797797
def test_only_text_and_no_elements_in_body():
798798
elements = partition_html(text="<body>Hello</body>")
799-
assert elements == [Title("Hello")]
799+
assert elements == [Text("Hello")]
800800

801801

802802
def test_text_before_elements_in_body():
803803
elements = partition_html(text="<body>Hello<p>World</p></body>")
804-
assert elements == [Title("Hello"), Title("World")]
804+
assert elements == [Text("Hello"), Text("World")]
805805

806806

807807
def test_line_break_in_container():
808808
elements = partition_html(text="<div>Hello<br/>World</div>")
809-
assert elements == [Title("Hello World")]
809+
assert elements == [Text("Hello World")]
810810

811811

812812
@pytest.mark.parametrize("tag", ["del", "form", "noscript"])
@@ -963,7 +963,7 @@ def test_partition_html_grabs_emphasized_texts():
963963
assert e.metadata.emphasized_text_contents is None
964964
assert e.metadata.emphasized_text_tags is None
965965
e = elements[4]
966-
assert e == Title("A lone span text!")
966+
assert e == Text("A lone span text!")
967967
assert e.metadata.emphasized_text_contents is None
968968
assert e.metadata.emphasized_text_tags is None
969969

@@ -1078,7 +1078,7 @@ def test_partition_html_grabs_links():
10781078
assert e.metadata.link_urls is None
10791079
assert e.metadata.link_texts is None
10801080
e = elements[4]
1081-
assert e == Title("A lone link!")
1081+
assert e == Text("A lone link!")
10821082
assert e.metadata.link_urls == ["/loner"]
10831083
assert e.metadata.link_texts == ["A lone link!"]
10841084

test_unstructured/partition/test_auto.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
175175

176176
EXPECTED_EMAIL_OUTPUT = [
177177
NarrativeText(text="This is a test email to use for unit tests."),
178-
Title(text="Important points:"),
178+
Text(text="Important points:"),
179179
ListItem(text="Roses are red"),
180180
ListItem(text="Violets are blue"),
181181
]
@@ -440,7 +440,7 @@ def test_partition_md_from_url_works_with_embedded_html():
440440
def test_auto_partition_msg_from_filename():
441441
assert partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES) == [
442442
NarrativeText(text="This is a test email to use for unit tests."),
443-
Title(text="Important points:"),
443+
Text(text="Important points:"),
444444
ListItem(text="Roses are red"),
445445
ListItem(text="Violets are blue"),
446446
]

test_unstructured/partition/test_email.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030

3131
EXPECTED_OUTPUT = [
3232
NarrativeText(text="This is a test email to use for unit tests."),
33-
Title(text="Important points:"),
33+
Text(text="Important points:"),
3434
ListItem(text="Roses are red"),
3535
ListItem(text="Violets are blue"),
3636
]
@@ -88,9 +88,9 @@ def test_extract_email_from_text_plain_matches_elements_extracted_from_text_html
8888
elements_from_text = partition_email(file_path, content_source="text/plain")
8989
elements_from_html = partition_email(file_path, content_source="text/html")
9090

91-
assert elements_from_text == EXPECTED_OUTPUT
91+
assert all(e.text == eo.text for e, eo in zip(elements_from_text, EXPECTED_OUTPUT))
9292
assert elements_from_html == EXPECTED_OUTPUT
93-
assert elements_from_html == elements_from_text
93+
assert all(eh.text == et.text for eh, et in zip(elements_from_html, elements_from_text))
9494

9595

9696
def test_partition_email_round_trips_via_json():
@@ -354,14 +354,14 @@ def test_partition_email_can_process_attachments():
354354
)
355355

356356
assert elements == [
357-
Title("Hello!"),
357+
Text("Hello!"),
358358
NarrativeText("Here's the attachments!"),
359359
NarrativeText("It includes:"),
360360
ListItem("Lots of whitespace"),
361361
ListItem("Little to no content"),
362362
ListItem("and is a quick read"),
363363
Text("Best,"),
364-
Title("Mallori"),
364+
Text("Mallori"),
365365
NarrativeText("Hey this is a fake attachment!"),
366366
]
367367
assert all(e.metadata.last_modified == "2022-12-23T18:08:48+00:00" for e in elements)

test_unstructured/partition/test_msg.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,13 @@
2323
ListItem,
2424
NarrativeText,
2525
Text,
26-
Title,
2726
)
2827
from unstructured.partition.common import UnsupportedFileFormatError
2928
from unstructured.partition.msg import MsgPartitionerOptions, partition_msg
3029

3130
EXPECTED_MSG_OUTPUT = [
3231
NarrativeText(text="This is a test email to use for unit tests."),
33-
Title(text="Important points:"),
32+
Text(text="Important points:"),
3433
ListItem(text="Roses are red"),
3534
ListItem(text="Violets are blue"),
3635
]
@@ -138,9 +137,9 @@ def test_partition_msg_can_process_attachments():
138137
assert [type(e).__name__ for e in elements][:10] == [
139138
"NarrativeText",
140139
"Text",
141-
"Title",
142-
"Title",
143-
"Title",
140+
"Text",
141+
"Text",
142+
"Text",
144143
"Image",
145144
"Title",
146145
"Text",
@@ -175,9 +174,9 @@ def test_partition_msg_silently_skips_attachments_it_cannot_partition(request: F
175174
# -- the email body is partitioned --
176175
NarrativeText("Here are those documents."),
177176
Text("--"),
178-
Title("Mallori Harrell"),
179-
Title("Unstructured Technologies"),
180-
Title("Data Scientist"),
177+
Text("Mallori Harrell"),
178+
Text("Unstructured Technologies"),
179+
Text("Data Scientist"),
181180
# -- no elements appear for the attachment(s) --
182181
]
183182

0 commit comments

Comments
 (0)