Skip to content

Commit 9ece0b5

Browse files
fix: improve false-positive Title elements on Chinese text (#3836)
**Summary**

Improve element-type mapping for Chinese text. Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.

Fixes #3084

---------

Co-authored-by: scanny <[email protected]>
Co-authored-by: ryannikolaidis <[email protected]>
1 parent 9a9bf4c commit 9ece0b5

File tree

15 files changed

+856
-861
lines changed

15 files changed

+856
-861
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.12-dev3
1+
## 0.16.12-dev4
22

33
### Enhancements
44

@@ -10,6 +10,7 @@
1010

1111
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
1212
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
13+
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
1314

1415
## 0.16.11
1516

test_unstructured/metrics/test_element_type.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
7474
"handbook-1p.docx",
7575
{
7676
("Header", None): 1,
77-
("Title", 0): 1,
78-
("Title", 1): 1,
79-
("Title", 2): 1,
77+
("UncategorizedText", 0): 6,
8078
("ListItem", 3): 3,
81-
("NarrativeText", 4): 7,
79+
("NarrativeText", 0): 7,
8280
("Footer", None): 1,
8381
},
84-
(0.43, 0.07, 0.65),
82+
(0.78, 0.72, 0.81),
8583
),
8684
(
8785
"handbook-1p.docx",
8886
{
8987
("Header", None): 1,
90-
("Title", 0): 6,
88+
("UncategorizedText", 0): 6,
9189
("NarrativeText", 0): 7,
9290
("PageBreak", None): 1,
9391
("Footer", None): 1,

test_unstructured/partition/test_auto.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1286,7 +1286,7 @@ def expected_docx_elements():
12861286
Title("These are a few of my favorite things:"),
12871287
ListItem("Parrots"),
12881288
ListItem("Hockey"),
1289-
Title("Analysis"),
1289+
Text("Analysis"),
12901290
NarrativeText("This is my first thought. This is my second thought."),
12911291
NarrativeText("This is my third thought."),
12921292
Text("2023"),

test_unstructured/partition/test_doc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
275275
Title("These are a few of my favorite things:"),
276276
ListItem("Parrots"),
277277
ListItem("Hockey"),
278-
Title("Analysis"),
278+
Text("Analysis"),
279279
NarrativeText("This is my first thought. This is my second thought."),
280280
NarrativeText("This is my third thought."),
281281
Text("2023"),

test_unstructured/partition/test_docx.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
627627
Title("These are a few of my favorite things:"),
628628
ListItem("Parrots"),
629629
ListItem("Hockey"),
630-
Title("Analysis"),
630+
Text("Analysis"),
631631
NarrativeText("This is my first thought. This is my second thought."),
632632
NarrativeText("This is my third thought."),
633633
Text("2023"),
@@ -1210,7 +1210,7 @@ def str_repr(e: Element) -> str:
12101210
opts_args["file_path"] = example_doc_path("page-breaks.docx")
12111211
opts = DocxPartitionerOptions(**opts_args)
12121212
expected = [
1213-
# NOTE(scanny) - -- page 1 --
1213+
# -- page 1 --
12141214
NarrativeText(
12151215
"First page, tab here:\t"
12161216
"followed by line-break here:\n"
@@ -1220,28 +1220,28 @@ def str_repr(e: Element) -> str:
12201220
"and hard page-break here>>"
12211221
),
12221222
PageBreak(""),
1223-
# NOTE(scanny) - -- page 2 --
1223+
# -- page 2 --
12241224
NarrativeText(
12251225
"<<Text on second page. The font is big so it breaks onto third page--"
12261226
"------------------here-->> <<but break falls inside link so text stays"
12271227
" together."
12281228
),
12291229
PageBreak(""),
1230-
# NOTE(scanny) - -- page 3 --
1230+
# -- page 3 --
12311231
NarrativeText("Continuous section break here>>"),
12321232
NarrativeText("<<followed by text on same page"),
12331233
NarrativeText("Odd-page section break here>>"),
12341234
PageBreak(""),
1235-
# NOTE(scanny) - -- page 4 --
1235+
# -- page 4 --
12361236
PageBreak(""),
1237-
# NOTE(scanny) - -- page 5 --
1237+
# -- page 5 --
12381238
NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
12391239
NarrativeText(
12401240
'Then text gets big again so a "natural" rendered page break happens again here>> '
12411241
),
12421242
PageBreak(""),
1243-
# NOTE(scanny) - -- page 6 --
1244-
Title("<<and then more text proceeds."),
1243+
# -- page 6 --
1244+
Text("<<and then more text proceeds."),
12451245
]
12461246

12471247
elements = _DocxPartitioner.iter_document_elements(opts)

test_unstructured/partition/test_odt.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
Table,
2424
TableChunk,
2525
Text,
26-
Title,
2726
)
2827
from unstructured.partition.docx import partition_docx
2928
from unstructured.partition.odt import partition_odt
@@ -44,7 +43,7 @@ def test_partition_odt_from_filename():
4443
elements = partition_odt(example_doc_path("fake.odt"))
4544

4645
assert elements == [
47-
Title("Lorem ipsum dolor sit amet."),
46+
Text("Lorem ipsum dolor sit amet."),
4847
Table(
4948
"Header row Mon Wed Fri"
5049
" Color Blue Red Green"
@@ -63,7 +62,7 @@ def test_partition_odt_from_file():
6362
elements = partition_odt(file=f)
6463

6564
assert elements == [
66-
Title("Lorem ipsum dolor sit amet."),
65+
Text("Lorem ipsum dolor sit amet."),
6766
Table(
6867
"Header row Mon Wed Fri"
6968
" Color Blue Red Green"

test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
}
2424
},
2525
{
26-
"type": "Title",
26+
"type": "UncategorizedText",
2727
"element_id": "5209312022a75a31d95385fdccff68fa",
2828
"text": "CHAPTER 1",
2929
"metadata": {
@@ -51,7 +51,7 @@
5151
}
5252
},
5353
{
54-
"type": "Title",
54+
"type": "UncategorizedText",
5555
"element_id": "22a23e29022f32945965002cd734a8f0",
5656
"text": "INTRODUCTION",
5757
"metadata": {
@@ -79,7 +79,7 @@
7979
}
8080
},
8181
{
82-
"type": "Title",
82+
"type": "UncategorizedText",
8383
"element_id": "4c175cf543957acc4420221de28d3fca",
8484
"text": "CHAPTER 1 \u2013 INTRODUCTION",
8585
"metadata": {
@@ -101,7 +101,7 @@
101101
}
102102
},
103103
{
104-
"type": "Title",
104+
"type": "UncategorizedText",
105105
"element_id": "77022a5264f552b223538977cd40f640",
106106
"text": "A.\tPURPOSE",
107107
"metadata": {
@@ -189,7 +189,7 @@
189189
}
190190
},
191191
{
192-
"type": "Title",
192+
"type": "UncategorizedText",
193193
"element_id": "e341ffc123dd2827638aba18149c4175",
194194
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
195195
"metadata": {
@@ -255,7 +255,7 @@
255255
}
256256
},
257257
{
258-
"type": "Title",
258+
"type": "UncategorizedText",
259259
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
260260
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
261261
"metadata": {

test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
}
2424
},
2525
{
26-
"type": "Title",
26+
"type": "UncategorizedText",
2727
"element_id": "5209312022a75a31d95385fdccff68fa",
2828
"text": "CHAPTER 1",
2929
"metadata": {
@@ -51,7 +51,7 @@
5151
}
5252
},
5353
{
54-
"type": "Title",
54+
"type": "UncategorizedText",
5555
"element_id": "22a23e29022f32945965002cd734a8f0",
5656
"text": "INTRODUCTION",
5757
"metadata": {
@@ -79,7 +79,7 @@
7979
}
8080
},
8181
{
82-
"type": "Title",
82+
"type": "UncategorizedText",
8383
"element_id": "4c175cf543957acc4420221de28d3fca",
8484
"text": "CHAPTER 1 \u2013 INTRODUCTION",
8585
"metadata": {
@@ -101,7 +101,7 @@
101101
}
102102
},
103103
{
104-
"type": "Title",
104+
"type": "UncategorizedText",
105105
"element_id": "77022a5264f552b223538977cd40f640",
106106
"text": "A.\tPURPOSE",
107107
"metadata": {
@@ -189,7 +189,7 @@
189189
}
190190
},
191191
{
192-
"type": "Title",
192+
"type": "UncategorizedText",
193193
"element_id": "e341ffc123dd2827638aba18149c4175",
194194
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
195195
"metadata": {
@@ -255,7 +255,7 @@
255255
}
256256
},
257257
{
258-
"type": "Title",
258+
"type": "UncategorizedText",
259259
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
260260
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
261261
"metadata": {

test_unstructured_ingest/expected-structured-output/google-drive/fake.docx.json

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"type": "Title",
3+
"type": "UncategorizedText",
44
"element_id": "56d531394823d81787d77a04462ed096",
55
"text": "Lorem ipsum dolor sit amet.",
66
"metadata": {
@@ -17,6 +17,13 @@
1717
"date_created": "1686809759.687",
1818
"date_modified": "1686809743.0",
1919
"permissions_data": [
20+
{
21+
"id": "anyoneWithLink",
22+
"type": "anyone",
23+
"kind": "drive#permission",
24+
"role": "reader",
25+
"allowFileDiscovery": false
26+
},
2027
{
2128
"id": "18298851591250030956",
2229
"displayName": "[email protected]",
@@ -29,31 +36,24 @@
2936
"pendingOwner": false
3037
},
3138
{
32-
"id": "09147371668407854156",
33-
"displayName": "roman",
39+
"id": "04774006893477068632",
40+
"displayName": "ryan",
3441
"type": "user",
3542
"kind": "drive#permission",
36-
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
37-
"emailAddress": "roman@unstructured.io",
38-
"role": "writer",
43+
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
44+
"emailAddress": "ryan@unstructured.io",
45+
"role": "owner",
3946
"deleted": false,
4047
"pendingOwner": false
4148
},
4249
{
43-
"id": "anyoneWithLink",
44-
"type": "anyone",
45-
"kind": "drive#permission",
46-
"role": "reader",
47-
"allowFileDiscovery": false
48-
},
49-
{
50-
"id": "04774006893477068632",
51-
"displayName": "ryan",
50+
"id": "09147371668407854156",
51+
"displayName": "roman",
5252
"type": "user",
5353
"kind": "drive#permission",
54-
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
55-
"emailAddress": "ryan@unstructured.io",
56-
"role": "owner",
54+
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
55+
"emailAddress": "roman@unstructured.io",
56+
"role": "writer",
5757
"deleted": false,
5858
"pendingOwner": false
5959
}

test_unstructured_ingest/expected-structured-output/google-drive/nested/fake.docx.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"type": "Title",
3+
"type": "UncategorizedText",
44
"element_id": "56d531394823d81787d77a04462ed096",
55
"text": "Lorem ipsum dolor sit amet.",
66
"metadata": {
@@ -17,6 +17,13 @@
1717
"date_created": "1718722775.76",
1818
"date_modified": "1718722788.018",
1919
"permissions_data": [
20+
{
21+
"id": "anyoneWithLink",
22+
"type": "anyone",
23+
"kind": "drive#permission",
24+
"role": "reader",
25+
"allowFileDiscovery": false
26+
},
2027
{
2128
"id": "18298851591250030956",
2229
"displayName": "[email protected]",
@@ -39,13 +46,6 @@
3946
"deleted": false,
4047
"pendingOwner": false
4148
},
42-
{
43-
"id": "anyoneWithLink",
44-
"type": "anyone",
45-
"kind": "drive#permission",
46-
"role": "reader",
47-
"allowFileDiscovery": false
48-
},
4949
{
5050
"id": "09147371668407854156",
5151
"displayName": "roman",

0 commit comments

Comments
 (0)