Skip to content

Commit 542d442

Browse files
chore CORE-4775: remove html page number metadata field (#2942)
### Summary Remove the `page_number` metadata fields until we have page counting for all kinds of html files (not just limited to news articles with multiple `<article>` tags) ### Test Unit tests `test_add_chunking_strategy_on_partition_html_respects_multipage` and `test_add_chunking_strategy_title_on_partition_auto_respects_multipage` were removed since they rely on the `page_number` fields from the SEC html file - the test has now moved to a mock test for chunk_by_title -> revisit those tests when we find a test file for this. Also changed the element ids in the partition outputs for html files - the element ids change because the page number is part of the element id hashing -> todo ticket: update other deterministic element id tests per crag's comment --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: yuming-long <[email protected]>
1 parent 0d80886 commit 542d442

File tree

31 files changed

+626
-964
lines changed

31 files changed

+626
-964
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.13.7-dev0
2+
3+
### Enhancements
4+
* **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting.
5+
6+
### Features
7+
8+
### Fixes
9+
110
## 0.13.6
211

312
### Enhancements

test_unstructured/chunking/test_title.py

Lines changed: 32 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,38 @@ def test_chunk_by_title_separates_by_page_number():
175175
]
176176

177177

178+
def test_chuck_by_title_respects_multipage():
179+
elements: list[Element] = [
180+
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
181+
Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
182+
Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
183+
Table("Heading\nCell text"),
184+
Title("An Okay Day"),
185+
Text("Today is an okay day."),
186+
Text("It is rainy outside."),
187+
Title("A Bad Day"),
188+
Text(
189+
"Today is a bad day.",
190+
metadata=ElementMetadata(
191+
regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
192+
),
193+
),
194+
Text("It is storming outside."),
195+
CheckBox(),
196+
]
197+
chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)
198+
assert chunks == [
199+
CompositeElement(
200+
"A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
201+
),
202+
Table("Heading\nCell text"),
203+
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
204+
CompositeElement(
205+
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
206+
),
207+
]
208+
209+
178210
def test_chunk_by_title_does_not_break_on_regex_metadata_change():
179211
"""PreChunker is insensitive to regex-metadata changes.
180212
@@ -328,52 +360,6 @@ def test_add_chunking_strategy_respects_max_characters():
328360
assert chunk_elements == chunks
329361

330362

331-
def test_add_chunking_strategy_on_partition_html_respects_multipage():
332-
filename = "example-docs/example-10k-1p.html"
333-
partitioned_elements_multipage_false_combine_chars_0 = partition_html(
334-
filename,
335-
chunking_strategy="by_title",
336-
multipage_sections=False,
337-
combine_text_under_n_chars=0,
338-
new_after_n_chars=300,
339-
max_characters=400,
340-
)
341-
partitioned_elements_multipage_true_combine_chars_0 = partition_html(
342-
filename,
343-
chunking_strategy="by_title",
344-
multipage_sections=True,
345-
combine_text_under_n_chars=0,
346-
new_after_n_chars=300,
347-
max_characters=400,
348-
)
349-
elements = partition_html(filename)
350-
cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title(
351-
elements,
352-
multipage_sections=False,
353-
combine_text_under_n_chars=0,
354-
new_after_n_chars=300,
355-
max_characters=400,
356-
)
357-
cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title(
358-
elements,
359-
multipage_sections=True,
360-
combine_text_under_n_chars=0,
361-
new_after_n_chars=300,
362-
max_characters=400,
363-
)
364-
assert (
365-
partitioned_elements_multipage_false_combine_chars_0
366-
== cleaned_elements_multipage_false_combine_chars_0
367-
)
368-
assert (
369-
partitioned_elements_multipage_true_combine_chars_0
370-
== cleaned_elements_multipage_true_combine_chars_0
371-
)
372-
assert len(partitioned_elements_multipage_true_combine_chars_0) != len(
373-
partitioned_elements_multipage_false_combine_chars_0,
374-
)
375-
376-
377363
def test_chunk_by_title_drops_detection_class_prob():
378364
elements: list[Element] = [
379365
Title(

test_unstructured/partition/test_auto.py

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,52 +1097,6 @@ def test_add_chunking_strategy_on_partition_auto():
10971097
assert chunk_elements == chunks
10981098

10991099

1100-
def test_add_chunking_strategy_title_on_partition_auto_respects_multipage():
1101-
filename = "example-docs/example-10k-1p.html"
1102-
partitioned_elements_multipage_false_combine_chars_0 = partition(
1103-
filename,
1104-
chunking_strategy="by_title",
1105-
multipage_sections=False,
1106-
combine_text_under_n_chars=0,
1107-
new_after_n_chars=300,
1108-
max_characters=400,
1109-
)
1110-
partitioned_elements_multipage_true_combine_chars_0 = partition(
1111-
filename,
1112-
chunking_strategy="by_title",
1113-
multipage_sections=True,
1114-
combine_text_under_n_chars=0,
1115-
new_after_n_chars=300,
1116-
max_characters=400,
1117-
)
1118-
elements = partition(filename)
1119-
cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title(
1120-
elements,
1121-
multipage_sections=False,
1122-
combine_text_under_n_chars=0,
1123-
new_after_n_chars=300,
1124-
max_characters=400,
1125-
)
1126-
cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title(
1127-
elements,
1128-
multipage_sections=True,
1129-
combine_text_under_n_chars=0,
1130-
new_after_n_chars=300,
1131-
max_characters=400,
1132-
)
1133-
assert (
1134-
partitioned_elements_multipage_false_combine_chars_0
1135-
== cleaned_elements_multipage_false_combine_chars_0
1136-
)
1137-
assert (
1138-
partitioned_elements_multipage_true_combine_chars_0
1139-
== cleaned_elements_multipage_true_combine_chars_0
1140-
)
1141-
assert len(partitioned_elements_multipage_true_combine_chars_0) != len(
1142-
partitioned_elements_multipage_false_combine_chars_0,
1143-
)
1144-
1145-
11461100
def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
11471101
filename = "example-docs/example-10k-1p.html"
11481102

test_unstructured/partition/test_html_partition.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -731,15 +731,13 @@ def test_all_element_ids_are_unique():
731731

732732

733733
def test_element_ids_are_deterministic():
734-
ids = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")]
735-
assert ids == [
736-
"cba9e551ed975e0f8a1956095894e92a",
737-
"f540ea3b6569aafeb433df6616e79971",
738-
"f4a34ee0fac26589fffdb53d0dfedbaf",
739-
"15168aeddbd19da60791109a5a45af65",
740-
"0c027f66120dd96271489dd0bb69bff5",
741-
"abe89090c2e46dda8fff81053cc79f17",
734+
ids_first_partition = [
735+
e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")
736+
]
737+
ids_second_partition = [
738+
e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")
742739
]
740+
assert ids_first_partition == ids_second_partition
743741

744742

745743
def test_partition_html_b_tag_parsing():

test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/Shared Documents/ideas-page.html.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"element_id": "b7b1c359c06495bd6fe8e174b2a9908f",
3+
"element_id": "32bc8af17151389d3e80f65036f8e65b",
44
"metadata": {
55
"data_source": {
66
"date_created": "2023-06-16T05:04:47+00:00",
@@ -17,7 +17,6 @@
1717
"languages": [
1818
"eng"
1919
],
20-
"page_number": 1,
2120
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
2221
},
2322
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",

test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/SitePages/Home.html.json

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"element_id": "8d15e7cb1bbb2bf4bab95dcd20a79f29",
3+
"element_id": "f346c0d677012f9d4265678f9626c829",
44
"metadata": {
55
"data_source": {
66
"date_created": "0001-01-01T08:00:00Z",
@@ -16,14 +16,13 @@
1616
"filetype": "text/html",
1717
"languages": [
1818
"eng"
19-
],
20-
"page_number": 1
19+
]
2120
},
2221
"text": "Documents",
2322
"type": "Title"
2423
},
2524
{
26-
"element_id": "2ef8cded92afdc398b5757e488f5d53d",
25+
"element_id": "fea3bac751e7273dfe57b271fe9dd22b",
2726
"metadata": {
2827
"data_source": {
2928
"date_created": "0001-01-01T08:00:00Z",
@@ -39,8 +38,7 @@
3938
"filetype": "text/html",
4039
"languages": [
4140
"eng"
42-
],
43-
"page_number": 1
41+
]
4442
},
4543
"text": "Events",
4644
"type": "Title"

test_unstructured_ingest/expected-structured-output/Sharepoint-with-permissions/SitePages/This-is-a-title.html.json

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"element_id": "a227bc5e1e168472aa02c7ddeac6023b",
3+
"element_id": "fe9e95d69e2fe6e0fcf74f630e24f11f",
44
"metadata": {
55
"data_source": {
66
"date_created": "0001-01-01T08:00:00Z",
@@ -17,14 +17,13 @@
1717
"languages": [
1818
"cat",
1919
"fra"
20-
],
21-
"page_number": 1
20+
]
2221
},
2322
"text": "This is a plain text site page for testing purposes",
2423
"type": "ListItem"
2524
},
2625
{
27-
"element_id": "110e27269e69e01c41db4faf9a31d770",
26+
"element_id": "bf2f616265a06fa30e74df2cf6291c40",
2827
"metadata": {
2928
"data_source": {
3029
"date_created": "0001-01-01T08:00:00Z",
@@ -41,14 +40,13 @@
4140
"languages": [
4241
"cat",
4342
"fra"
44-
],
45-
"page_number": 1
43+
]
4644
},
4745
"text": "These are bullet points meant for testing",
4846
"type": "ListItem"
4947
},
5048
{
51-
"element_id": "c398848281e72db6061cf211b7c211d9",
49+
"element_id": "f59e42aff8f1b1ad83f8280a0686eabe",
5250
"metadata": {
5351
"data_source": {
5452
"date_created": "0001-01-01T08:00:00Z",
@@ -65,14 +63,13 @@
6563
"languages": [
6664
"cat",
6765
"fra"
68-
],
69-
"page_number": 1
66+
]
7067
},
7168
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam ex tellus, sodales non nulla et, sodales consequat turpis. Etiam vestibulum nisl placerat risus elementum, a sodales purus rhoncus. Sed eget velit pharetra, pretium nisi nec, laoreet ligula. Duis luctus mi in ligula cursus, vel lacinia tortor ultricies. Aenean sit amet sodales odio, a maximus elit. Pellentesque vehicula diam sit amet leo placerat placerat. Integer varius elementum accumsan. Donec posuere elit mauris, eget efficitur nisl viverra vitae.",
7269
"type": "NarrativeText"
7370
},
7471
{
75-
"element_id": "8a67276048c91e45cae58a087eba44cc",
72+
"element_id": "9fa12141ac0e9ad3d09fe51dc393ad59",
7673
"metadata": {
7774
"data_source": {
7875
"date_created": "0001-01-01T08:00:00Z",
@@ -89,8 +86,7 @@
8986
"languages": [
9087
"cat",
9188
"fra"
92-
],
93-
"page_number": 1
89+
]
9490
},
9591
"text": "Integer at dictum nisi. Cras venenatis non velit in posuere. Curabitur tristique, eros eget tristique pellentesque, neque metus ullamcorper ligula, nec posuere neque lacus nec felis. Nulla a libero eget eros consectetur hendrerit. Pellentesque interdum, diam eget tristique pretium, quam lorem pulvinar lorem, a eleifend nisl lectus at ex. Praesent pulvinar ex ut consequat condimentum. Sed rutrum, erat a hendrerit blandit, urna mauris posuere est, at porttitor risus diam non leo. Nullam rutrum vehicula dolor, quis venenatis ligula rutrum sit amet. Nam massa justo, fermentum in dui lacinia, tincidunt imperdiet nunc. Nam posuere tortor ac lectus elementum, non mollis urna consequat. In interdum non tellus sed pellentesque.",
9692
"type": "NarrativeText"

test_unstructured_ingest/expected-structured-output/Sharepoint/Shared Documents/ideas-page.html.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"element_id": "b7b1c359c06495bd6fe8e174b2a9908f",
3+
"element_id": "32bc8af17151389d3e80f65036f8e65b",
44
"metadata": {
55
"data_source": {
66
"date_created": "2023-06-16T05:04:47+00:00",
@@ -17,7 +17,6 @@
1717
"languages": [
1818
"eng"
1919
],
20-
"page_number": 1,
2120
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
2221
},
2322
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",

test_unstructured_ingest/expected-structured-output/Sharepoint/SitePages/Home.html.json

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[
22
{
3-
"element_id": "8d15e7cb1bbb2bf4bab95dcd20a79f29",
3+
"element_id": "f346c0d677012f9d4265678f9626c829",
44
"metadata": {
55
"data_source": {
66
"date_created": "0001-01-01T08:00:00Z",
@@ -16,14 +16,13 @@
1616
"filetype": "text/html",
1717
"languages": [
1818
"eng"
19-
],
20-
"page_number": 1
19+
]
2120
},
2221
"text": "Documents",
2322
"type": "Title"
2423
},
2524
{
26-
"element_id": "2ef8cded92afdc398b5757e488f5d53d",
25+
"element_id": "fea3bac751e7273dfe57b271fe9dd22b",
2726
"metadata": {
2827
"data_source": {
2928
"date_created": "0001-01-01T08:00:00Z",
@@ -39,8 +38,7 @@
3938
"filetype": "text/html",
4039
"languages": [
4140
"eng"
42-
],
43-
"page_number": 1
41+
]
4442
},
4543
"text": "Events",
4644
"type": "Title"

0 commit comments

Comments
 (0)