Skip to content

Commit 1af41d5

Browse files
authored
feat(chunking): add .orig_elements behavior to chunking (#2656)
**Summary** Populate `.metadata.orig_elements` during chunking when the `include_orig_elements` option is set. **Additional Context** The underlying structures that support this, namely the `.metadata.orig_elements` field and the `include_orig_elements` chunking option, were added in immediately preceding PRs. This PR adds the behavior that actually populates that metadata field during chunking when the option is set.
1 parent c02cfb8 commit 1af41d5

File tree

6 files changed

+271
-55
lines changed

6 files changed

+271
-55
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
### Features
88

9+
* **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior makes the text and metadata of the elements that were combined to form each chunk accessible. This can be important, for example, to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This behavior is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. The parameter defaults to `True`, so original elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs but will be added in a closely following PR to the other `unstructured` repositories. The original elements also do not yet serialize or deserialize; this will likewise be added in a closely following PR.
10+
911
### Fixes
1012

1113
* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.

test_unstructured/chunking/test_base.py

Lines changed: 167 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,30 @@ def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
352352
with pytest.raises(StopIteration):
353353
next(chunk_iter)
354354

355-
def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
355+
def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
356+
table = Table("foo bar", metadata=ElementMetadata(text_as_html="<table>foo bar</table>"))
357+
opts = ChunkingOptions(include_orig_elements=True)
358+
pre_chunk = TablePreChunk(table, "", opts)
359+
360+
chunk_iter = pre_chunk.iter_chunks()
361+
362+
chunk = next(chunk_iter)
363+
assert isinstance(chunk, Table)
364+
assert chunk.metadata.orig_elements == [table]
365+
assert chunk.metadata.text_as_html == "<table>foo bar</table>"
366+
# --
367+
with pytest.raises(StopIteration):
368+
next(chunk_iter)
369+
370+
def but_not_when_instructed_not_to(self):
371+
pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
372+
373+
chunk = next(pre_chunk.iter_chunks())
374+
375+
assert isinstance(chunk, Table)
376+
assert chunk.metadata.orig_elements is None
377+
378+
def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
356379
# fixed-overhead = 8+8+9+8+9+8 = 50
357380
# per-row overhead = 27
358381
html_table = (
@@ -398,6 +421,7 @@ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_win
398421
"<tbody>\n"
399422
"<tr><td>Lo"
400423
)
424+
assert not chunk.metadata.is_continuation
401425
# --
402426
chunk = next(chunk_iter)
403427
assert isinstance(chunk, TableChunk)
@@ -408,6 +432,7 @@ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_win
408432
"rem ipsum </td><td>A Link example</td></tr>\n"
409433
"<tr><td>Consectetur </td><td>adipiscing elit</td><"
410434
)
435+
assert chunk.metadata.is_continuation
411436
# -- note that text runs out but HTML continues because it's significantly longer. So two
412437
# -- of these chunks have HTML but no text.
413438
chunk = next(chunk_iter)
@@ -418,17 +443,42 @@ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_win
418443
"<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n"
419444
"<tr><td>Vivamus quis </td><td>"
420445
)
446+
assert chunk.metadata.is_continuation
421447
# --
422448
chunk = next(chunk_iter)
423449
assert isinstance(chunk, TableChunk)
424450
assert chunk.text == ""
425451
assert chunk.metadata.text_as_html == (
426452
"nunc ipsum donec ac fermentum</td></tr>\n</tbody>\n</table>"
427453
)
454+
assert chunk.metadata.is_continuation
428455
# --
429456
with pytest.raises(StopIteration):
430457
next(chunk_iter)
431458

459+
def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
460+
"""Even though text and html are split, the orig_elements metadata is not."""
461+
table = Table(
462+
"Header Col 1 Header Col 2\nLorem ipsum dolor sit amet",
463+
metadata=ElementMetadata(text_as_html="<table/>"),
464+
)
465+
opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
466+
pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)
467+
468+
chunk_iter = pre_chunk.iter_chunks()
469+
470+
chunk = next(chunk_iter)
471+
assert isinstance(chunk, TableChunk)
472+
assert chunk.text == "Header Col 1 Header Col 2"
473+
assert chunk.metadata.orig_elements == [table]
474+
assert not chunk.metadata.is_continuation
475+
# --
476+
chunk = next(chunk_iter)
477+
assert isinstance(chunk, TableChunk)
478+
assert chunk.text == "Lorem ipsum dolor sit amet"
479+
assert chunk.metadata.orig_elements == [table]
480+
assert chunk.metadata.is_continuation
481+
432482
@pytest.mark.parametrize(
433483
("text", "expected_value"),
434484
[
@@ -469,6 +519,50 @@ def it_includes_its_overlap_prefix_in_its_text_when_present(
469519
)
470520
assert pre_chunk._text == expected_value
471521

522+
def it_computes_metadata_for_each_chunk_to_help(self):
523+
table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
524+
pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
525+
526+
metadata = pre_chunk._metadata
527+
528+
assert metadata.text_as_html == "<table/>"
529+
# -- opts.include_orig_elements is True by default --
530+
assert metadata.orig_elements == [table]
531+
# -- it produces a new instance each time it is called so changing one chunk's metadata does
532+
# -- not change that of any other chunk.
533+
assert pre_chunk._metadata is not metadata
534+
535+
def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
536+
pre_chunk = TablePreChunk(
537+
Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>")),
538+
overlap_prefix="",
539+
opts=ChunkingOptions(include_orig_elements=False),
540+
)
541+
542+
assert pre_chunk._metadata.orig_elements is None
543+
544+
def it_computes_the_original_elements_list_to_help(self):
545+
table = Table(
546+
"Lorem ipsum",
547+
metadata=ElementMetadata(text_as_html="<table/>", orig_elements=[Table("Lorem Ipsum")]),
548+
)
549+
pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions())
550+
551+
orig_elements = pre_chunk._orig_elements
552+
553+
# -- a TablePreChunk always has exactly one original (Table) element --
554+
assert len(orig_elements) == 1
555+
orig_element = orig_elements[0]
556+
# -- each item in orig_elements is a copy of the original element so we can mutate it
557+
# -- without changing user's data.
558+
assert orig_element == table
559+
assert orig_element is not table
560+
# -- it strips any .metadata.orig_elements from each element to prevent a recursive data
561+
# -- structure
562+
assert orig_element.metadata.orig_elements is None
563+
# -- computation is only on first call, all chunks get exactly the same orig-elements --
564+
assert pre_chunk._orig_elements is orig_elements
565+
472566

473567
class DescribeTextPreChunk:
474568
"""Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@@ -599,17 +693,15 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
599693
)
600694

601695
def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self):
602-
pre_chunk = TextPreChunk(
603-
[
604-
Title("Introduction"),
605-
Text(
606-
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
607-
" lectus porta volutpat.",
608-
),
609-
],
610-
overlap_prefix="e feugiat efficitur.",
611-
opts=ChunkingOptions(max_characters=200),
612-
)
696+
elements = [
697+
Title("Introduction"),
698+
Text(
699+
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
700+
" lectus porta volutpat.",
701+
),
702+
]
703+
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
704+
pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts)
613705

614706
chunk_iter = pre_chunk.iter_chunks()
615707

@@ -619,36 +711,44 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window
619711
" adipiscing elit. In rhoncus ipsum sed lectus porta volutpat.",
620712
)
621713
assert chunk.metadata is pre_chunk._consolidated_metadata
714+
assert chunk.metadata.orig_elements == elements
715+
# --
716+
with pytest.raises(StopIteration):
717+
next(chunk_iter)
622718

623719
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
624720
# -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
625721
# -- The pre-chunker will isolate that element in a pre_chunk of its own.
626-
pre_chunk = TextPreChunk(
627-
[
628-
Text(
629-
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
630-
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
631-
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
632-
" commodo consequat."
633-
),
634-
],
635-
overlap_prefix="",
636-
opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")),
637-
)
722+
elements = [
723+
Text(
724+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
725+
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
726+
" veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
727+
" commodo consequat."
728+
)
729+
]
730+
opts = ChunkingOptions(max_characters=200, include_orig_elements=True)
731+
pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts)
638732

639733
chunk_iter = pre_chunk.iter_chunks()
640734

735+
# -- Note that .metadata.orig_elements is the same single original element, "repeated" for
736+
# -- each text-split chunk. This behavior emerges without explicit command as a consequence
737+
# -- of using `._consolidated_metadata` (and `._continuation_metadata` which extends
738+
# -- `._consolidated_metadata`) for each text-split chunk.
641739
chunk = next(chunk_iter)
642740
assert chunk == CompositeElement(
643741
"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
644742
" tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
645743
" veniam, quis nostrud exercitation ullamco laboris nisi ut"
646744
)
647745
assert chunk.metadata is pre_chunk._consolidated_metadata
746+
assert chunk.metadata.orig_elements == elements
648747
# --
649748
chunk = next(chunk_iter)
650749
assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
651750
assert chunk.metadata is pre_chunk._continuation_metadata
751+
assert chunk.metadata.orig_elements == elements
652752
# --
653753
with pytest.raises(StopIteration):
654754
next(chunk_iter)
@@ -762,6 +862,23 @@ def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
762862
"parent_id": ["f87731e0"],
763863
}
764864

865+
def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
866+
opts = ChunkingOptions(include_orig_elements=True)
867+
metadata = ElementMetadata(filename="foo.pdf")
868+
element = Title("Lorem Ipsum", metadata=metadata)
869+
element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata)
870+
pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts)
871+
872+
consolidated_metadata = pre_chunk._consolidated_metadata
873+
874+
# -- pre-chunk elements are included as metadata --
875+
orig_elements = consolidated_metadata.orig_elements
876+
assert orig_elements is not None
877+
assert orig_elements == [element, element_2]
878+
# -- and they are the exact instances, not copies --
879+
assert orig_elements[0] is element
880+
assert orig_elements[1] is element_2
881+
765882
def it_consolidates_regex_metadata_in_a_field_specific_way(self):
766883
"""regex_metadata of chunk is combined regex_metadatas of its elements.
767884
@@ -868,6 +985,32 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate
868985
},
869986
}
870987

988+
def it_computes_the_original_elements_list_to_help(self):
989+
element = Title("Introduction")
990+
element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
991+
element_3 = CompositeElement(
992+
"In rhoncus ipsum sed lectus porta volutpat.",
993+
metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
994+
)
995+
pre_chunk = TextPreChunk(
996+
[element, element_2, element_3],
997+
overlap_prefix="",
998+
opts=ChunkingOptions(include_orig_elements=True),
999+
)
1000+
1001+
orig_elements = pre_chunk._orig_elements
1002+
1003+
# -- all elements of pre-chunk are included --
1004+
assert orig_elements == [element, element_2, element_3]
1005+
# -- orig_elements that are chunks (having orig-elements of their own) are copied and the
1006+
# -- copy is stripped of its `.metadata.orig_elements` to prevent a recursive data
1007+
# -- structure that nests orig_elements within orig_elements.
1008+
assert orig_elements[0] is element
1009+
assert orig_elements[2] is not element_3
1010+
assert orig_elements[2].metadata.orig_elements is None
1011+
# -- computation is only on first call, all chunks get exactly the same orig-elements --
1012+
assert pre_chunk._orig_elements is orig_elements
1013+
8711014
@pytest.mark.parametrize(
8721015
("elements", "overlap_prefix", "expected_value"),
8731016
[

test_unstructured/chunking/test_basic.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them():
113113
]
114114

115115

116+
def test_it_includes_original_elements_as_metadata_when_requested():
117+
element = Title("Introduction")
118+
element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
119+
element_3 = Text("In rhoncus ipsum sed lectus porta volutpat.")
120+
121+
chunks = chunk_elements(
122+
[element, element_2, element_3], max_characters=70, include_orig_elements=True
123+
)
124+
125+
assert len(chunks) == 2
126+
chunk = chunks[0]
127+
assert chunk == CompositeElement(
128+
"Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
129+
)
130+
assert chunk.metadata.orig_elements == [element, element_2]
131+
# --
132+
chunk = chunks[1]
133+
assert chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
134+
assert chunk.metadata.orig_elements == [element_3]
135+
136+
116137
# ------------------------------------------------------------------------------------------------
117138
# UNIT TESTS
118139
# ------------------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)