@@ -352,7 +352,30 @@ def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
352352 with pytest .raises (StopIteration ):
353353 next (chunk_iter )
354354
355- def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window (self ):
def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
    """Expect a single `Table` chunk whose metadata records the source `Table` element."""
    html = "<table>foo bar</table>"
    source_table = Table("foo bar", metadata=ElementMetadata(text_as_html=html))
    pre_chunk = TablePreChunk(
        source_table, "", ChunkingOptions(include_orig_elements=True)
    )

    chunks = pre_chunk.iter_chunks()

    # -- the first chunk is a `Table` carrying both the HTML and the original element --
    first = next(chunks)
    assert isinstance(first, Table)
    assert first.metadata.orig_elements == [source_table]
    assert first.metadata.text_as_html == html
    # -- and it is the only chunk --
    with pytest.raises(StopIteration):
        next(chunks)
369+
def but_not_when_instructed_not_to(self):
    """With `include_orig_elements=False` the chunk's `orig_elements` is expected to be None."""
    opts = ChunkingOptions(include_orig_elements=False)
    chunk_iter = TablePreChunk(Table("foobar"), "", opts).iter_chunks()

    first = next(chunk_iter)

    assert isinstance(first, Table)
    assert first.metadata.orig_elements is None
377+
378+ def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window (self ):
356379 # fixed-overhead = 8+8+9+8+9+8 = 50
357380 # per-row overhead = 27
358381 html_table = (
@@ -398,6 +421,7 @@ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_win
398421 "<tbody>\n "
399422 "<tr><td>Lo"
400423 )
424+ assert not chunk .metadata .is_continuation
401425 # --
402426 chunk = next (chunk_iter )
403427 assert isinstance (chunk , TableChunk )
@@ -408,6 +432,7 @@ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_win
408432 "rem ipsum </td><td>A Link example</td></tr>\n "
409433 "<tr><td>Consectetur </td><td>adipiscing elit</td><"
410434 )
435+ assert chunk .metadata .is_continuation
411436 # -- note that text runs out but HTML continues because it's significantly longer. So two
412437 # -- of these chunks have HTML but no text.
413438 chunk = next (chunk_iter )
@@ -418,17 +443,42 @@ def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_win
418443 "<tr><td>Nunc aliquam </td><td>id enim nec molestie</td></tr>\n "
419444 "<tr><td>Vivamus quis </td><td>"
420445 )
446+ assert chunk .metadata .is_continuation
421447 # --
422448 chunk = next (chunk_iter )
423449 assert isinstance (chunk , TableChunk )
424450 assert chunk .text == ""
425451 assert chunk .metadata .text_as_html == (
426452 "nunc ipsum donec ac fermentum</td></tr>\n </tbody>\n </table>"
427453 )
454+ assert chunk .metadata .is_continuation
428455 # --
429456 with pytest .raises (StopIteration ):
430457 next (chunk_iter )
431458
def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self):
    """Even though text and html are split, the orig_elements metadata is not.

    Each `TableChunk` is expected to carry the complete original `Table` in
    `.metadata.orig_elements`, and only the second chunk is flagged as a continuation.
    """
    table = Table(
        "Header Col 1 Header Col 2\n Lorem ipsum dolor sit amet",
        metadata=ElementMetadata(text_as_html="<table/>"),
    )
    opts = ChunkingOptions(max_characters=30, include_orig_elements=True)
    pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)

    chunk_iter = pre_chunk.iter_chunks()

    # -- first chunk: leading text, whole original table, not a continuation --
    chunk = next(chunk_iter)
    assert isinstance(chunk, TableChunk)
    assert chunk.text == "Header Col 1 Header Col 2"
    assert chunk.metadata.orig_elements == [table]
    assert not chunk.metadata.is_continuation
    # -- second chunk: remaining text, same whole original table, marked as continuation --
    chunk = next(chunk_iter)
    assert isinstance(chunk, TableChunk)
    assert chunk.text == "Lorem ipsum dolor sit amet"
    assert chunk.metadata.orig_elements == [table]
    assert chunk.metadata.is_continuation
    # -- and no further chunks are produced --
    with pytest.raises(StopIteration):
        next(chunk_iter)
481+
432482 @pytest .mark .parametrize (
433483 ("text" , "expected_value" ),
434484 [
@@ -469,6 +519,50 @@ def it_includes_its_overlap_prefix_in_its_text_when_present(
469519 )
470520 assert pre_chunk ._text == expected_value
471521
def it_computes_metadata_for_each_chunk_to_help(self):
    """`._metadata` is expected to carry the table HTML and orig-elements, new on each access."""
    source_table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
    pre_chunk = TablePreChunk(source_table, overlap_prefix="", opts=ChunkingOptions())

    first_metadata = pre_chunk._metadata

    assert first_metadata.text_as_html == "<table/>"
    # -- opts.include_orig_elements is True by default --
    assert first_metadata.orig_elements == [source_table]
    # -- a second access must give a distinct instance so that mutating the metadata of one
    # -- chunk cannot leak into the metadata of another chunk.
    second_metadata = pre_chunk._metadata
    assert second_metadata is not first_metadata
534+
def but_it_omits_orig_elements_from_metadata_when_so_instructed(self):
    """`._metadata.orig_elements` is expected to be None when `include_orig_elements=False`."""
    table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="<table/>"))
    opts = ChunkingOptions(include_orig_elements=False)
    pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts)

    metadata = pre_chunk._metadata

    assert metadata.orig_elements is None
543+
def it_computes_the_original_elements_list_to_help(self):
    """`._orig_elements` is expected to be a one-item list holding a stripped copy of the table."""
    nested = [Table("Lorem Ipsum")]
    source_table = Table(
        "Lorem ipsum",
        metadata=ElementMetadata(text_as_html="<table/>", orig_elements=nested),
    )
    pre_chunk = TablePreChunk(source_table, overlap_prefix="", opts=ChunkingOptions())

    computed = pre_chunk._orig_elements

    # -- a TablePreChunk always has exactly one original (Table) element --
    assert len(computed) == 1
    (only_element,) = computed
    # -- the item is an equal-but-distinct copy of the original, so it can be mutated without
    # -- touching the user's data.
    assert only_element == source_table
    assert only_element is not source_table
    # -- any `.metadata.orig_elements` on the copy is removed so the structure cannot nest
    # -- orig-elements inside orig-elements.
    assert only_element.metadata.orig_elements is None
    # -- computation is only on first call, all chunks get exactly the same orig-elements --
    assert pre_chunk._orig_elements is computed
565+
472566
473567class DescribeTextPreChunk :
474568 """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects."""
@@ -599,17 +693,15 @@ def it_can_combine_itself_with_another_TextPreChunk_instance(self):
599693 )
600694
601695 def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window (self ):
602- pre_chunk = TextPreChunk (
603- [
604- Title ("Introduction" ),
605- Text (
606- "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
607- " lectus porta volutpat." ,
608- ),
609- ],
610- overlap_prefix = "e feugiat efficitur." ,
611- opts = ChunkingOptions (max_characters = 200 ),
612- )
696+ elements = [
697+ Title ("Introduction" ),
698+ Text (
699+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
700+ " lectus porta volutpat." ,
701+ ),
702+ ]
703+ opts = ChunkingOptions (max_characters = 200 , include_orig_elements = True )
704+ pre_chunk = TextPreChunk (elements , overlap_prefix = "e feugiat efficitur." , opts = opts )
613705
614706 chunk_iter = pre_chunk .iter_chunks ()
615707
@@ -619,36 +711,44 @@ def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window
619711 " adipiscing elit. In rhoncus ipsum sed lectus porta volutpat." ,
620712 )
621713 assert chunk .metadata is pre_chunk ._consolidated_metadata
714+ assert chunk .metadata .orig_elements == elements
715+ # --
716+ with pytest .raises (StopIteration ):
717+ next (chunk_iter )
622718
def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self):
    """An oversized lone element is expected to be text-split into two `CompositeElement`s."""
    # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window.
    # -- The pre-chunker will isolate that element in a pre_chunk of its own.
    oversized = Text(
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
        " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
        " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea"
        " commodo consequat."
    )
    elements = [oversized]
    pre_chunk = TextPreChunk(
        elements,
        overlap_prefix="",
        opts=ChunkingOptions(max_characters=200, include_orig_elements=True),
    )

    chunks = pre_chunk.iter_chunks()

    # -- Note that .metadata.orig_elements is the same single original element, "repeated" for
    # -- each text-split chunk. No code requests this explicitly; it follows from each
    # -- text-split chunk using `._consolidated_metadata` (or `._continuation_metadata`, which
    # -- extends `._consolidated_metadata`).
    head = next(chunks)
    assert head == CompositeElement(
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
        " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
        " veniam, quis nostrud exercitation ullamco laboris nisi ut"
    )
    assert head.metadata is pre_chunk._consolidated_metadata
    assert head.metadata.orig_elements == elements
    # --
    tail = next(chunks)
    assert tail == CompositeElement("aliquip ex ea commodo consequat.")
    assert tail.metadata is pre_chunk._continuation_metadata
    assert tail.metadata.orig_elements == elements
    # --
    with pytest.raises(StopIteration):
        next(chunks)
@@ -762,6 +862,23 @@ def but_it_discards_ad_hoc_metadata_fields_during_consolidation(self):
762862 "parent_id" : ["f87731e0" ],
763863 }
764864
def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
    """Consolidated metadata is expected to reference the pre-chunk's own element instances."""
    shared_metadata = ElementMetadata(filename="foo.pdf")
    title = Title("Lorem Ipsum", metadata=shared_metadata)
    body = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=shared_metadata)
    pre_chunk = TextPreChunk(
        [title, body],
        overlap_prefix="",
        opts=ChunkingOptions(include_orig_elements=True),
    )

    orig_elements = pre_chunk._consolidated_metadata.orig_elements

    # -- pre-chunk elements are included as metadata --
    assert orig_elements is not None
    assert orig_elements == [title, body]
    # -- and they are the exact instances, not copies --
    first, second = orig_elements[0], orig_elements[1]
    assert first is title
    assert second is body
881+
765882 def it_consolidates_regex_metadata_in_a_field_specific_way (self ):
766883 """regex_metadata of chunk is combined regex_metadatas of its elements.
767884
@@ -868,6 +985,32 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate
868985 },
869986 }
870987
def it_computes_the_original_elements_list_to_help(self):
    """`._orig_elements` is expected to list every pre-chunk element, un-nested and cached."""
    title = Title("Introduction")
    body = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    prior_chunk = CompositeElement(
        "In rhoncus ipsum sed lectus porta volutpat.",
        metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]),
    )
    opts = ChunkingOptions(include_orig_elements=True)
    pre_chunk = TextPreChunk([title, body, prior_chunk], overlap_prefix="", opts=opts)

    computed = pre_chunk._orig_elements

    # -- all elements of pre-chunk are included --
    assert computed == [title, body, prior_chunk]
    # -- an element without orig-elements of its own is passed through as the same instance --
    assert computed[0] is title
    # -- whereas an element that is itself a chunk (it has orig-elements of its own) is copied,
    # -- and the copy has its `.metadata.orig_elements` removed so orig_elements never nest
    # -- within orig_elements.
    stripped = computed[2]
    assert stripped is not prior_chunk
    assert stripped.metadata.orig_elements is None
    # -- computation is only on first call, all chunks get exactly the same orig-elements --
    assert pre_chunk._orig_elements is computed
1013+
8711014 @pytest .mark .parametrize (
8721015 ("elements" , "overlap_prefix" , "expected_value" ),
8731016 [
0 commit comments