@@ -520,14 +520,19 @@ def _handle_parsing_of_children(
520520 remaining_snipped = text_w_prev_child
521521 elif self ._is_within_targetlen_w_buffer (text_w_prev_child ):
522522 child ["text" ] = text_w_prev_child
523+
524+ # Make sure text is within token limit
525+ limited_child_text = self ._cut_to_tokenlen (child ["text" ], self .token_limit )
526+
527+ # Build document from text and child metadata
523528 return_doc += [
524529 MarkdownDataContract (
525- md = self . _cut_to_tokenlen ( child [ "text" ], self . token_limit ) ,
530+ md = limited_child_text ,
526531 url = child ["metadata" ]["url" ],
527532 keywords = child ["metadata" ]["keywords" ],
528533 metadata = {
529- "token_len" : self .token_limit ,
530- "char_len" : len (child [ "text" ] ),
534+ "token_len" : self ._get_token_len ( limited_child_text ) ,
535+ "char_len" : len (limited_child_text ),
531536 },
532537 )
533538 ]
@@ -583,7 +588,7 @@ def _md_data_from_dict_cut(self, doc: DocumentNode) -> MarkdownDataContract:
583588 url = doc ["metadata" ]["url" ],
584589 keywords = doc ["metadata" ]["keywords" ],
585590 metadata = {
586- "token_len" : self .token_limit ,
591+ "token_len" : self ._get_token_len ( text ) ,
587592 "char_len" : len (text ),
588593 },
589594 )
@@ -677,14 +682,15 @@ def _parse_hierarchical(
677682
678683 # add potential short remaining spillovers
679684 if self ._get_token_len (remaining_snipped ) >= self .token_limit_min :
685+ limited_remaining_snipped = self ._cut_to_tokenlen (remaining_snipped , self .token_limit )
680686 return_doc += [
681687 MarkdownDataContract (
682- md = self . _cut_to_tokenlen ( remaining_snipped , self . token_limit ) ,
688+ md = limited_remaining_snipped ,
683689 url = doc ["metadata" ]["url" ],
684690 keywords = doc ["metadata" ]["keywords" ],
685691 metadata = {
686- "token_len" : self .token_limit ,
687- "char_len" : len (remaining_snipped ),
692+ "token_len" : self ._get_token_len ( limited_remaining_snipped ) ,
693+ "char_len" : len (limited_remaining_snipped ),
688694 },
689695 )
690696 ]
0 commit comments