@@ -101,6 +101,10 @@ class _ByTitleChunkingOptions(ChunkingOptions):
101101
102102 `by_title`-specific options:
103103
104+ combine_text_under_n_chars
105+ A remedy to over-chunking caused by elements mis-identified as Title elements.
106+ Every Title element would start a new chunk and this setting mitigates that, at the
107+ expense of sometimes violating legitimate semantic boundaries.
104108 multipage_sections
105109 Indicates that page-boundaries should not be respected while chunking, i.e. elements
106110 appearing on two different pages can appear in the same chunk.
@@ -133,15 +137,9 @@ def combine_text_under_n_chars(self) -> int:
133137 - Defaults to `max_characters` when not specified.
134138 - Is reduced to `new_after_n_chars` when it exceeds that value.
135139 """
136- max_characters = self .hard_max
137- soft_max = self .soft_max
138- arg_value = self ._kwargs .get ("combine_text_under_n_chars" )
139-
140140 # -- `combine_text_under_n_chars` defaults to `max_characters` when not specified --
141- combine_text_under_n_chars = max_characters if arg_value is None else arg_value
142-
143- # -- `new_after_n_chars` takes precendence on conflict with `combine_text_under_n_chars` --
144- return soft_max if combine_text_under_n_chars > soft_max else combine_text_under_n_chars
141+ arg_value = self ._kwargs .get ("combine_text_under_n_chars" )
142+ return self .hard_max if arg_value is None else arg_value
145143
146144 @lazyproperty
147145 def multipage_sections (self ) -> bool :
@@ -154,22 +152,20 @@ def _validate(self) -> None:
154152 # -- start with base-class validations --
155153 super ()._validate ()
156154
157- combine_text_under_n_chars_arg = self ._kwargs .get ("combine_text_under_n_chars" )
158- if combine_text_under_n_chars_arg is not None :
159- # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
160- # -- but a negative value is not
161- if combine_text_under_n_chars_arg < 0 :
162- raise ValueError (
163- f"'combine_text_under_n_chars' argument must be >= 0,"
164- f" got { combine_text_under_n_chars_arg } "
165- )
166-
167- # -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
168- # -- users. The chunking behavior would be no different than when
169- # -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
170- # -- default (500) then it can look like chunk-combining isn't working.
171- if combine_text_under_n_chars_arg > self .hard_max :
172- raise ValueError (
173- f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
174- f" value, got { combine_text_under_n_chars_arg } > { self .hard_max } "
175- )
155+ # -- `combine_text_under_n_chars == 0` is valid (suppresses chunk combination)
156+ # -- but a negative value is not
157+ if self .combine_text_under_n_chars < 0 :
158+ raise ValueError (
159+ f"'combine_text_under_n_chars' argument must be >= 0,"
160+ f" got { self .combine_text_under_n_chars } "
161+ )
162+
163+ # -- `combine_text_under_n_chars` > `max_characters` can produce behavior confusing to
164+ # -- users. The chunking behavior would be no different than when
165+ # -- `combine_text_under_n_chars == max_characters`, but if `max_characters` is left to
166+ # -- default (500) then it can look like chunk-combining isn't working.
167+ if self .combine_text_under_n_chars > self .hard_max :
168+ raise ValueError (
169+ f"'combine_text_under_n_chars' argument must not exceed `max_characters`"
170+ f" value, got { self .combine_text_under_n_chars } > { self .hard_max } "
171+ )
0 commit comments