feat(algorithm): improved quality of chunks, particularly with low chunk sizes or few newlines (#17)

umarbutler · umarbutler · commit e24fbd414d58 · 2025-03-20T15:27:50.000+11:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.2.0] - 2025-03-20
+### Changed
+- Significantly improved the quality of chunks produced when chunking with low chunk sizes or when chunking documents with little variation in their levels of whitespace, by adding a new rule to the `semchunk` algorithm that prioritizes splitting at single whitespace characters preceded by hierarchically meaningful non-whitespace characters over splitting at all single whitespace characters in general ([#17](https://github.com/isaacus-dev/semchunk/issues/17)).
+
 ## [3.1.3] - 2025-03-10
 ### Changed
 - Added mention of Isaacus to the README.
@@ -141,6 +145,7 @@ All notable changes to `semchunk` will be documented here. This project adheres
 ### Added
 - Added the `chunk()` function, which splits text into semantically meaningful chunks of a specified size as determined by a provided token counter.
 
+[3.2.0]: https://github.com/isaacus-dev/semchunk/compare/v3.1.3...v3.2.0
 [3.1.3]: https://github.com/isaacus-dev/semchunk/compare/v3.1.2...v3.1.3
 [3.1.2]: https://github.com/isaacus-dev/semchunk/compare/v3.1.1...v3.1.2
 [3.1.1]: https://github.com/isaacus-dev/semchunk/compare/v3.1.0...v3.1.1
diff --git a/README.md b/README.md
@@ -141,7 +141,7 @@ This function returns a list of chunks up to `chunk_size`-tokens-long, with any
 To ensure that chunks are as semantically meaningful as possible, `semchunk` uses the following splitters, in order of precedence:
 1. The largest sequence of newlines (`\n`) and/or carriage returns (`\r`);
 1. The largest sequence of tabs;
-1. The largest sequence of whitespace characters (as defined by regex's `\s` character class);
+1. The largest sequence of whitespace characters (as defined by regex's `\s` character class) or, since version 3.2.0, if the largest such sequence is only a single character long and the text contains whitespace characters preceded by any of the semantically meaningful non-whitespace characters listed below (tried in the same order of precedence), then only those specific whitespace characters;
 1. Sentence terminators (`.`, `?`, `!` and `*`);
 1. Clause separators (`;`, `,`, `(`, `)`, `[`, `]`, `“`, `”`, `‘`, `’`, `'`, `"` and `` ` ``);
 1. Sentence interrupters (`:`, `—` and `…`);
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "3.1.3"
+version = "3.2.0"
 authors = [
     {name="Isaacus", email="support@isaacus.com"},
     {name="Umar Butler", email="umar@umar.au"},
diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
@@ -55,6 +55,7 @@
 )
 """A tuple of semantically meaningful non-whitespace splitters that may be used to chunk texts, ordered from most desirable to least desirable."""
 
+_REGEX_ESCAPED_NON_WHITESPACE_SEMANTIC_SPLITTERS = tuple(re.escape(splitter) for splitter in _NON_WHITESPACE_SEMANTIC_SPLITTERS)
 
 def _split_text(text: str) -> tuple[str, bool, list[str]]:
     """Split text using the most semantically meaningful splitter possible."""
@@ -64,7 +65,7 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
     # Try splitting at, in order of most desirable to least desirable:
     # - The largest sequence of newlines and/or carriage returns;
     # - The largest sequence of tabs;
-    # - The largest sequence of whitespace characters; and
+    # - The largest sequence of whitespace characters or, if the largest such sequence is only a single character long and the text contains a whitespace character preceded by a semantically meaningful non-whitespace splitter, then only occurrences of that whitespace character that are preceded by that splitter; and
     # - A semantically meaningful non-whitespace splitter.
     if "\n" in text or "\r" in text:
         splitter = max(re.findall(r"[\r\n]+", text))
@@ -74,6 +75,15 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
 
     elif re.search(r"\s", text):
         splitter = max(re.findall(r"\s+", text))
+        
+        # If the splitter is only a single character, see if we can target whitespace characters that are preceded by semantically meaningful non-whitespace splitters to avoid splitting in the middle of sentences.
+        if len(splitter) == 1:
+            for escaped_preceder in _REGEX_ESCAPED_NON_WHITESPACE_SEMANTIC_SPLITTERS:
+                if (whitespace_preceded_by_preceder := re.search(rf'{escaped_preceder}(\s)', text)):
+                    splitter = whitespace_preceded_by_preceder.group(1)
+                    escaped_splitter = re.escape(splitter)
+                    
+                    return splitter, splitter_is_whitespace, re.split(rf'(?<={escaped_preceder}){escaped_splitter}', text)
 
     else:
         # Identify the most desirable semantically meaningful non-whitespace splitter present in the text.