Skip to content

Commit bdf30a5

Browse files
authored
fix: Setting token length correctly in splitter metadata (#186)
1 parent b783fcd commit bdf30a5

File tree

3 files changed

+24
-8
lines changed

3 files changed

+24
-8
lines changed

tests/steps/embedding/e2e_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def test_embedding_step_log_statistics(mock_embedding, default_embedding_data, e
192192

193193
# Check values with a small tolerance
194194
expected_char_length_mean = pytest.approx(609.18, abs=0.1)
195-
expected_token_length_mean = pytest.approx(257.18, abs=0.1)
195+
expected_token_length_mean = pytest.approx(188.3, abs=0.1)
196196
expected_chunks_count_mean = pytest.approx(3.18, abs=0.2)
197197

198198
assert char_length_record.count == expected_char_length_count, (

tests/steps/simple_splitter/e2e_simple_splitter_test.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,13 @@ def test_simple_splitter_step(default_markdown_data, env):
4040

4141
assert len(step_output) == 11, "Step outputs have wrong count."
4242
assert step_report.results == 11, "Step report has wrong count of outputs."
43+
44+
hash_count = [o.md.count("#") for o in step_output]
45+
nl_count = [o.md.count("\n") for o in step_output]
46+
token_lens = [o.metadata["token_len"] for o in step_output]
47+
char_lens = [o.metadata["char_len"] for o in step_output]
48+
49+
assert hash_count == [9, 3, 10, 13, 12, 12, 15, 1, 9, 3, 0], "Chunks have invalid hash count"
50+
assert nl_count == [4, 0, 16, 23, 23, 24, 22, 9, 4, 0, 6], "Chunks have invalid new line count"
51+
assert token_lens == [236, 74, 243, 278, 240, 225, 247, 136, 245, 67, 81], "Chunks have invalid token length"
52+
assert char_lens == [757, 235, 839, 917, 776, 699, 797, 447, 787, 227, 220], "Chunks have invalid char length"

wurzel/utils/splitters/semantic_splitter.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -520,14 +520,19 @@ def _handle_parsing_of_children(
520520
remaining_snipped = text_w_prev_child
521521
elif self._is_within_targetlen_w_buffer(text_w_prev_child):
522522
child["text"] = text_w_prev_child
523+
524+
# Make sure text is within token limit
525+
limited_child_text = self._cut_to_tokenlen(child["text"], self.token_limit)
526+
527+
# Build document from text and child metadata
523528
return_doc += [
524529
MarkdownDataContract(
525-
md=self._cut_to_tokenlen(child["text"], self.token_limit),
530+
md=limited_child_text,
526531
url=child["metadata"]["url"],
527532
keywords=child["metadata"]["keywords"],
528533
metadata={
529-
"token_len": self.token_limit,
530-
"char_len": len(child["text"]),
534+
"token_len": self._get_token_len(limited_child_text),
535+
"char_len": len(limited_child_text),
531536
},
532537
)
533538
]
@@ -583,7 +588,7 @@ def _md_data_from_dict_cut(self, doc: DocumentNode) -> MarkdownDataContract:
583588
url=doc["metadata"]["url"],
584589
keywords=doc["metadata"]["keywords"],
585590
metadata={
586-
"token_len": self.token_limit,
591+
"token_len": self._get_token_len(text),
587592
"char_len": len(text),
588593
},
589594
)
@@ -677,14 +682,15 @@ def _parse_hierarchical(
677682

678683
# add potential short remaining spillovers
679684
if self._get_token_len(remaining_snipped) >= self.token_limit_min:
685+
limited_remaining_snipped = self._cut_to_tokenlen(remaining_snipped, self.token_limit)
680686
return_doc += [
681687
MarkdownDataContract(
682-
md=self._cut_to_tokenlen(remaining_snipped, self.token_limit),
688+
md=limited_remaining_snipped,
683689
url=doc["metadata"]["url"],
684690
keywords=doc["metadata"]["keywords"],
685691
metadata={
686-
"token_len": self.token_limit,
687-
"char_len": len(remaining_snipped),
692+
"token_len": self._get_token_len(limited_remaining_snipped),
693+
"char_len": len(limited_remaining_snipped),
688694
},
689695
)
690696
]

0 commit comments

Comments
 (0)