
Commit 385492b

Update CHANGELOG, examples and documentation
1 parent 2d3b4fd commit 385492b

File tree

8 files changed (+12, -7 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@

 ### Changed
 - Updated LLM implementations to handle message history consistently across providers.
 - The `id_prefix` parameter in the `LexicalGraphConfig` is deprecated.
+- Changed the default behaviour of `FixedSizeSplitter` to avoid cutting off words in chunks whenever possible.

 ### Fixed
 - IDs for the Document and Chunk nodes in the lexical graph are now randomly generated and unique across multiple runs, fixing issues in the lexical graph where relationships were created between chunks coming from different pipeline runs.
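
To illustrate what the new default means in practice, here is a minimal sketch of the idea (not the FixedSizeSplitter implementation): each chunk end is moved back to the previous whitespace when one exists, so words are kept whole. The real splitter also supports chunk_overlap, which the sketch omits for brevity.

# Illustrative sketch only, not the library's implementation: move each
# chunk end back to the last whitespace so words are kept whole whenever
# such a boundary exists within the chunk.
def split_on_word_boundaries(text: str, chunk_size: int) -> list[str]:
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        # if the cut would fall inside a word, move it back to the last space
        if end < len(text) and not text[end].isspace():
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space
        chunks.append(text[start:end].strip())
        start = end
    return chunks


print(split_on_word_boundaries("Hello World. Life is beautiful.", chunk_size=12))
# ['Hello World.', 'Life is', 'beautiful.']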

docs/source/user_guide_kg_builder.rst

Lines changed: 4 additions & 1 deletion
@@ -581,9 +581,12 @@ that can be processed within the LLM token limits:

     from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter

-    splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200)
+    splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False)
     splitter.run(text="Hello World. Life is beautiful.")

+.. note::
+
+    The `approximate` flag is set to True by default to ensure clean chunk starts and ends (i.e. to avoid cutting words in the middle) whenever possible.

 Wrappers for LangChain and LlamaIndex text splitters are included in this package:
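
For completeness, a short usage sketch comparing both settings of the flag. It assumes the returned TextChunks object exposes a chunks list whose items carry a text attribute, as in the library's data types; adjust if your version differs.

import asyncio

from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
    FixedSizeSplitter,
)


async def main() -> None:
    text = "Hello World. Life is beautiful."
    # default: approximate=True, boundaries are adjusted to avoid cutting words
    approx = FixedSizeSplitter(chunk_size=10, chunk_overlap=2)
    # strict: chunks are exactly chunk_size characters, words may be cut
    strict = FixedSizeSplitter(chunk_size=10, chunk_overlap=2, approximate=False)
    for splitter in (approx, strict):
        result = await splitter.run(text=text)
        # TextChunks.chunks / TextChunk.text assumed per the library's types
        print([chunk.text for chunk in result.chunks])


if __name__ == "__main__":
    asyncio.run(main())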

examples/customize/build_graph/components/splitters/fixed_size_splitter.py

Lines changed: 2 additions & 1 deletion
@@ -6,9 +6,10 @@

 async def main() -> TextChunks:
     splitter = FixedSizeSplitter(
-        # optionally, configure chunk_size and chunk_overlap
+        # optionally, configure chunk_size, chunk_overlap, and the approximate flag
         # chunk_size=4000,
         # chunk_overlap=200,
+        # approximate=False,
     )
     chunks = await splitter.run(text="text to split")
     return chunks

examples/customize/build_graph/pipeline/kg_builder_from_pdf.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ async def define_and_run_pipeline(
     pipe = Pipeline()
     pipe.add_component(PdfLoader(), "pdf_loader")
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200), "splitter"
+        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False), "splitter"
     )
     pipe.add_component(SchemaBuilder(), "schema")
     pipe.add_component(

examples/customize/build_graph/pipeline/kg_builder_from_text.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ async def define_and_run_pipeline(
     # define the components
     pipe.add_component(
         # chunk_size=50 for the sake of this demo
-        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200),
+        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

examples/customize/build_graph/pipeline/lexical_graph_builder_from_text.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=20, chunk_overlap=1),
+        FixedSizeSplitter(chunk_size=20, chunk_overlap=1, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

examples/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_single_pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ async def define_and_run_pipeline(
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=200, chunk_overlap=50),
+        FixedSizeSplitter(chunk_size=200, chunk_overlap=50, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

examples/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_two_pipelines.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ async def build_lexical_graph(
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=200, chunk_overlap=50),
+        FixedSizeSplitter(chunk_size=200, chunk_overlap=50, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")
