
Commit 385492b

Update CHANGELOG, examples and documentation
1 parent 2d3b4fd commit 385492b

File tree

8 files changed (+12, -7 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@

 ### Changed
 - Updated LLM implementations to handle message history consistently across providers.
 - The `id_prefix` parameter in the `LexicalGraphConfig` is deprecated.
+- Changed the default behaviour of `FixedSizeSplitter` to avoid cutting off words in chunks whenever possible.

 ### Fixed
 - IDs for the Document and Chunk nodes in the lexical graph are now randomly generated and unique across multiple runs, fixing issues in the lexical graph where relationships were created between chunks coming from different pipeline runs.
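
To illustrate what the new default means in practice, here is a minimal sketch of the idea (not the FixedSizeSplitter implementation): each chunk end is moved back to the previous whitespace when one exists, so words are kept whole. The real splitter also supports chunk_overlap, which the sketch omits for brevity.

# Illustrative sketch only, not the library's implementation: move each
# chunk end back to the last whitespace so words are kept whole whenever
# such a boundary exists within the chunk.
def split_on_word_boundaries(text: str, chunk_size: int) -> list[str]:
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        # if the cut would fall inside a word, move it back to the last space
        if end < len(text) and not text[end].isspace():
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space
        chunks.append(text[start:end].strip())
        start = end
    return chunks


print(split_on_word_boundaries("Hello World. Life is beautiful.", chunk_size=12))
# ['Hello World.', 'Life is', 'beautiful.']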

docs/source/user_guide_kg_builder.rst

Lines changed: 4 additions & 1 deletion
@@ -581,9 +581,12 @@ that can be processed within the LLM token limits:

     from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter

-    splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200)
+    splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False)
     splitter.run(text="Hello World. Life is beautiful.")

+.. note::
+
+    The `approximate` flag is set to True by default to ensure clean chunk starts and ends (i.e. to avoid cutting words in the middle) whenever possible.

 Wrappers for LangChain and LlamaIndex text splitters are included in this package:
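
For completeness, a short usage sketch comparing both settings of the flag. It assumes the returned TextChunks object exposes a chunks list whose items carry a text attribute, as in the library's data types; adjust if your version differs.

import asyncio

from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
    FixedSizeSplitter,
)


async def main() -> None:
    text = "Hello World. Life is beautiful."
    # default: approximate=True, boundaries are adjusted to avoid cutting words
    approx = FixedSizeSplitter(chunk_size=10, chunk_overlap=2)
    # strict: chunks are exactly chunk_size characters, words may be cut
    strict = FixedSizeSplitter(chunk_size=10, chunk_overlap=2, approximate=False)
    for splitter in (approx, strict):
        result = await splitter.run(text=text)
        # TextChunks.chunks / TextChunk.text assumed per the library's types
        print([chunk.text for chunk in result.chunks])


if __name__ == "__main__":
    asyncio.run(main())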

examples/customize/build_graph/components/splitters/fixed_size_splitter.py

Lines changed: 2 additions & 1 deletion
@@ -6,9 +6,10 @@

 async def main() -> TextChunks:
     splitter = FixedSizeSplitter(
-        # optionally, configure chunk_size and chunk_overlap
+        # optionally, configure chunk_size, chunk_overlap, and the approximate flag
         # chunk_size=4000,
         # chunk_overlap=200,
+        # approximate=False,
     )
     chunks = await splitter.run(text="text to split")
     return chunks

examples/customize/build_graph/pipeline/kg_builder_from_pdf.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ async def define_and_run_pipeline(
     pipe = Pipeline()
     pipe.add_component(PdfLoader(), "pdf_loader")
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200), "splitter"
+        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False), "splitter"
     )
     pipe.add_component(SchemaBuilder(), "schema")
     pipe.add_component(

examples/customize/build_graph/pipeline/kg_builder_from_text.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ async def define_and_run_pipeline(
     # define the components
     pipe.add_component(
         # chunk_size=50 for the sake of this demo
-        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200),
+        FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

examples/customize/build_graph/pipeline/lexical_graph_builder_from_text.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult:
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=20, chunk_overlap=1),
+        FixedSizeSplitter(chunk_size=20, chunk_overlap=1, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

examples/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_single_pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ async def define_and_run_pipeline(
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=200, chunk_overlap=50),
+        FixedSizeSplitter(chunk_size=200, chunk_overlap=50, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")

examples/customize/build_graph/pipeline/text_to_lexical_graph_to_entity_graph_two_pipelines.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ async def build_lexical_graph(
     pipe = Pipeline()
     # define the components
     pipe.add_component(
-        FixedSizeSplitter(chunk_size=200, chunk_overlap=50),
+        FixedSizeSplitter(chunk_size=200, chunk_overlap=50, approximate=False),
         "splitter",
     )
     pipe.add_component(TextChunkEmbedder(embedder=OpenAIEmbeddings()), "chunk_embedder")
