diff --git a/README.md b/README.md index 5af880307..cd7431d11 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind # Split the document into chunks, put into `chunks` field doc["chunks"] = doc["content"].transform( cocoindex.functions.SplitRecursively(), - language="markdown", chunk_size=300, chunk_overlap=100) + language="markdown", chunk_size=2000, chunk_overlap=500) # Transform data of each chunk with doc["chunks"].row() as chunk: diff --git a/docs/docs/getting_started/quickstart.md b/docs/docs/getting_started/quickstart.md index 77792cc10..486763996 100644 --- a/docs/docs/getting_started/quickstart.md +++ b/docs/docs/getting_started/quickstart.md @@ -79,7 +79,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind # Split the document into chunks, put into `chunks` field doc["chunks"] = doc["content"].transform( cocoindex.functions.SplitRecursively(), - language="markdown", chunk_size=300, chunk_overlap=100) + language="markdown", chunk_size=2000, chunk_overlap=500) # Transform data of each chunk with doc["chunks"].row() as chunk: diff --git a/examples/code_embedding/code_embedding.py b/examples/code_embedding/code_embedding.py index 80b4b288b..61ceea9e8 100644 --- a/examples/code_embedding/code_embedding.py +++ b/examples/code_embedding/code_embedding.py @@ -22,7 +22,7 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind with data_scope["files"].row() as file: file["chunks"] = file["content"].transform( cocoindex.functions.SplitRecursively(), - language="javascript", chunk_size=300, chunk_overlap=100) + language="python", chunk_size=2000, chunk_overlap=500) with file["chunks"].row() as chunk: chunk["embedding"] = chunk["text"].call(code_to_embedding) code_embeddings.collect(filename=file["filename"], location=chunk["location"], diff --git a/examples/pdf_embedding/pdf_embedding.py b/examples/pdf_embedding/pdf_embedding.py index ae0833aee..0f7994ba1 100644 --- a/examples/pdf_embedding/pdf_embedding.py +++ b/examples/pdf_embedding/pdf_embedding.py @@ -51,7 +51,7 @@ def pdf_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoinde doc["markdown"] = doc["content"].transform(PdfToMarkdown()) doc["chunks"] = doc["markdown"].transform( cocoindex.functions.SplitRecursively(), - language="markdown", chunk_size=300, chunk_overlap=100) + language="markdown", chunk_size=2000, chunk_overlap=500) with doc["chunks"].row() as chunk: chunk["embedding"] = chunk["text"].call(text_to_embedding) diff --git a/examples/text_embedding/text_embedding.py b/examples/text_embedding/text_embedding.py index 7a05bbcf1..70b3807c5 100644 --- a/examples/text_embedding/text_embedding.py +++ b/examples/text_embedding/text_embedding.py @@ -24,7 +24,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind with data_scope["documents"].row() as doc: doc["chunks"] = doc["content"].transform( cocoindex.functions.SplitRecursively(), - language="markdown", chunk_size=300, chunk_overlap=100) + language="markdown", chunk_size=2000, chunk_overlap=500) with doc["chunks"].row() as chunk: chunk["embedding"] = text_to_embedding(chunk["text"])