cocoindex-io
diff --git a/‎docs/docs/getting_started/overview.md‎
Lines changed: 24 additions & 4 deletions b/‎docs/docs/getting_started/overview.md‎
Lines changed: 24 additions & 4 deletions
diff --git a/‎docs/docs/ops/functions.md‎
Lines changed: 32 additions & 3 deletions b/‎docs/docs/ops/functions.md‎
Lines changed: 32 additions & 3 deletions
diff --git a/‎docs/docs/ops/storages.md‎
Lines changed: 52 additions & 40 deletions b/‎docs/docs/ops/storages.md‎
Lines changed: 52 additions & 40 deletions
diff --git a/‎docs/docusaurus.config.ts‎
Lines changed: 5 additions & 0 deletions b/‎docs/docusaurus.config.ts‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎examples/amazon_s3_embedding/main.py‎
Lines changed: 14 additions & 13 deletions b/‎examples/amazon_s3_embedding/main.py‎
Lines changed: 14 additions & 13 deletions
diff --git a/‎examples/code_embedding/main.py‎
Lines changed: 2 additions & 1 deletion b/‎examples/code_embedding/main.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/docs_to_knowledge_graph/README.md‎
Lines changed: 7 additions & 10 deletions b/‎examples/docs_to_knowledge_graph/README.md‎
Lines changed: 7 additions & 10 deletions
diff --git a/‎examples/docs_to_knowledge_graph/main.py‎
Lines changed: 26 additions & 23 deletions b/‎examples/docs_to_knowledge_graph/main.py‎
Lines changed: 26 additions & 23 deletions
@@ -5,10 +5,30 @@ slug: /
 
 # Welcome to CocoIndex
 
-Prepare high quality data that is tailored for the purpose is essential for a successful AI application in production.   
+CocoIndex is an ultra-performant real-time data transformation framework for AI, with incremental processing. 
 
-CocoIndex is a data indexing platform for AI use cases - semantic search, RAG, agentic workflow on top of embedding / knowledge graph etc.  CocoIndex aims to be the best in class scalable data indexing infrastructure with built in observability and lineage. 
+As a data framework, CocoIndex takes data freshness to the next level. **Incremental processing** is one of the core values provided by CocoIndex.
 
-CocoIndex can help you connecting to all the data sources, identify the best indexing strategy and setup the most robust pipeline -  chunking, embedding model, deduping/reconciling, vector stores, knowledge graph etc.  And then providing standard API to access the index.
+## Programming Model
+CocoIndex follows the [Dataflow programming](https://en.wikipedia.org/wiki/Dataflow_programming) model. Each transformation creates a new field solely based on input fields, without hidden state or value mutation. All data before/after each transformation is observable, with lineage out of the box.
+
+The gist of an example data transformation:
+```python
+# import
+data['content'] = flow_builder.add_source(...) 
+
+# transform
+data['out'] = data['content'] 
+    .transform(...)
+    .transform(...)
+
+# collect data
+collector.collect(...)
+
+# export to db, vector db, graph db ...
+collector.export(...)
+```
+
+Get Started:
+- [Quick Start](https://cocoindex.io/docs/getting_started/quickstart)
 
-CocoIndex does all the heavy lifting work and plumbing for the data, so you can focus on your business logic and build your AI application on top of robust data indices.
 
@@ -26,11 +26,40 @@ Input data:
 
 *   `text` (type: `str`, required): The text to split.
 *   `chunk_size` (type: `int`, required): The maximum size of each chunk, in bytes.
+*   `min_chunk_size` (type: `int`, optional): The minimum size of each chunk, in bytes. If not provided, it defaults to `chunk_size / 2`.
+
+    :::note
+
+    `SplitRecursively` will do its best to make the output chunks sized between `min_chunk_size` and `chunk_size`.
+    However, it's possible that some chunks are smaller than `min_chunk_size` or larger than `chunk_size` in rare cases, e.g. too short input text, or non-splittable large text.
+
+    Please avoid setting `min_chunk_size` to a value too close to `chunk_size`, to leave more room for the function to plan the optimal chunking.
+
+    :::
+
 *   `chunk_overlap` (type: `int`, optional): The maximum overlap size between adjacent chunks, in bytes.
 *   `language` (type: `str`, optional): The language of the document.
-    Can be a langauge name (e.g. `Python`, `Javascript`, `Markdown`) or a file extension (e.g. `.py`, `.js`, `.md`).
-    To see all supported language names and extensions, see [the code](https://github.com/search?q=org%3Acocoindex-io+lang%3Arust++%22static+TREE_SITTER_LANGUAGE_BY_LANG%22&type=code).
-    If it's unspecified or the specified language is not supported, it will be treated as plain text.
+    Can be a language name (e.g. `Python`, `Javascript`, `Markdown`) or a file extension (e.g. `.py`, `.js`, `.md`).
+
+*   `custom_languages` (type: `list[CustomLanguageSpec]`, optional): This allows you to customize how specific languages are chunked, using regular expressions. Each `CustomLanguageSpec` is a dict with the following fields:
+    *   `language_name` (type: `str`, required): Name of the language.
+    *   `aliases` (type: `list[str]`, optional): A list of aliases for the language.
+        It's an error if any language name or alias is duplicated.
+
+    *   `separators_regex` (type: `list[str]`, required): A list of regex patterns to split the text.
+        Higher-level boundaries should come first, and lower-level should be listed later. e.g. `[r"\n# ", r"\n## ", r"\n\n", r"\. "]`.
+        See [regex Syntax](https://docs.rs/regex/latest/regex/#syntax) for supported regular expression syntax.
+
+    :::note
+
+    We use the `language` field to determine how to split the input text, following these rules:
+    
+    *   We'll match the input `language` field against the `language_name` or `aliases` of each custom language specification, and use the matched one. If the value of `language` is null, it'll be treated as an empty string when matching `language_name` or `aliases`.
+    *   If no match is found, we'll match the `language` field against the builtin language configurations.
+        For all supported builtin language names and aliases (extensions), see [the code](https://github.com/search?q=org%3Acocoindex-io+lang%3Arust++%22static+TREE_SITTER_LANGUAGE_BY_LANG%22&type=code).
+    *   If no match is found, the input will be treated as plain text.
+
+    :::
 
 Return type: [KTable](/docs/core/data_types#ktable), each row represents a chunk, with the following sub fields:
 
 
@@ -54,34 +54,21 @@ Here's how CocoIndex data elements map to Qdrant elements during export:
 |-------------------|------------------|
 | an export target  | a unique collection |  
 | a collected row   | a point |
-| a field           | a named vector (for fields with vector type); a field within payload (otherwise) |
+| a field           | a named vector, if it fits into a Qdrant vector; a field within the payload otherwise |
+
+A vector of `Float32`, `Float64` or `Int64` type with a fixed dimension fits into a Qdrant vector.
 
 #### Spec
 
 The spec takes the following fields:
 
-*   `collection_name` (type: `str`, required): The name of the collection to export the data to.
-
-*   `grpc_url` (type: `str`, optional): The [gRPC URL](https://qdrant.tech/documentation/interfaces/#grpc-interface) of the Qdrant instance. Defaults to `http://localhost:6334/`.
-
-*   `api_key` (type: `str`, optional). API key to authenticate requests with.
+*   `connection` (type: [auth reference](../core/flow_def#auth-registry) to `QdrantConnection`, optional): The connection to the Qdrant instance. `QdrantConnection` has the following fields:
+    *   `grpc_url` (type: `str`): The [gRPC URL](https://qdrant.tech/documentation/interfaces/#grpc-interface) of the Qdrant instance, e.g. `http://localhost:6334/`.
+    *   `api_key` (type: `str`, optional): API key to authenticate requests with.
 
-Before exporting, you must create a collection with a [vector name](https://qdrant.tech/documentation/concepts/vectors/#named-vectors) that matches the vector field name in CocoIndex, and set `setup_by_user=True` during export.
+    If `connection` is not provided, the local Qdrant instance at `http://localhost:6334/` will be used by default.
 
-Example:
-
-```python
-doc_embeddings.export(
-    "doc_embeddings",
-    cocoindex.storages.Qdrant(
-        collection_name="cocoindex",
-        grpc_url="https://xyz-example.cloud-region.cloud-provider.cloud.qdrant.io:6334/",
-        api_key="<your-api-key-here>",
-    ),
-    primary_key_fields=["id_field"],
-    setup_by_user=True,
-)
-```
+*   `collection_name` (type: `str`, required): The name of the collection to export the data to.
 
 You can find an end-to-end example [here](https://github.com/cocoindex-io/cocoindex/tree/main/examples/text_embedding_qdrant).
 
@@ -399,19 +386,7 @@ You can find end-to-end examples fitting into any of supported property graphs i
 
 ### Neo4j
 
-If you don't have a Neo4j database, you can start a Neo4j database using our docker compose config:
-
-```bash
-docker compose -f <(curl -L https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/dev/neo4j.yaml) up -d
-```
-
-:::warning
-
-The docker compose config above will start a Neo4j Enterprise instance under the [Evaluation License](https://neo4j.com/terms/enterprise_us/),
-with 30 days trial period.
-Please read and agree the license before starting the instance.
-
-:::
+#### Spec
 
 The `Neo4j` target spec takes the following fields:
 
@@ -430,17 +405,32 @@ Neo4j also provides a declaration spec `Neo4jDeclaration`, to configure indexing
     *   `primary_key_fields` (required)
     *   `vector_indexes` (optional)
 
-### Kuzu
+#### Neo4j dev instance
 
-CocoIndex supports talking to Kuzu through its [API server](https://github.com/kuzudb/api-server).
-You can bring up a Kuzu API server locally by running:
+If you don't have a Neo4j database, you can start a Neo4j database using our docker compose config:
 
 ```bash
-KUZU_DB_DIR=$HOME/.kuzudb
-KUZU_PORT=8123
-docker run -d --name kuzu -p ${KUZU_PORT}:8000 -v ${KUZU_DB_DIR}:/database kuzudb/api-server:latest
+docker compose -f <(curl -L https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/dev/neo4j.yaml) up -d
 ```
 
+It will bring up a Neo4j instance, which can be accessed with username `neo4j` and password `cocoindex`.
+You can access the Neo4j browser at [http://localhost:7474](http://localhost:7474).
+
+:::warning
+
+The docker compose config above will start a Neo4j Enterprise instance under the [Evaluation License](https://neo4j.com/terms/enterprise_us/),
+with 30 days trial period.
+Please read and agree to the license before starting the instance.
+
+:::
+
+
+### Kuzu
+
+#### Spec
+
+CocoIndex supports talking to Kuzu through its [API server](https://github.com/kuzudb/api-server).
+
 The `Kuzu` target spec takes the following fields:
 
 *   `connection` (type: [auth reference](../core/flow_def#auth-registry) to `KuzuConnectionSpec`): The connection to the Kuzu database. `KuzuConnectionSpec` has the following fields:
@@ -453,3 +443,25 @@ Kuzu also provides a declaration spec `KuzuDeclaration`, to configure indexing o
 *   Fields for [nodes to declare](#declare-extra-node-labels), including
     *   `nodes_label` (required)
     *   `primary_key_fields` (required)
+
+#### Kuzu dev instance
+
+If you don't have a Kuzu instance yet, you can bring up a Kuzu API server locally by running:
+
+```bash
+KUZU_DB_DIR=$HOME/.kuzudb
+KUZU_PORT=8123
+docker run -d --name kuzu -p ${KUZU_PORT}:8000 -v ${KUZU_DB_DIR}:/database kuzudb/api-server:latest
+```
+
+To explore the graph you built with Kuzu, you can use the [Kuzu Explorer](https://github.com/kuzudb/explorer).
+Currently, the Kuzu API server and the explorer cannot run at the same time, so you need to stop the API server before running the explorer.
+
+To start the instance of the explorer, run:
+
+```bash
+KUZU_EXPLORER_PORT=8124
+docker run -d --name kuzu-explorer -p ${KUZU_EXPLORER_PORT}:8000  -v ${KUZU_DB_DIR}:/database -e MODE=READ_ONLY  kuzudb/explorer:latest
+```
+
+You can then access the explorer at [http://localhost:8124](http://localhost:8124).
@@ -86,6 +86,11 @@ const config: Config = {
     // Replace with your project's social card
     image: 'img/social-card.jpg',
     metadata: [{ name: 'description', content: 'Official documentation for CocoIndex - Learn how to use CocoIndex to build robust data indexing pipelines for AI applications. Comprehensive guides, API references, and best practices for implementing efficient data processing workflows.' }],
+    colorMode: {
+      defaultMode: 'light',
+      disableSwitch: false,
+      respectPrefersColorScheme: true,
+    },
     navbar: {
       title: 'CocoIndex',
       logo: {
 
@@ -98,19 +98,20 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
 def _main():
     # Initialize the database connection pool.
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
-    # Run queries in a loop to demonstrate the query capabilities.
-    while True:
-        query = input("Enter search query (or Enter to quit): ")
-        if query == "":
-            break
-        # Run the query function with the database connection pool and the query.
-        results = search(pool, query)
-        print("\nSearch results:")
-        for result in results:
-            print(f"[{result['score']:.3f}] {result['filename']}")
-            print(f"    {result['text']}")
-            print("---")
-        print()
+    with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
+        # Run queries in a loop to demonstrate the query capabilities.
+        while True:
+            query = input("Enter search query (or Enter to quit): ")
+            if query == "":
+                break
+            # Run the query function with the database connection pool and the query.
+            results = search(pool, query)
+            print("\nSearch results:")
+            for result in results:
+                print(f"[{result['score']:.3f}] {result['filename']}")
+                print(f"    {result['text']}")
+                print("---")
+            print()
 
 
 if __name__ == "__main__":
 
@@ -27,7 +27,7 @@ def code_to_embedding(
 @cocoindex.flow_def(name="CodeEmbedding")
 def code_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds files into a vector database.
     """
@@ -46,6 +46,7 @@ def code_embedding_flow(
             cocoindex.functions.SplitRecursively(),
             language=file["extension"],
             chunk_size=1000,
+            min_chunk_size=300,
             chunk_overlap=300,
         )
         with file["chunks"].row() as chunk:
 
@@ -12,10 +12,10 @@ Please drop [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a s
 
 ![example-explanation](https://github.com/user-attachments/assets/07ddbd60-106f-427f-b7cc-16b73b142d27)
 
-
 ## Prerequisite
 *   [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
-*   [Install Neo4j](https://cocoindex.io/docs/ops/storages#neo4j) if you don't have one.
+*   Install [Neo4j](https://cocoindex.io/docs/ops/storages#neo4j-dev-instance) or [Kuzu](https://cocoindex.io/docs/ops/storages#kuzu-dev-instance) if you don't have one.
+    *   The example uses Neo4j by default for now. If you want to use Kuzu, find the "SELECT ONE GRAPH DATABASE TO USE" section in `main.py` and switch the active branch.
 *   [Configure your OpenAI API key](https://cocoindex.io/docs/ai/llm#openai).
 
 ## Documentation
@@ -45,21 +45,18 @@ cocoindex update main.py
 
 ### Browse the knowledge graph
 
-After the knowledge graph is build, you can explore the knowledge graph you built in Neo4j Browser.
+After the knowledge graph is built, you can explore the knowledge graph.
 
-For the dev enviroment, you can connect neo4j browser using credentials:
-- username: `neo4j`
-- password: `cocoindex`
-which is pre-configured in the our docker compose [config.yaml](https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/dev/neo4j.yaml).
+* If you're using Neo4j, you can open the explorer at [http://localhost:7474](http://localhost:7474), with username `neo4j` and password `cocoindex`.
+* If you're using Kuzu, you can start a Kuzu explorer locally. See [Kuzu dev instance](https://cocoindex.io/docs/ops/storages#kuzu-dev-instance) for more details.
 
-You can open it at [http://localhost:7474](http://localhost:7474), and run the following Cypher query to get all relationships:
+You can run the following Cypher query to get all relationships:
 
 ```cypher
 MATCH p=()-->() RETURN p
 ```
-<img width="1366" alt="neo4j-for-coco-docs" src="https://github.com/user-attachments/assets/3c8b6329-6fee-4533-9480-571399b57e57" />
-
 
+<img width="1366" alt="neo4j-for-coco-docs" src="https://github.com/user-attachments/assets/3c8b6329-6fee-4533-9480-571399b57e57" />
 
 ## CocoInsight 
 I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. 
 
@@ -5,27 +5,6 @@
 import dataclasses
 import cocoindex
 
-
-@dataclasses.dataclass
-class DocumentSummary:
-    """Describe a summary of a document."""
-
-    title: str
-    summary: str
-
-
-@dataclasses.dataclass
-class Relationship:
-    """
-    Describe a relationship between two entities.
-    Subject and object should be Core CocoIndex concepts only, should be nouns. For example, `CocoIndex`, `Incremental Processing`, `ETL`,  `Data` etc.
-    """
-
-    subject: str
-    predicate: str
-    object: str
-
-
 neo4j_conn_spec = cocoindex.add_auth_entry(
     "Neo4jConnection",
     cocoindex.storages.Neo4jConnection(
@@ -41,19 +20,43 @@ class Relationship:
     ),
 )
 
-# Use Neo4j as the graph database
+# SELECT ONE GRAPH DATABASE TO USE
+# This example can use either Neo4j or Kuzu as the graph database.
+# Please make sure only one branch is live and others are commented out.
+
+# Use Neo4j
 GraphDbSpec = cocoindex.storages.Neo4j
 GraphDbConnection = cocoindex.storages.Neo4jConnection
 GraphDbDeclaration = cocoindex.storages.Neo4jDeclaration
 conn_spec = neo4j_conn_spec
 
-# Use Kuzu as the graph database
+# Use Kuzu
 #  GraphDbSpec = cocoindex.storages.Kuzu
 #  GraphDbConnection = cocoindex.storages.KuzuConnection
 #  GraphDbDeclaration = cocoindex.storages.KuzuDeclaration
 #  conn_spec = kuzu_conn_spec
 
 
+@dataclasses.dataclass
+class DocumentSummary:
+    """Describe a summary of a document."""
+
+    title: str
+    summary: str
+
+
+@dataclasses.dataclass
+class Relationship:
+    """
+    Describe a relationship between two entities.
+    Subject and object should be Core CocoIndex concepts only, should be nouns. For example, `CocoIndex`, `Incremental Processing`, `ETL`,  `Data` etc.
+    """
+
+    subject: str
+    predicate: str
+    object: str
+
+
 @cocoindex.flow_def(name="DocsToKG")
 def docs_to_kg_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope