From 556789f1708dd80757bf180b0685040d3d9d1e30 Mon Sep 17 00:00:00 2001
From: lemorage
Date: Sat, 21 Jun 2025 03:11:49 +0200
Subject: [PATCH 1/4] ops: add pre-commit and hooks for checking
---
.pre-commit-config.yaml | 70 +++++++++++++++++++++++++++++++++++++++++
pyproject.toml | 3 +-
2 files changed, 72 insertions(+), 1 deletion(-)
create mode 100644 .pre-commit-config.yaml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..ccb5fbc6e
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,70 @@
+ci:  # Settings consumed by the hosted pre-commit.ci service.
+ autofix_prs: false  # Do not let pre-commit.ci push auto-fix commits to pull requests.
+ autoupdate_schedule: 'monthly'  # How often pre-commit.ci proposes hook version bumps.
+
+repos:  # Hook repositories and the hooks enabled from each of them.
+ - repo: https://github.com/pre-commit/pre-commit-hooks  # General file hygiene hooks.
+ rev: v5.0.0
+ hooks:
+ - id: check-case-conflict
+ # Checks for files whose names would conflict on a case-insensitive
+ # filesystem such as macOS HFS+ or Windows FAT.
+ - id: check-merge-conflict
+ # Checks for files that contain merge conflict strings.
+ - id: check-symlinks
+ # Checks for symlinks that do not point to anything.
+ exclude: ".*(.github.*)$"  # NOTE(review): the "." before "github" is unescaped and matches any character; confirm intent.
+ - id: detect-private-key
+ # Checks for the existence of private keys.
+ - id: end-of-file-fixer
+ # Makes sure files end in a newline and only a newline.
+ exclude: ".*(data.*|extern.*|licenses.*|_static.*|\\.ya?ml)$"
+ - id: trailing-whitespace
+ # Trims trailing whitespace.
+ exclude_types: [python]
+ exclude: ".*(data.*|extern.*|licenses.*|_static.*|\\.ya?ml)$"
+
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.12.0
+ hooks:
+ - id: ruff-format  # Ruff's formatter, restricted to Python files below.
+ types: [python]
+ pass_filenames: true  # Explicit: the matching file names are handed to the formatter.
+
+ - repo: local  # Hooks defined in this file instead of being fetched from a remote repository.
+ hooks:
+ - id: pytest  # Python test suite.
+ name: pytest test
+ entry: pytest
+ language: system  # Uses the executable already installed in the environment.
+ types: [python]  # Triggered only when Python files are part of the run.
+ pass_filenames: false  # Invoke pytest without the changed file names.
+ always_run: false  # Explicit: skipped when no Python file matches.
+ args: ["-v", "--strict-markers"]
+
+ - id: mypy-check  # No arguments: mypy is expected to take its targets from [tool.mypy] in pyproject.toml.
+ name: mypy type check
+ entry: mypy
+ language: system
+ types: [python]
+ pass_filenames: false
+
+ - id: maturin-develop  # Presumably rebuilds the Rust extension for the Python package; triggered by Rust files.
+ name: maturin develop
+ entry: maturin develop
+ language: system
+ types: [rust]
+ pass_filenames: false
+
+ - id: cargo-fmt  # Rust source formatting; triggered by Rust files.
+ name: cargo fmt
+ entry: cargo fmt
+ language: system
+ types: [rust]
+ pass_filenames: false
+
+ - id: cargo-test  # Rust test suite; triggered by Rust files.
+ name: cargo test
+ entry: cargo test
+ language: system
+ types: [rust]
diff --git a/pyproject.toml b/pyproject.toml
index a9db65f28..f2c15b106 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,9 +29,10 @@ features = ["pyo3/extension-module"]
[project.optional-dependencies]
test = ["pytest"]
-dev = ["ruff"]
+dev = ["ruff", "pre-commit"]
[tool.mypy]
python_version = "3.11"
strict = true
files = "python/cocoindex"
+exclude = "(\\.venv|site-packages)"
From 973c5043bba7eb79b75b2f68aad89f83c1b6daac Mon Sep 17 00:00:00 2001
From: lemorage
Date: Sat, 21 Jun 2025 06:27:17 +0200
Subject: [PATCH 2/4] chore: pass pre-commit checks
---
.../\360\237\222\241-feature-request.md" | 2 +-
.github/scripts/update_version.sh | 2 +-
.pre-commit-config.yaml | 7 ++++---
.vscode/settings.json | 2 +-
CONTRIBUTING.md | 2 +-
README.md | 18 +++++++++---------
check.sh | 12 ------------
docs/docs/ai/llm.mdx | 6 +++---
docs/docs/core/basics.md | 4 ++--
docs/docs/core/cli.mdx | 2 +-
docs/docs/core/custom_function.mdx | 2 +-
docs/docs/core/flow_def.mdx | 2 +-
docs/docs/core/flow_methods.mdx | 6 +++---
docs/docs/core/settings.mdx | 2 +-
docs/docs/getting_started/installation.md | 5 ++---
docs/docs/getting_started/overview.md | 7 +++----
docs/docs/getting_started/quickstart.md | 6 +++---
docs/docs/ops/functions.md | 2 +-
docs/docs/ops/sources.md | 8 ++++----
docs/docs/ops/targets.md | 2 +-
docs/docs/query.mdx | 1 -
docs/sidebars.ts | 2 +-
docs/src/css/custom.css | 14 +++++++-------
docs/src/theme/Root.js | 2 +-
docs/static/robots.txt | 2 +-
examples/amazon_s3_embedding/.env.example | 2 +-
examples/amazon_s3_embedding/.gitignore | 2 +-
examples/amazon_s3_embedding/README.md | 4 ++--
examples/code_embedding/README.md | 13 ++++++-------
examples/docs_to_knowledge_graph/README.md | 10 ++++------
examples/fastapi_server_docker/README.md | 4 ++--
.../files/1810.04805v2.md | 2 +-
.../fastapi_server_docker/requirements.txt | 2 +-
examples/gdrive_text_embedding/.env.example | 4 ++--
examples/gdrive_text_embedding/.gitignore | 2 +-
examples/gdrive_text_embedding/README.md | 8 ++++----
examples/image_search/.env | 2 +-
examples/image_search/README.md | 1 -
examples/image_search/requirements.txt | 2 +-
examples/manuals_llm_extraction/README.md | 2 +-
examples/pdf_embedding/README.md | 2 +-
examples/product_recommendation/.env | 1 -
examples/product_recommendation/README.md | 10 ++++------
.../product_recommendation/products/p1.json | 2 +-
.../product_recommendation/products/p2.json | 2 +-
.../product_recommendation/products/p3.json | 2 +-
.../product_recommendation/products/p4.json | 2 +-
.../product_recommendation/products/p6.json | 2 +-
.../product_recommendation/products/p7.json | 2 +-
.../product_recommendation/products/p8.json | 2 +-
.../product_recommendation/products/p9.json | 2 +-
examples/text_embedding/README.md | 7 +++----
.../markdown_files/1706.03762v7.md | 2 +-
.../markdown_files/1810.04805v2.md | 2 +-
.../text_embedding/markdown_files/rfc8259.md | 2 +-
examples/text_embedding_qdrant/README.md | 8 +++-----
.../markdown_files/rfc8259.md | 2 +-
python/cocoindex/cli.py | 12 ++++++------
python/cocoindex/tests/__init__.py | 1 -
src/execution/db_tracking_setup.rs | 6 +++---
src/llm/litellm.rs | 12 +++++++++---
src/llm/mod.rs | 9 +++------
src/llm/openrouter.rs | 12 +++++++++---
63 files changed, 134 insertions(+), 150 deletions(-)
delete mode 100755 check.sh
diff --git "a/.github/ISSUE_TEMPLATE/\360\237\222\241-feature-request.md" "b/.github/ISSUE_TEMPLATE/\360\237\222\241-feature-request.md"
index 803c8ec3d..1c8bfc736 100644
--- "a/.github/ISSUE_TEMPLATE/\360\237\222\241-feature-request.md"
+++ "b/.github/ISSUE_TEMPLATE/\360\237\222\241-feature-request.md"
@@ -17,4 +17,4 @@ assignees: ''
---
❤️ Contributors, please refer to 📙[Contributing Guide](https://cocoindex.io/docs/about/contributing).
-Unless the PR can be sent immediately (e.g. just a few lines of code), we recommend you to leave a comment on the issue like **`I'm working on it`** or **`Can I work on this issue?`** to avoid duplicating work. Our [Discord server](https://discord.com/invite/zpA9S2DR7s) is always open and friendly.
\ No newline at end of file
+Unless the PR can be sent immediately (e.g. just a few lines of code), we recommend you to leave a comment on the issue like **`I'm working on it`** or **`Can I work on this issue?`** to avoid duplicating work. Our [Discord server](https://discord.com/invite/zpA9S2DR7s) is always open and friendly.
diff --git a/.github/scripts/update_version.sh b/.github/scripts/update_version.sh
index defdd773e..b2e60857d 100755
--- a/.github/scripts/update_version.sh
+++ b/.github/scripts/update_version.sh
@@ -19,4 +19,4 @@ else
fi
# Update Cargo.toml
-sed "${SED_INLINE[@]}" "s/^version = .*/version = \"$VERSION\"/" Cargo.toml
\ No newline at end of file
+sed "${SED_INLINE[@]}" "s/^version = .*/version = \"$VERSION\"/" Cargo.toml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ccb5fbc6e..b0e6c0e8b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,11 +18,11 @@ repos:
# Checks for the existence of private keys.
- id: end-of-file-fixer
# Makes sure files end in a newline and only a newline.
- exclude: ".*(data.*|extern.*|licenses.*|_static.*|\\.ya?ml)$"
+ exclude: ".*(data.*|licenses.*|_static.*|\\.ya?ml|\\.jpe?g|\\.png|\\.svg|\\.webp)$"
- id: trailing-whitespace
# Trims trailing whitespace.
- exclude_types: [python]
- exclude: ".*(data.*|extern.*|licenses.*|_static.*|\\.ya?ml)$"
+ exclude_types: [python] # Covered by Ruff W291.
+ exclude: ".*(data.*|licenses.*|_static.*|\\.ya?ml|\\.jpe?g|\\.png|\\.svg|\\.webp)$"
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.12.0
@@ -68,3 +68,4 @@ repos:
entry: cargo test
language: system
types: [rust]
+ pass_filenames: false
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 34129185f..ee4aa7297 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -6,4 +6,4 @@
],
"editor.formatOnSave": true,
"python.formatting.provider": "ruff"
-}
\ No newline at end of file
+}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c2c1812fd..b89f06d67 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1 +1 @@
-We love contributions from our community ❤️. Please check out our [contributing guide](https://cocoindex.io/docs/about/contributing).
\ No newline at end of file
+We love contributions from our community ❤️. Please check out our [contributing guide](https://cocoindex.io/docs/about/contributing).
diff --git a/README.md b/README.md
index 444f45005..5a7c4f1bd 100644
--- a/README.md
+++ b/README.md
@@ -32,10 +32,10 @@ Unlike a workflow orchestration framework where data is usually opaque, in CocoI
```python
# import
-data['content'] = flow_builder.add_source(...)
+data['content'] = flow_builder.add_source(...)
# transform
-data['out'] = data['content']
+data['out'] = data['content']
.transform(...)
.transform(...)
@@ -56,17 +56,17 @@ As a data framework, CocoIndex takes it to the next level on data freshness. **I
The frameworks takes care of
- Change data capture.
- Figure out what exactly needs to be updated, and only updating that without having to recompute everything.
-
+
This makes it fast to reflect any source updates to the target store. If you have concerns with surfacing stale data to AI agents and are spending lots of efforts working on infra piece to optimize the latency, the framework actually handles it for you.
## Quick Start:
-If you're new to CocoIndex, we recommend checking out
+If you're new to CocoIndex, we recommend checking out
- 📖 [Documentation](https://cocoindex.io/docs)
- ⚡ [Quick Start Guide](https://cocoindex.io/docs/getting_started/quickstart)
-- 🎬 [Quick Start Video Tutorial](https://youtu.be/gv5R8nOXsWU?si=9ioeKYkMEnYevTXT)
+- 🎬 [Quick Start Video Tutorial](https://youtu.be/gv5R8nOXsWU?si=9ioeKYkMEnYevTXT)
-### Setup
+### Setup
1. Install CocoIndex Python library
@@ -136,8 +136,8 @@ It defines an index flow like this:
| [Google Drive Text Embedding](examples/gdrive_text_embedding) | Index text documents from Google Drive |
| [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph |
| [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search |
-| [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup |
-| [Product Recommendation](examples/product_recommendation) | Build real-time product recommendations with LLM and graph database|
+| [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup |
+| [Product Recommendation](examples/product_recommendation) | Build real-time product recommendations with LLM and graph database|
| [Image Search with Vision API](examples/image_search) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend|
More coming and stay tuned 👀!
@@ -159,7 +159,7 @@ Join our community here:
- 📜 [Read our blog posts](https://cocoindex.io/blogs/)
## Support us:
-We are constantly improving, and more features and examples are coming soon. If you love this project, please drop us a star ⭐ at GitHub repo [](https://github.com/cocoindex-io/cocoindex) to stay tuned and help us grow.
+We are constantly improving, and more features and examples are coming soon. If you love this project, please drop us a star ⭐ at GitHub repo [](https://github.com/cocoindex-io/cocoindex) to stay tuned and help us grow.
## License
CocoIndex is Apache 2.0 licensed.
diff --git a/check.sh b/check.sh
deleted file mode 100755
index 6ef77c546..000000000
--- a/check.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash -e
-
-maturin develop
-mypy
-
-cargo test
-pytest
-
-cargo fmt
-ruff format
-
-echo "All checks passed"
\ No newline at end of file
diff --git a/docs/docs/ai/llm.mdx b/docs/docs/ai/llm.mdx
index 0042099cb..8a1e47fe6 100644
--- a/docs/docs/ai/llm.mdx
+++ b/docs/docs/ai/llm.mdx
@@ -136,9 +136,9 @@ pip install 'litellm[proxy]'
**Example for OpenAI:**
```yaml
model_list:
- - model_name: "*"
+ - model_name: "*"
litellm_params:
- model: openai/*
+ model: openai/*
api_key: os.environ/LITELLM_API_KEY
```
@@ -176,7 +176,7 @@ litellm --config config.yml
```python
cocoindex.LlmSpec(
api_type=cocoindex.LlmApiType.LITE_LLM,
- model="deepseek-r1",
+ model="deepseek-r1",
address="http://127.0.0.1:4000", # default url of LiteLLM
)
```
diff --git a/docs/docs/core/basics.md b/docs/docs/core/basics.md
index d2407b50a..4f4129d29 100644
--- a/docs/docs/core/basics.md
+++ b/docs/docs/core/basics.md
@@ -71,7 +71,7 @@ An indexing flow, once set up, maintains a long-lived relationship between data
* **One time update**: Once triggered, CocoIndex updates the target data to reflect the version of source data up to the current moment.
* **Live update**: CocoIndex continuously reacts to changes of source data and updates the target data accordingly, based on various **change capture mechanisms** for the source.
-
+
See more details in the [build / update target data](flow_methods#build--update-target-data) section.
3. CocoIndex intelligently reprocesses to propagate source changes to target by:
@@ -101,4 +101,4 @@ As an indexing flow is long-lived, it needs to store intermediate data to keep t
CocoIndex uses internal storage for this purpose.
Currently, CocoIndex uses Postgres database as the internal storage.
-See [Settings](settings#databaseconnectionspec) for configuring its location, and `cocoindex setup` CLI command (see [CocoIndex CLI](cli)) creates tables for the internal storage.
\ No newline at end of file
+See [Settings](settings#databaseconnectionspec) for configuring its location, and `cocoindex setup` CLI command (see [CocoIndex CLI](cli)) creates tables for the internal storage.
diff --git a/docs/docs/core/cli.mdx b/docs/docs/core/cli.mdx
index 1f83f06eb..1d990e86c 100644
--- a/docs/docs/core/cli.mdx
+++ b/docs/docs/core/cli.mdx
@@ -72,4 +72,4 @@ Use `--help` to see the full list of subcommands, and `subcommand --help` to see
```sh
cocoindex --help # Show all subcommands
cocoindex show --help # Show usage of "show" subcommand
-```
\ No newline at end of file
+```
diff --git a/docs/docs/core/custom_function.mdx b/docs/docs/core/custom_function.mdx
index ead705cef..841ac2fd0 100644
--- a/docs/docs/core/custom_function.mdx
+++ b/docs/docs/core/custom_function.mdx
@@ -133,7 +133,7 @@ The cocoindex repository contains the following examples of custom functions def
* In the [pdf_embedding](https://github.com/cocoindex-io/cocoindex/blob/main/examples/pdf_embedding/main.py) example, we define a custom function `PdfToMarkdown`
* The `SentenceTransformerEmbed` function shipped with the CocoIndex Python package is defined by Python SDK.
Search for [`SentenceTransformerEmbedExecutor`](https://github.com/search?q=repo%3Acocoindex-io%2Fcocoindex+lang%3Apython+SentenceTransformerEmbedExecutor&type=code) to see the code.
-
+
## Parameters for custom functions
Custom functions take the following additional parameters:
diff --git a/docs/docs/core/flow_def.mdx b/docs/docs/core/flow_def.mdx
index 6bbd525da..616b5e3ca 100644
--- a/docs/docs/core/flow_def.mdx
+++ b/docs/docs/core/flow_def.mdx
@@ -56,7 +56,7 @@ A data scope has a bunch of fields and collectors, and users can add new fields
### Get or Add a Field
-You can get or add a field of a data scope (which is a data slice).
+You can get or add a field of a data scope (which is a data slice).
:::note
diff --git a/docs/docs/core/flow_methods.mdx b/docs/docs/core/flow_methods.mdx
index 6bef5c79d..0f401fd2d 100644
--- a/docs/docs/core/flow_methods.mdx
+++ b/docs/docs/core/flow_methods.mdx
@@ -182,10 +182,10 @@ CocoIndex also provides asynchronous versions of APIs for blocking operations, i
my_updater = cocoindex.FlowLiveUpdater(demo_flow)
# Start the updater.
await my_updater.start_async()
-
+
# Perform your own logic (e.g. a query loop).
...
-
+
# Print the update stats.
print(my_updater.update_stats())
# Abort the updater.
@@ -245,4 +245,4 @@ demo_flow.evaluate_and_dump(EvaluateAndDumpOptions(output_dir="./eval_output"))
```
-
\ No newline at end of file
+
diff --git a/docs/docs/core/settings.mdx b/docs/docs/core/settings.mdx
index 30901af66..fb1c7a924 100644
--- a/docs/docs/core/settings.mdx
+++ b/docs/docs/core/settings.mdx
@@ -113,4 +113,4 @@ This is the list of environment variables, each of which has a corresponding fie
| `COCOINDEX_DATABASE_URL` | `database.url` | Yes |
| `COCOINDEX_DATABASE_USER` | `database.user` | No |
| `COCOINDEX_DATABASE_PASSWORD` | `database.password` | No |
-| `COCOINDEX_APP_NAMESPACE` | `app_namespace` | No |
\ No newline at end of file
+| `COCOINDEX_APP_NAMESPACE` | `app_namespace` | No |
diff --git a/docs/docs/getting_started/installation.md b/docs/docs/getting_started/installation.md
index 4a4b3f818..4ef24d693 100644
--- a/docs/docs/getting_started/installation.md
+++ b/docs/docs/getting_started/installation.md
@@ -1,5 +1,5 @@
---
-title: Installation
+title: Installation
description: Setup the CocoIndex environment in 0-3 min
---
@@ -17,7 +17,7 @@ pip install -U cocoindex
## 📦 Install Postgres
-You can skip this step if you already have a Postgres database with pgvector extension installed.
+You can skip this step if you already have a Postgres database with pgvector extension installed.
If you don't have a Postgres database:
@@ -31,4 +31,3 @@ docker compose -f <(curl -L https://raw.githubusercontent.com/cocoindex-io/cocoi
## 🎉 All set!
You can now start using CocoIndex.
-
diff --git a/docs/docs/getting_started/overview.md b/docs/docs/getting_started/overview.md
index 50279c400..3be69e133 100644
--- a/docs/docs/getting_started/overview.md
+++ b/docs/docs/getting_started/overview.md
@@ -5,7 +5,7 @@ slug: /
# Welcome to CocoIndex
-CocoIndex is an ultra-performant real-time data transformation framework for AI, with incremental processing.
+CocoIndex is an ultra-performant real-time data transformation framework for AI, with incremental processing.
As a data framework, CocoIndex takes it to the next level on data freshness. **Incremental processing** is one of the core values provided by CocoIndex.
@@ -17,10 +17,10 @@ CocoIndex follows the idea of [Dataflow programming](https://en.wikipedia.org/wi
The gist of an example data transformation:
```python
# import
-data['content'] = flow_builder.add_source(...)
+data['content'] = flow_builder.add_source(...)
# transform
-data['out'] = data['content']
+data['out'] = data['content']
.transform(...)
.transform(...)
@@ -33,4 +33,3 @@ collector.export(...)
Get Started:
- [Quick Start](https://cocoindex.io/docs/getting_started/quickstart)
-
diff --git a/docs/docs/getting_started/quickstart.md b/docs/docs/getting_started/quickstart.md
index 8ccfc70b9..81dccdd2c 100644
--- a/docs/docs/getting_started/quickstart.md
+++ b/docs/docs/getting_started/quickstart.md
@@ -19,7 +19,7 @@ This guide will help you get up and running with CocoIndex in just a few minutes
We'll need to install a bunch of dependencies for this project.
1. Install CocoIndex:
-
+
```bash
pip install -U cocoindex
```
@@ -149,7 +149,7 @@ documents: 3 added, 0 removed, 0 updated
## Step 4 (optional): Run queries against the index
-CocoIndex excels at transforming your data and storing it (a.k.a. indexing).
+CocoIndex excels at transforming your data and storing it (a.k.a. indexing).
The goal of transforming your data is usually to query against it.
Once you already have your index built, you can directly access the transformed data in the target database.
CocoIndex also provides utilities for you to do this more seamlessly.
@@ -291,4 +291,4 @@ Next, you may want to:
* Learn about [CocoIndex Basics](../core/basics.md).
* Learn about other examples in the [examples](https://github.com/cocoindex-io/cocoindex/tree/main/examples) directory.
* The `text_embedding` example is this quickstart.
- * Pick other examples to learn upon your interest.
\ No newline at end of file
+ * Pick other examples to learn upon your interest.
diff --git a/docs/docs/ops/functions.md b/docs/docs/ops/functions.md
index 4412f02b4..9b583fe71 100644
--- a/docs/docs/ops/functions.md
+++ b/docs/docs/ops/functions.md
@@ -53,7 +53,7 @@ Input data:
:::note
We use the `language` field to determine how to split the input text, following these rules:
-
+
* We'll match the input `language` field against the `language_name` or `aliases` of each custom language specification, and use the matched one. If value of `language` is null, it'll be treated as empty string when matching `language_name` or `aliases`.
* If no match is found, we'll match the `language` field against the builtin language configurations.
For all supported builtin language names and aliases (extensions), see [the code](https://github.com/search?q=org%3Acocoindex-io+lang%3Arust++%22static+TREE_SITTER_LANGUAGE_BY_LANG%22&type=code).
diff --git a/docs/docs/ops/sources.md b/docs/docs/ops/sources.md
index 5e5c58c6d..9cc2faefc 100644
--- a/docs/docs/ops/sources.md
+++ b/docs/docs/ops/sources.md
@@ -22,9 +22,9 @@ The spec takes the following fields:
If not specified, no files will be excluded.
:::info
-
+
`included_patterns` and `excluded_patterns` are using Unix-style glob syntax. See [globset syntax](https://docs.rs/globset/latest/globset/index.html#syntax) for the details.
-
+
:::
### Schema
@@ -131,9 +131,9 @@ The spec takes the following fields:
If not specified, no files will be excluded.
:::info
-
+
`included_patterns` and `excluded_patterns` are using Unix-style glob syntax. See [globset syntax](https://docs.rs/globset/latest/globset/index.html#syntax) for the details.
-
+
:::
* `sqs_queue_url` (type: `str`, optional): if provided, the source will receive change event notifications from Amazon S3 via this SQS queue.
diff --git a/docs/docs/ops/targets.md b/docs/docs/ops/targets.md
index cbae30cb9..f92cda5b9 100644
--- a/docs/docs/ops/targets.md
+++ b/docs/docs/ops/targets.md
@@ -52,7 +52,7 @@ Here's how CocoIndex data elements map to Qdrant elements during export:
| CocoIndex Element | Qdrant Element |
|-------------------|------------------|
-| an export target | a unique collection |
+| an export target | a unique collection |
| a collected row | a point |
| a field | a named vector, if fits into Qdrant vector; or a field within payload otherwise |
diff --git a/docs/docs/query.mdx b/docs/docs/query.mdx
index 2972c0568..32438765a 100644
--- a/docs/docs/query.mdx
+++ b/docs/docs/query.mdx
@@ -99,4 +99,3 @@ query = f"SELECT filename, text FROM {table_name} ORDER BY embedding <=> %s DESC
-
diff --git a/docs/sidebars.ts b/docs/sidebars.ts
index 4bb8dd428..bf645bdd6 100644
--- a/docs/sidebars.ts
+++ b/docs/sidebars.ts
@@ -61,4 +61,4 @@ const sidebars: SidebarsConfig = {
],
};
-export default sidebars;
\ No newline at end of file
+export default sidebars;
diff --git a/docs/src/css/custom.css b/docs/src/css/custom.css
index 499b82026..91f8b05af 100644
--- a/docs/src/css/custom.css
+++ b/docs/src/css/custom.css
@@ -18,17 +18,17 @@
--ifm-color-primary-lighter: #8F8FE6;
--ifm-color-primary-lightest: #ABABEF;
-
+
--docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1);
--my-color-text-black: #111827;
-
+
/* Additional colors */
--ifm-navbar-background-color: var(--ifm-background-color);
--ifm-background-color: #ffffff;
--ifm-footer-background-color: #ffffff;
--ifm-menu-color: #374151;
--ifm-toc-link-color: #374151;
-
+
/* Theme colors for breadcrumbs */
--theme-color-text-light: #6b7280;
--theme-color-text-default: #111827;
@@ -44,17 +44,17 @@
--ifm-color-primary-light: #ABABEF;
--ifm-color-primary-lighter: #C4C4F5;
--ifm-color-primary-lightest: #E1E1FF;
-
+
--docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3);
--my-color-text-black: #f9fafb;
-
+
/* Dark mode specific colors */
--ifm-navbar-background-color: var(--ifm-background-color);
--ifm-background-color: #111827;
--ifm-footer-background-color: #111827;
--ifm-menu-color: #f3f4f6;
--ifm-toc-link-color: #f3f4f6;
-
+
/* Dark mode theme colors for breadcrumbs */
--theme-color-text-light: #9ca3af;
--theme-color-text-default: #f9fafb;
@@ -113,7 +113,7 @@
.footer__title {
font-family: 'Questrial', sans-serif;
- font-size: 1rem;
+ font-size: 1rem;
font-weight: 600;
margin-bottom: 1rem;
color: var(--my-color-text-black);
diff --git a/docs/src/theme/Root.js b/docs/src/theme/Root.js
index bfe08476e..502460c85 100644
--- a/docs/src/theme/Root.js
+++ b/docs/src/theme/Root.js
@@ -15,4 +15,4 @@ export default function Root({ children }) {
}, []);
return <>{children}>;
-}
\ No newline at end of file
+}
diff --git a/docs/static/robots.txt b/docs/static/robots.txt
index 6f27bb66a..eb0536286 100644
--- a/docs/static/robots.txt
+++ b/docs/static/robots.txt
@@ -1,2 +1,2 @@
User-agent: *
-Disallow:
\ No newline at end of file
+Disallow:
diff --git a/examples/amazon_s3_embedding/.env.example b/examples/amazon_s3_embedding/.env.example
index 843822ecd..4025da55d 100644
--- a/examples/amazon_s3_embedding/.env.example
+++ b/examples/amazon_s3_embedding/.env.example
@@ -8,4 +8,4 @@ AMAZON_S3_BUCKET_NAME=your-bucket-name
# AMAZON_S3_PREFIX=
# Optional
-# AMAZON_S3_SQS_QUEUE_URL=
\ No newline at end of file
+# AMAZON_S3_SQS_QUEUE_URL=
diff --git a/examples/amazon_s3_embedding/.gitignore b/examples/amazon_s3_embedding/.gitignore
index 2eea525d8..4c49bd78f 100644
--- a/examples/amazon_s3_embedding/.gitignore
+++ b/examples/amazon_s3_embedding/.gitignore
@@ -1 +1 @@
-.env
\ No newline at end of file
+.env
diff --git a/examples/amazon_s3_embedding/README.md b/examples/amazon_s3_embedding/README.md
index 4f56ed581..66bf5fa39 100644
--- a/examples/amazon_s3_embedding/README.md
+++ b/examples/amazon_s3_embedding/README.md
@@ -53,7 +53,7 @@ During running, it will keep observing changes in the Amazon S3 bucket and updat
At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index.
-## CocoInsight
+## CocoInsight
CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
Run CocoInsight to understand your RAG data pipeline:
@@ -68,4 +68,4 @@ You can also add a `-L` flag to make the server keep updating the index to refle
cocoindex server -ci -L main.py
```
-Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
\ No newline at end of file
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
diff --git a/examples/code_embedding/README.md b/examples/code_embedding/README.md
index 09bb0cc18..4f54f3154 100644
--- a/examples/code_embedding/README.md
+++ b/examples/code_embedding/README.md
@@ -1,7 +1,7 @@
-# Build real-time index for codebase
+# Build real-time index for codebase
[](https://github.com/cocoindex-io/cocoindex)
-CocoIndex provides built-in support for code base chunking, using Tree-sitter to keep syntax boundary. In this example, we will build real-time index for codebase using CocoIndex.
+CocoIndex provides built-in support for code base chunking, using Tree-sitter to keep syntax boundary. In this example, we will build real-time index for codebase using CocoIndex.
We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful.
@@ -22,9 +22,9 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
1. We will ingest CocoIndex codebase.
-2. For each file, perform chunking (Tree-sitter) and then embedding.
+2. For each file, perform chunking (Tree-sitter) and then embedding.
3. We will save the embeddings and the metadata in Postgres with PGVector.
-
+
### Query:
We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow.
@@ -46,7 +46,7 @@ We will match against user-provided text by a SQL query, reusing the embedding o
```
- Update index:
-
+
```bash
cocoindex update main.py
```
@@ -58,7 +58,7 @@ We will match against user-provided text by a SQL query, reusing the embedding o
```
## CocoInsight
-I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
It just connects to your local CocoIndex server, with Zero pipeline data retention. Run the following command to start CocoInsight:
```
@@ -68,4 +68,3 @@ cocoindex server -ci main.py
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
-
diff --git a/examples/docs_to_knowledge_graph/README.md b/examples/docs_to_knowledge_graph/README.md
index 8143d35f4..e1551944b 100644
--- a/examples/docs_to_knowledge_graph/README.md
+++ b/examples/docs_to_knowledge_graph/README.md
@@ -1,6 +1,6 @@
# Build Real-Time Knowledge Graph For Documents with LLM
-We will process a list of documents and use LLM to extract relationships between the concepts in each document.
+We will process a list of documents and use LLM to extract relationships between the concepts in each document.
We will generate two kinds of relationships:
1. Relationships between subjects and objects. E.g., "CocoIndex supports Incremental Processing"
@@ -58,16 +58,14 @@ MATCH p=()-->() RETURN p
-## CocoInsight
-I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+## CocoInsight
+I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
It just connects to your local CocoIndex server, with Zero pipeline data retention. Run following command to start CocoInsight:
```bash
cocoindex server -ci main.py
```
-And then open the url https://cocoindex.io/cocoinsight.
+And then open the url https://cocoindex.io/cocoinsight.
-
-
diff --git a/examples/fastapi_server_docker/README.md b/examples/fastapi_server_docker/README.md
index 6fe2d634f..dd5659ea7 100644
--- a/examples/fastapi_server_docker/README.md
+++ b/examples/fastapi_server_docker/README.md
@@ -38,7 +38,7 @@ COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
```bash
uvicorn main:fastapi_app --reload --host 0.0.0.0 --port 8000
```
-
+
## Query the endpoint
```bash
@@ -54,7 +54,7 @@ In the `.env` file, use Docker Postgres URL
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@coco_db:5436/cocoindex
```
-Build the docker container via:
+Build the docker container via:
```bash
docker compose up --build
```
diff --git a/examples/fastapi_server_docker/files/1810.04805v2.md b/examples/fastapi_server_docker/files/1810.04805v2.md
index 21ac07f46..112540fa2 100644
--- a/examples/fastapi_server_docker/files/1810.04805v2.md
+++ b/examples/fastapi_server_docker/files/1810.04805v2.md
@@ -527,4 +527,4 @@ The results are presented in Table [8.](#page-15-4) In the table, MASK means tha
The numbers in the left part of the table represent the probabilities of the specific strategies used during MLM pre-training (BERT uses 80%, 10%, 10%). The right part of the paper represents the Dev set results. For the feature-based approach, we concatenate the last 4 layers of BERT as the features, which was shown to be the best approach in Section [5.3.](#page-8-2)
-From the table it can be seen that fine-tuning is surprisingly robust to different masking strategies. However, as expected, using only the MASK strategy was problematic when applying the featurebased approach to NER. Interestingly, using only the RND strategy performs much worse than our strategy as well.
\ No newline at end of file
+From the table it can be seen that fine-tuning is surprisingly robust to different masking strategies. However, as expected, using only the MASK strategy was problematic when applying the featurebased approach to NER. Interestingly, using only the RND strategy performs much worse than our strategy as well.
diff --git a/examples/fastapi_server_docker/requirements.txt b/examples/fastapi_server_docker/requirements.txt
index d64dcf4af..df7388bac 100644
--- a/examples/fastapi_server_docker/requirements.txt
+++ b/examples/fastapi_server_docker/requirements.txt
@@ -4,4 +4,4 @@ fastapi==0.115.12
fastapi-cli==0.0.7
uvicorn==0.34.2
psycopg[binary]==3.2.6
-psycopg_pool==3.2.6
\ No newline at end of file
+psycopg_pool==3.2.6
diff --git a/examples/gdrive_text_embedding/.env.example b/examples/gdrive_text_embedding/.env.example
index aabcb757b..62d8fd9bf 100644
--- a/examples/gdrive_text_embedding/.env.example
+++ b/examples/gdrive_text_embedding/.env.example
@@ -1,10 +1,10 @@
# Postgres database address for cocoindex
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
-# Google Drive service account credential path.
+# Google Drive service account credential path.
#! PLEASE FILL IN
GOOGLE_SERVICE_ACCOUNT_CREDENTIAL=/path/to/service_account_credential.json
# Google Drive root folder IDs, comma separated.
#! PLEASE FILL IN
-GOOGLE_DRIVE_ROOT_FOLDER_IDS=id1,id2
\ No newline at end of file
+GOOGLE_DRIVE_ROOT_FOLDER_IDS=id1,id2
diff --git a/examples/gdrive_text_embedding/.gitignore b/examples/gdrive_text_embedding/.gitignore
index 2eea525d8..4c49bd78f 100644
--- a/examples/gdrive_text_embedding/.gitignore
+++ b/examples/gdrive_text_embedding/.gitignore
@@ -1 +1 @@
-.env
\ No newline at end of file
+.env
diff --git a/examples/gdrive_text_embedding/README.md b/examples/gdrive_text_embedding/README.md
index 6f7bbb790..4b7c9a171 100644
--- a/examples/gdrive_text_embedding/README.md
+++ b/examples/gdrive_text_embedding/README.md
@@ -15,7 +15,7 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
1. We will ingest files from Google Drive folders.
2. For each file, perform chunking (recursively split) and then embedding.
3. We will save the embeddings and the metadata in Postgres with PGVector.
-
+
### Query
We will match against user-provided text by a SQL query, and reuse the embedding operation in the indexing flow.
@@ -55,7 +55,7 @@ Before running the example, you need to:
```
- Run:
-
+
```sh
python main.py
```
@@ -64,8 +64,8 @@ During running, it will keep observing changes in the source folders and update
At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index.
-## CocoInsight
-I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+## CocoInsight
+I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
It just connects to your local CocoIndex server, with Zero pipeline data retention. Run following command to start CocoInsight:
```sh
diff --git a/examples/image_search/.env b/examples/image_search/.env
index 95c869c10..ea6235f9a 100644
--- a/examples/image_search/.env
+++ b/examples/image_search/.env
@@ -1 +1 @@
-COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@127.0.0.1:5432/cocoindex"
\ No newline at end of file
+COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@127.0.0.1:5432/cocoindex"
diff --git a/examples/image_search/README.md b/examples/image_search/README.md
index 4f982552f..017fc1fc9 100644
--- a/examples/image_search/README.md
+++ b/examples/image_search/README.md
@@ -41,4 +41,3 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
```
Go to `http://localhost:5174` to search.
-
diff --git a/examples/image_search/requirements.txt b/examples/image_search/requirements.txt
index 0a359dc85..c0b7c3407 100644
--- a/examples/image_search/requirements.txt
+++ b/examples/image_search/requirements.txt
@@ -2,4 +2,4 @@ cocoindex>=0.1.52
python-dotenv>=1.0.1
requests>=2.31.0
uvicorn>=0.29.0
-fastapi>=0.110.0
\ No newline at end of file
+fastapi>=0.110.0
diff --git a/examples/manuals_llm_extraction/README.md b/examples/manuals_llm_extraction/README.md
index 22ef240a3..d509743f3 100644
--- a/examples/manuals_llm_extraction/README.md
+++ b/examples/manuals_llm_extraction/README.md
@@ -59,7 +59,7 @@ You should see results like:

-## CocoInsight
+## CocoInsight
CocoInsight is a tool to help you understand your data pipeline and data index. CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
Run CocoInsight to understand your RAG data pipeline:
diff --git a/examples/pdf_embedding/README.md b/examples/pdf_embedding/README.md
index 0715da995..b34dcb882 100644
--- a/examples/pdf_embedding/README.md
+++ b/examples/pdf_embedding/README.md
@@ -51,7 +51,7 @@ Run:
python main.py
```
-## CocoInsight
+## CocoInsight
I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. It just connects to your local CocoIndex server, with Zero pipeline data retention. Run following command to start CocoInsight:
```
diff --git a/examples/product_recommendation/.env b/examples/product_recommendation/.env
index 0cd63f97f..335f30600 100644
--- a/examples/product_recommendation/.env
+++ b/examples/product_recommendation/.env
@@ -1,3 +1,2 @@
# Postgres database address for cocoindex
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
-
diff --git a/examples/product_recommendation/README.md b/examples/product_recommendation/README.md
index 256d13d45..aef8939b0 100644
--- a/examples/product_recommendation/README.md
+++ b/examples/product_recommendation/README.md
@@ -1,6 +1,6 @@
# Build Real-Time Recommendation Engine with LLM and Graph Database
-We will build a real-time product recommendation engine with LLM and graph database. In particular, we will use LLM to understand the category (taxonomy) of a product. In addition, we will use LLM to enumerate the complementary products - users are likely to buy together with the current product (pencil and notebook).
+We will build a real-time product recommendation engine with an LLM and a graph database. In particular, we will use the LLM to understand the category (taxonomy) of a product. In addition, we will use the LLM to enumerate complementary products that users are likely to buy together with the current product (e.g., pencil and notebook).
We will use Graph to explore the relationships between products that can be further used for product recommendations or labeling.
@@ -53,16 +53,14 @@ MATCH p=()-->() RETURN p

-## CocoInsight
-I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+## CocoInsight
+I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
It just connects to your local CocoIndex server, with Zero pipeline data retention. Run following command to start CocoInsight:
```bash
cocoindex server -ci main.py
```
-And then open the url https://cocoindex.io/cocoinsight.
+Then open the URL https://cocoindex.io/cocoinsight in your browser.

-
-
diff --git a/examples/product_recommendation/products/p1.json b/examples/product_recommendation/products/p1.json
index 0871d2732..3084e4dbc 100644
--- a/examples/product_recommendation/products/p1.json
+++ b/examples/product_recommendation/products/p1.json
@@ -19,4 +19,4 @@
"Ideal for students, professionals, and everyday writing tasks."
]
}
- }
\ No newline at end of file
+ }
diff --git a/examples/product_recommendation/products/p2.json b/examples/product_recommendation/products/p2.json
index 30be4a475..8447e41f1 100644
--- a/examples/product_recommendation/products/p2.json
+++ b/examples/product_recommendation/products/p2.json
@@ -19,4 +19,4 @@
"500 sheets per ream for bulk efficiency."
]
}
- }
\ No newline at end of file
+ }
diff --git a/examples/product_recommendation/products/p3.json b/examples/product_recommendation/products/p3.json
index 8c37201a8..8229bec55 100644
--- a/examples/product_recommendation/products/p3.json
+++ b/examples/product_recommendation/products/p3.json
@@ -19,4 +19,4 @@
"Dimensions: 2\" x 0.75\" x 0.4\" each."
]
}
- }
\ No newline at end of file
+ }
diff --git a/examples/product_recommendation/products/p4.json b/examples/product_recommendation/products/p4.json
index 949901e65..d7242a413 100644
--- a/examples/product_recommendation/products/p4.json
+++ b/examples/product_recommendation/products/p4.json
@@ -19,4 +19,4 @@
"Ideal for students, professionals, and everyday note-taking."
]
}
-}
\ No newline at end of file
+}
diff --git a/examples/product_recommendation/products/p6.json b/examples/product_recommendation/products/p6.json
index b5f149a38..ccc760f9a 100644
--- a/examples/product_recommendation/products/p6.json
+++ b/examples/product_recommendation/products/p6.json
@@ -19,4 +19,4 @@
"Safety & Ergonomics Certified — lab-tested for spinal load distribution, child ergonomics, and non-toxic material compliance"
]
}
- }
\ No newline at end of file
+ }
diff --git a/examples/product_recommendation/products/p7.json b/examples/product_recommendation/products/p7.json
index 83a72d0a0..2023ab19b 100644
--- a/examples/product_recommendation/products/p7.json
+++ b/examples/product_recommendation/products/p7.json
@@ -23,4 +23,4 @@
"Safety certified — meets FDA and international food safety standards"
]
}
-}
\ No newline at end of file
+}
diff --git a/examples/product_recommendation/products/p8.json b/examples/product_recommendation/products/p8.json
index 6c1375ffa..0c9706d9a 100644
--- a/examples/product_recommendation/products/p8.json
+++ b/examples/product_recommendation/products/p8.json
@@ -18,4 +18,4 @@
"Ideal for outdoor study, remote learning, and low-energy environments"
]
}
- }
\ No newline at end of file
+ }
diff --git a/examples/product_recommendation/products/p9.json b/examples/product_recommendation/products/p9.json
index d4d2d3f61..3295a937c 100644
--- a/examples/product_recommendation/products/p9.json
+++ b/examples/product_recommendation/products/p9.json
@@ -18,4 +18,4 @@
"Compact design fits neatly on any workstation"
]
}
- }
\ No newline at end of file
+ }
diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md
index ace7a716a..dcdf6e0ce 100644
--- a/examples/text_embedding/README.md
+++ b/examples/text_embedding/README.md
@@ -13,9 +13,9 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
1. We will ingest a list of local files.
-2. For each file, perform chunking (recursively split) and then embedding.
+2. For each file, perform chunking (recursively split) and then embedding.
3. We will save the embeddings and the metadata in Postgres with PGVector.
-
+
### Query
We will match against user-provided text by a SQL query, and reuse the embedding operation in the indexing flow.
@@ -52,7 +52,7 @@ python main.py
## CocoInsight
-I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
It just connects to your local CocoIndex server, with Zero pipeline data retention. Run following command to start CocoInsight:
```
@@ -60,4 +60,3 @@ cocoindex server -ci main.py
```
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
-
diff --git a/examples/text_embedding/markdown_files/1706.03762v7.md b/examples/text_embedding/markdown_files/1706.03762v7.md
index b3e569cca..665a1972b 100644
--- a/examples/text_embedding/markdown_files/1706.03762v7.md
+++ b/examples/text_embedding/markdown_files/1706.03762v7.md
@@ -351,4 +351,4 @@ Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anap
**Input-Input Layer5**
-Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. The heads clearly learned to perform different tasks.
\ No newline at end of file
+Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the sentence. We give two such examples above, from two different heads from the encoder self-attention at layer 5 of 6. The heads clearly learned to perform different tasks.
diff --git a/examples/text_embedding/markdown_files/1810.04805v2.md b/examples/text_embedding/markdown_files/1810.04805v2.md
index 21ac07f46..112540fa2 100644
--- a/examples/text_embedding/markdown_files/1810.04805v2.md
+++ b/examples/text_embedding/markdown_files/1810.04805v2.md
@@ -527,4 +527,4 @@ The results are presented in Table [8.](#page-15-4) In the table, MASK means tha
The numbers in the left part of the table represent the probabilities of the specific strategies used during MLM pre-training (BERT uses 80%, 10%, 10%). The right part of the paper represents the Dev set results. For the feature-based approach, we concatenate the last 4 layers of BERT as the features, which was shown to be the best approach in Section [5.3.](#page-8-2)
-From the table it can be seen that fine-tuning is surprisingly robust to different masking strategies. However, as expected, using only the MASK strategy was problematic when applying the featurebased approach to NER. Interestingly, using only the RND strategy performs much worse than our strategy as well.
\ No newline at end of file
+From the table it can be seen that fine-tuning is surprisingly robust to different masking strategies. However, as expected, using only the MASK strategy was problematic when applying the featurebased approach to NER. Interestingly, using only the RND strategy performs much worse than our strategy as well.
diff --git a/examples/text_embedding/markdown_files/rfc8259.md b/examples/text_embedding/markdown_files/rfc8259.md
index dc01ae004..bf6c29410 100644
--- a/examples/text_embedding/markdown_files/rfc8259.md
+++ b/examples/text_embedding/markdown_files/rfc8259.md
@@ -359,4 +359,4 @@ Author's Address
Email: tbray@textuality.com
-Bray Standards Track [Page 16]
\ No newline at end of file
+Bray Standards Track [Page 16]
diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md
index 827395247..b9309b82e 100644
--- a/examples/text_embedding_qdrant/README.md
+++ b/examples/text_embedding_qdrant/README.md
@@ -13,9 +13,9 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
1. We will ingest a list of local files.
-2. For each file, perform chunking (recursively split) and then embedding.
+2. For each file, perform chunking (recursively split) and then embedding.
3. We will save the embeddings and the metadata in Postgres with PGVector.
-
+
### Query
We use Qdrant client to query the index, and reuse the embedding operation in the indexing flow.
@@ -59,7 +59,7 @@ We use Qdrant client to query the index, and reuse the embedding operation in th
```
## CocoInsight
-I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
It just connects to your local CocoIndex server, with Zero pipeline data retention. Run following command to start CocoInsight:
```bash
@@ -67,5 +67,3 @@ cocoindex server -ci main.py
```
Open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
-
-
diff --git a/examples/text_embedding_qdrant/markdown_files/rfc8259.md b/examples/text_embedding_qdrant/markdown_files/rfc8259.md
index b0911a4da..76b7d577a 100644
--- a/examples/text_embedding_qdrant/markdown_files/rfc8259.md
+++ b/examples/text_embedding_qdrant/markdown_files/rfc8259.md
@@ -331,4 +331,4 @@ Author's Address
Email: tbray@textuality.com
-Bray Standards Track [Page 16]
\ No newline at end of file
+Bray Standards Track [Page 16]
diff --git a/python/cocoindex/cli.py b/python/cocoindex/cli.py
index f0cd8401d..7f4f94522 100644
--- a/python/cocoindex/cli.py
+++ b/python/cocoindex/cli.py
@@ -1,19 +1,19 @@
-import click
+import atexit
import datetime
-import sys
import importlib.util
import os
-import atexit
+import sys
import types
+from typing import Any
-from dotenv import load_dotenv, find_dotenv
+import click
+from dotenv import find_dotenv, load_dotenv
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
-from typing import Any
from . import flow, lib, setting
-from .setup import sync_setup, drop_setup, flow_names_with_setup, apply_setup_changes
+from .setup import apply_setup_changes, drop_setup, flow_names_with_setup, sync_setup
# Create ServerSettings lazily upon first call, as environment variables may be loaded from files, etc.
COCOINDEX_HOST = "https://cocoindex.io"
diff --git a/python/cocoindex/tests/__init__.py b/python/cocoindex/tests/__init__.py
index 8b1378917..e69de29bb 100644
--- a/python/cocoindex/tests/__init__.py
+++ b/python/cocoindex/tests/__init__.py
@@ -1 +0,0 @@
-
diff --git a/src/execution/db_tracking_setup.rs b/src/execution/db_tracking_setup.rs
index 9ef358aad..d3bbfc687 100644
--- a/src/execution/db_tracking_setup.rs
+++ b/src/execution/db_tracking_setup.rs
@@ -24,19 +24,19 @@ async fn upgrade_tracking_table(
"CREATE TABLE IF NOT EXISTS {table_name} (
source_id INTEGER NOT NULL,
source_key JSONB NOT NULL,
-
+
-- Update in the precommit phase: after evaluation done, before really applying the changes to the target storage.
max_process_ordinal BIGINT NOT NULL,
staging_target_keys JSONB NOT NULL,
memoization_info JSONB,
-
+
-- Update after applying the changes to the target storage.
processed_source_ordinal BIGINT,
process_logic_fingerprint BYTEA,
process_ordinal BIGINT,
process_time_micros BIGINT,
target_keys JSONB,
-
+
PRIMARY KEY (source_id, source_key)
);",
);
diff --git a/src/llm/litellm.rs b/src/llm/litellm.rs
index 1dc628d9e..27648747a 100644
--- a/src/llm/litellm.rs
+++ b/src/llm/litellm.rs
@@ -1,16 +1,22 @@
-use async_openai::config::OpenAIConfig;
use async_openai::Client as OpenAIClient;
+use async_openai::config::OpenAIConfig;
pub use super::openai::Client;
impl Client {
pub async fn new_litellm(spec: super::LlmSpec) -> anyhow::Result {
- let address = spec.address.clone().unwrap_or_else(|| "http://127.0.0.1:4000".to_string());
+ let address = spec
+ .address
+ .clone()
+ .unwrap_or_else(|| "http://127.0.0.1:4000".to_string());
let api_key = std::env::var("LITELLM_API_KEY").ok();
let mut config = OpenAIConfig::new().with_api_base(address);
if let Some(api_key) = api_key {
config = config.with_api_key(api_key);
}
- Ok(Client::from_parts(OpenAIClient::with_config(config), spec.model))
+ Ok(Client::from_parts(
+ OpenAIClient::with_config(config),
+ spec.model,
+ ))
}
}
diff --git a/src/llm/mod.rs b/src/llm/mod.rs
index a3652955f..ea4aa58ee 100644
--- a/src/llm/mod.rs
+++ b/src/llm/mod.rs
@@ -56,9 +56,9 @@ pub trait LlmGenerationClient: Send + Sync {
mod anthropic;
mod gemini;
+mod litellm;
mod ollama;
mod openai;
-mod litellm;
mod openrouter;
pub async fn new_llm_generation_client(spec: LlmSpec) -> Result> {
@@ -78,11 +78,8 @@ pub async fn new_llm_generation_client(spec: LlmSpec) -> Result {
Box::new(litellm::Client::new_litellm(spec).await?) as Box
}
- LlmApiType::OpenRouter => {
- Box::new(openrouter::Client::new_openrouter(spec).await?) as Box
- }
-
-
+ LlmApiType::OpenRouter => Box::new(openrouter::Client::new_openrouter(spec).await?)
+ as Box,
};
Ok(client)
}
diff --git a/src/llm/openrouter.rs b/src/llm/openrouter.rs
index 5dde06b91..cb7757889 100644
--- a/src/llm/openrouter.rs
+++ b/src/llm/openrouter.rs
@@ -1,16 +1,22 @@
-use async_openai::config::OpenAIConfig;
use async_openai::Client as OpenAIClient;
+use async_openai::config::OpenAIConfig;
pub use super::openai::Client;
impl Client {
pub async fn new_openrouter(spec: super::LlmSpec) -> anyhow::Result {
- let address = spec.address.clone().unwrap_or_else(|| "https://openrouter.ai/api/v1".to_string());
+ let address = spec
+ .address
+ .clone()
+ .unwrap_or_else(|| "https://openrouter.ai/api/v1".to_string());
let api_key = std::env::var("OPENROUTER_API_KEY").ok();
let mut config = OpenAIConfig::new().with_api_base(address);
if let Some(api_key) = api_key {
config = config.with_api_key(api_key);
}
- Ok(Client::from_parts(OpenAIClient::with_config(config), spec.model))
+ Ok(Client::from_parts(
+ OpenAIClient::with_config(config),
+ spec.model,
+ ))
}
}
From 7fb6163040f2a21427012fd6e6ad081116daeff8 Mon Sep 17 00:00:00 2001
From: lemorage
Date: Sat, 21 Jun 2025 06:50:41 +0200
Subject: [PATCH 3/4] docs: add pre-commit setup instructions
---
.pre-commit-config.yaml | 18 +++++++++---------
docs/docs/about/contributing.md | 29 +++++++++++++++++++----------
2 files changed, 28 insertions(+), 19 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b0e6c0e8b..7e3a583da 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,18 +30,18 @@ repos:
- id: ruff-format
types: [python]
pass_filenames: true
+
+ - repo: https://github.com/christophmeissner/pytest-pre-commit
+ rev: 1.0.0
+ hooks:
+ - id: pytest
+ language: system
+ types: [python]
+ pass_filenames: false
+ always_run: false
- repo: local
hooks:
- - id: pytest
- name: pytest test
- entry: pytest
- language: system
- types: [python]
- pass_filenames: false
- always_run: false
- args: ["-v", "--strict-markers"]
-
- id: mypy-check
name: mypy type check
entry: mypy
diff --git a/docs/docs/about/contributing.md b/docs/docs/about/contributing.md
index e9e3bae59..96d568338 100644
--- a/docs/docs/about/contributing.md
+++ b/docs/docs/about/contributing.md
@@ -15,22 +15,22 @@ We use [GitHub Issues](https://github.com/cocoindex-io/cocoindex/issues) to trac
We tag issues with the ["good first issue"](https://github.com/cocoindex-io/cocoindex/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) label for beginner contributors.
-## How to Contribute
+## How to Contribute
- If you decide to work on an issue, unless the PR can be sent immediately (e.g. just a few lines of code), we recommend you to leave a comment on the issue like **`I'm working on it`** or **`Can I work on this issue?`** to avoid duplicating work.
- For larger features, we recommend you to discuss with us first in our [Discord server](https://discord.com/invite/zpA9S2DR7s) to coordinate the design and work.
- Our [Discord server](https://discord.com/invite/zpA9S2DR7s) are constantly open. If you are unsure about anything, it is a good place to discuss! We'd love to collaborate and will always be friendly.
-## Start hacking! Setting Up Development Environment
+## Start hacking! Setting Up Development Environment
Following the steps below to get cocoindex build on latest codebase locally - if you are making changes to cocoindex funcionality and want to test it out.
- 🦀 [Install Rust](https://rust-lang.org/tools/install)
-
+
If you don't have Rust installed, run
```sh
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
```
- Already have Rust? Make sure it's up to date
- ```sh
+ Already have Rust? Make sure it's up to date
+ ```sh
rustup update
```
@@ -46,7 +46,7 @@ Following the steps below to get cocoindex build on latest codebase locally - if
- Install required tools:
```sh
- pip install maturin mypy ruff
+ pip install maturin mypy pre-commit
```
- Build the library. Run at the root of cocoindex directory:
@@ -54,6 +54,11 @@ Following the steps below to get cocoindex build on latest codebase locally - if
maturin develop
```
+- Install and enable pre-commit hooks. This ensures all checks run automatically before each commit:
+ ```sh
+ pre-commit install
+ ```
+
- Before running a specific example, set extra environment variables, for exposing extra traces, allowing dev UI, etc.
```sh
. ./.env.lib_debug
@@ -67,10 +72,14 @@ To submit your code:
1. Fork the [CocoIndex repository](https://github.com/cocoindex-io/cocoindex)
2. [Create a new branch](https://docs.github.com/en/desktop/making-changes-in-a-branch/managing-branches-in-github-desktop) on your fork
3. Make your changes
-4. Make sure all tests and linting pass by running
- ```sh
- ./check.sh
- ```
+4. Run the pre-commit checks (automatically triggered on `git commit`)
+
+ :::tip
+ To run them manually (same as CI):
+ ```sh
+ pre-commit run --all-files
+ ```
+ :::
5. [Open a Pull Request (PR)](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) when your work is ready for review
From 2910794a81f09432c701e5f4b6aebee91a10cc48 Mon Sep 17 00:00:00 2001
From: lemorage
Date: Sat, 21 Jun 2025 14:32:16 +0200
Subject: [PATCH 4/4] fix: run maturin before other hooks on relevant file
changes
---
.pre-commit-config.yaml | 48 ++++++++++++++++++++---------------------
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7e3a583da..0dd0b9b69 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -24,36 +24,13 @@ repos:
exclude_types: [python] # Covered by Ruff W291.
exclude: ".*(data.*|licenses.*|_static.*|\\.ya?ml|\\.jpe?g|\\.png|\\.svg|\\.webp)$"
- - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.12.0
- hooks:
- - id: ruff-format
- types: [python]
- pass_filenames: true
-
- - repo: https://github.com/christophmeissner/pytest-pre-commit
- rev: 1.0.0
- hooks:
- - id: pytest
- language: system
- types: [python]
- pass_filenames: false
- always_run: false
-
- repo: local
hooks:
- - id: mypy-check
- name: mypy type check
- entry: mypy
- language: system
- types: [python]
- pass_filenames: false
-
- id: maturin-develop
name: maturin develop
entry: maturin develop
language: system
- types: [rust]
+ files: ^(python/|src/|Cargo\.toml|pyproject\.toml)
pass_filenames: false
- id: cargo-fmt
@@ -69,3 +46,26 @@ repos:
language: system
types: [rust]
pass_filenames: false
+
+ - id: mypy-check
+ name: mypy type check
+ entry: mypy
+ language: system
+ types: [python]
+ pass_filenames: false
+
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.12.0
+ hooks:
+ - id: ruff-format
+ types: [python]
+ pass_filenames: true
+
+ - repo: https://github.com/christophmeissner/pytest-pre-commit
+ rev: 1.0.0
+ hooks:
+ - id: pytest
+ language: system
+ types: [python]
+ pass_filenames: false
+ always_run: false